In [1]:
import pandas as pd
import numpy as np
import gc
import seaborn as sns
import dask as dd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import KNNImputer
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot
from scipy.stats.stats import pearsonr 

In [2]:
# Load the training data from the parquet file into a pandas DataFrame.
train = pd.read_parquet('../input/ubiquant-parquet/train.parquet')
# Disabled dask-based alternative, kept for reference.
# NOTE(review): `dd` is bound to the top-level `dask` package by the imports,
# not `dask.dataframe`, so this line would presumably fail if re-enabled — confirm.
#train_dask = dd.read_parquet('../input/ubiquant-parquet/train.parquet')

In [3]:
# Print a concise structural summary of the raw training frame.
train.info()

In [4]:
# Preview the first rows of the raw training frame.
train.head()

## Check distribution of time sampling per investment ID

In [5]:
# Count the target observations recorded for each investment_id.
table = pd.pivot_table(
    train,
    index="investment_id",
    values="target",
    aggfunc={"target": "count"},
)
table.head()

In [6]:
# Distribution of the per-investment sample counts computed above.
sns.displot(table["target"])

In [7]:
# Reshape to a wide matrix: one row per investment_id, one column per
# time_id, cells holding the target (pivot_table's default aggregation).
table = pd.pivot_table(
    train,
    index="investment_id",
    columns=["time_id"],
    values="target",
)
table.head()

In [8]:
# Visualise missingness: every cell of the wide table is drawn according to
# whether its target value is null.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(table.isnull(), ax=ax, cbar=False)

In [9]:
# Move investment_id from the index into a regular column. The guard keeps
# this cell idempotent: calling reset_index() a second time would insert a
# spurious "index" column and shift the positional slicing used downstream.
if "investment_id" not in table.columns:
    table = table.reset_index()

In [10]:
# Target matrix as a NumPy array: every column after the leading
# investment_id column (rows = investments, columns = time_ids).
y = table.iloc[:, 1:].to_numpy()
y

## Clustering using K-Means

In [11]:
# Fill missing target values with KNN imputation (2 neighbours, uniform weights).
imputer = KNNImputer(
    n_neighbors=2,
    weights="uniform",
)
y = imputer.fit_transform(y)

In [12]:
# Elbow method: fit K-Means for k = 1..24 and record the inertia (within-cluster
# sum of squares) reported by each fitted model.

# Standardisation does not depend on k, so it is done once up front instead of
# being recomputed on every loop iteration.
scaler = StandardScaler()
y_std = scaler.fit_transform(y)

sse = {}
for k in range(1, 25):
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit(y_std)
    sse[k] = kmeans.inertia_

plt.title('Elbow plot for K selection')
plt.xlabel('k')
plt.ylabel('SSE')
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()

In [13]:
# Define the final model with 10 clusters (presumably read off the elbow plot —
# confirm). random_state is fixed so the cluster labels are reproducible across
# runs; later cells refer to clusters by their numeric label.
model = KMeans(n_clusters=10, random_state=1)
# standardise the features before clustering
scaler = StandardScaler()
y_std = scaler.fit_transform(y)
# fit the model
model.fit(y_std)
# assign a cluster to each example
yhat = model.predict(y_std)
# retrieve unique clusters
clusters = unique(yhat)

In [14]:
# Assemble one row per investment: its id, its assigned cluster label, and its
# full imputed target series stored as a Python list.
clustered_df = pd.DataFrame(
    {
        "id": table.investment_id.values,
        "cluster": yhat,
        "target": y.tolist(),
    }
)
# Write the result to disk.
clustered_df.to_parquet("clustered.parquet")
clustered_df.head()

## Check correlation of targets within clusters

In [15]:
# filter a cluster
cluster = 5
cluster_filt = clustered_df[clustered_df["cluster"] == cluster].copy()

# Sample two *distinct* investment IDs. Sampling with replacement could draw
# the same row twice, which would make the correlation computed below
# trivially 1.0.
cluster_filt = cluster_filt.sample(n=2, axis=0)
cluster_filt.head()

In [16]:
# Pearson correlation between the target series of the two sampled investments.
signal_1, signal_2 = (
    np.asarray(cluster_filt["target"].iloc[row]) for row in range(2)
)
corr = pearsonr(signal_1, signal_2)
print(corr)


In [17]:
# Plot the two sampled target series on vertically stacked axes, one each.
fig, ax = plt.subplots(nrows=2, ncols=1)
for i in range(2):
    series = cluster_filt.iloc[i, 2]
    time = list(range(len(series)))
    sns.lineplot(x=time, y=series, ax=ax[i])

### Check distribution of correlation coefficients in each cluster

In [18]:
def get_correlations(cluster, df=None, random_state=None):
    """Return the Pearson correlation of two randomly chosen series in a cluster.

    Two distinct rows whose ``cluster`` column equals ``cluster`` are sampled
    without replacement, and the Pearson coefficient of their ``target``
    sequences is returned.

    Parameters
    ----------
    cluster : label compared against the ``cluster`` column.
    df : DataFrame with ``cluster`` and ``target`` columns, optional.
        Defaults to the notebook-level ``clustered_df``.
    random_state : optional
        Forwarded to ``Series.sample`` to make the draw reproducible.

    Returns
    -------
    The correlation coefficient (first element of ``pearsonr``'s result).

    Raises
    ------
    ValueError
        If the cluster holds fewer than two rows, so no pair can be drawn.
    """
    if df is None:
        df = clustered_df

    members = df.loc[df["cluster"] == cluster, "target"]
    if len(members) < 2:
        # Fail with an explicit message instead of pandas' generic
        # "larger sample than population" error.
        raise ValueError(
            "cluster {!r} has {} member(s); at least 2 are required to "
            "compute a correlation".format(cluster, len(members))
        )

    # sample two random investment IDs (without replacement, so never the same row)
    pair = members.sample(n=2, random_state=random_state)
    signal_1 = np.asarray(pair.iloc[0])
    signal_2 = np.asarray(pair.iloc[1])

    return pearsonr(signal_1, signal_2)[0]
    

In [19]:
def plot_cluster_corr(cluster, repeat_times):
    """Plot the distribution of sampled pairwise correlations for one cluster.

    Calls ``get_correlations(cluster)`` ``repeat_times`` times, draws the
    collected coefficients with ``sns.displot`` under the title
    ``cluster_<cluster>``, and returns the object ``sns.displot`` produced.
    """
    correlations = [get_correlations(cluster) for _ in range(repeat_times)]
    grid = sns.displot(x=correlations)
    plt.title("cluster_" + str(cluster))
    return grid

In [20]:
# Distribution of 500 sampled within-cluster correlations for cluster 0.
ax = plot_cluster_corr(0,500)

In [21]:
# Draw the correlation distribution (500 sampled pairs) for every cluster,
# visiting the cluster labels in ascending order.
cluster_list = sorted(clustered_df["cluster"].unique())
for i, cluster in enumerate(cluster_list):
    plot_cluster_corr(cluster, 500)
    