In [1]:
#!pip install streamlit
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LinearRegression
import xlsxwriter
# import matplotlib.pyplot as plt

# Clustering
from sklearn.cluster import KMeans
from k_means_constrained import KMeansConstrained
# conda install -c conda-forge scikit-learn-extra
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from operator import itemgetter
# !pip install pyclustering
from pyclustering.cluster.kmeans import kmeans
from pyclustering.utils.metric import type_metric, distance_metric
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from scipy.spatial import distance_matrix
from scipy.spatial import distance

# Principal Components Analysis
from scipy import stats
# from sklearn.decomposition import PCA

# Classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import itertools

In [88]:
pip list

Package                            Version
---------------------------------- -------------------
-atplotlib                         3.3.2
-umpy                              1.18.1
alabaster                          0.7.12
altair                             4.1.0
anaconda-client                    1.7.2
anaconda-navigator                 1.9.12
anaconda-project                   0.8.3
argh                               0.26.2
asn1crypto                         1.4.0
astor                              0.8.1
astroid                            2.3.3
astropy                            4.0
async-generator                    1.10
atomicwrites                       1.4.0
attrs                              20.3.0
autopep8                           1.4.4
Babel                              2.9.0
backcall                           0.2.0
backports.functools-lru-cache      1.6.1
backports.shutil-get-terminal-size 1.0.0
backports.tempfile                 1.0
backports.weakref                  1.0.po

## Import Data

In [2]:
# Import Data
df = pd.read_csv('dgn_raw_data.csv')

# Add very small random number to Rating
df['target']=df['Rating'].apply(lambda x: x+random.random()/1000)

## Regressions for Each UID

In [3]:
# Unique IDs
ids = df.UID.unique()

# Run linear regressions for each UID
op = pd.DataFrame
intercept = []
coefficients=[]
UID = []
for p in ids:
    df_i = df[df.UID == p]              # Create dataframe for current user id
    X = df_i.filter(regex='^[a-zA-Z][0-9]')  # df input variables only
    y = df_i['target']                  # Series of target variable
    reg = LinearRegression().fit(X, y)  # Fit linear regression
    reg.score(X, y)                     # Score regression model
    unique_id=df_i['UID'].unique()      # Saves current user id
    const = reg.intercept_              # Save intercept of the regression model
    coef = reg.coef_                    # Coefficients of regression model
    UID.append(unique_id)               # Append current user id
    intercept.append(const)             # Append current intercept
    coefficients.append(coef)           # Append current regression coefficients

# Convert newly created lists into dataframes
intercep_new = pd.DataFrame(intercept)
coefficients_new = pd.DataFrame(coefficients)
UID_new = pd.DataFrame(UID)

# Get columns names
colNames = df.drop(['Rating', 'target',], axis=1).columns
colNames = colNames.insert(1, 'Const')
colNames

# Concatenate the new dataframes and add column names
op = pd.concat([UID_new,intercep_new, coefficients_new], axis=1)
op.columns = colNames

# Save only regression coefficients for clustering
scores = op.drop(['UID','Const'], axis=1)

## Clustering
### Define Pearson distance

In [4]:
def pearson_dist(x, y):
    r = stats.pearsonr(x, y)[0]
    return (1 - r) / 2

### Run clustering algorithm

In [85]:
# Holds only final cluster solutions
cluster_solutions = {}

max_clusters = 6

for n in range(2, max_clusters+1):

    # change your df to numpy arr
    sample = scores.to_numpy()
    
    # define a custom metric
    metric = distance_metric(type_metric.USER_DEFINED, func=pearson_dist)
    
    # carry out a km++ init
    initial_centers = kmeans_plusplus_initializer(sample, n, random_state=123).initialize()
    
    # execute kmeans
    kmeans_instance = kmeans(sample, initial_centers, metric=metric)
    
    # run cluster analysis
    kmeans_instance.process()
    
    # get clusters
    clusters = kmeans_instance.get_clusters()
    
    # Empty dataframe to take in cluster assignments for each loop iteration
    df_clusters = pd.DataFrame()

    for i in range(len(clusters)):
        df = scores.iloc[clusters[i],:]
        df[f'Optimal {n} cluster solution'] = i+1
        df_clusters = pd.concat([df_clusters, df])
        df_clusters.sort_index(inplace=True)
    
    cluster_solutions[f'Optimal {n} cluster solution'] = df_clusters.iloc[:, -1]

all_cluster_solutions = pd.DataFrame.from_dict(cluster_solutions)

op.merge(all_cluster_solutions, left_index=True, right_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,UID,Const,A1,A2,A3,A4,B1,B2,B3,B4,...,C4,D1,D2,D3,D4,Optimal 2 cluster solution,Optimal 3 cluster solution,Optimal 4 cluster solution,Optimal 5 cluster solution,Optimal 6 cluster solution
0,1,65.557741,21.751974,-12.870580,0.538782,14.688788,44.407300,4.668831,-2.318513,-21.845784,...,-74.990165,26.500109,41.033802,51.223571,11.255220,1,3,1,1,1
1,2,-59.461057,86.039887,90.017772,74.390597,66.396992,1.826372,-33.026053,3.179157,-15.475371,...,-10.818958,89.524654,83.429717,48.009145,93.441310,1,3,1,5,5
2,3,91.635535,67.702956,64.170128,15.395295,18.859847,-4.501879,4.677590,-57.286793,-20.365885,...,-38.873976,6.687625,-59.897133,-52.237106,-31.524829,1,3,3,3,3
3,4,168.804133,-3.414603,-7.568203,-37.699630,-8.989176,-13.425496,-79.309678,-37.854229,-46.699190,...,-90.750351,-20.307109,1.256557,-15.608342,-32.250747,1,3,1,1,3
4,5,105.171297,-11.427229,-77.015043,-54.392268,-72.349222,21.059732,73.217286,19.827909,18.753820,...,-1.291739,-27.985926,-42.094441,4.237874,-19.125232,1,1,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,4.931101,0.050266,0.299190,0.223678,0.237588,-0.200014,0.059077,-0.039804,0.060648,...,-0.299018,0.193496,0.146346,0.128356,-0.094110,1,3,1,1,3
96,97,72.097056,-47.592419,4.905752,41.866526,-60.819727,8.596508,13.827413,-2.340060,-5.265107,...,-8.551527,-15.584257,-30.384824,-42.932040,24.027697,1,1,4,4,4
97,98,19.436797,-4.515443,-0.899396,-5.661579,18.843842,-21.624428,-30.682187,-30.613399,-24.686415,...,12.317440,-11.675852,-8.936092,-4.114002,3.448422,2,2,2,2,2
98,99,103.352616,-35.323637,-51.918486,-40.471420,17.442710,-59.719476,-58.339683,-94.677638,-41.663237,...,83.442132,5.997733,-49.286283,-57.235706,-37.030838,2,2,2,2,2


### Add or remove cluster assignments (this is manual for testing)

In [43]:
%%capture
# the %%capture magic suppresses all cell output (want to block a warning)

# Quick and dirty way to get cluster assignments in order
df1 = scores.iloc[clusters[0],:]
df1[f' Optimal {n} cluster solution'] = 1
df2 = scores.iloc[clusters[1],:]
df2['cluster'] = 2
df3 = scores.iloc[clusters[2],:]
df3['cluster'] = 3
df4 = scores.iloc[clusters[3],:]
df4['cluster'] = 4
df5 = scores.iloc[clusters[4],:]
df5['cluster'] = 5
df6 = scores.iloc[clusters[5],:]
df6['cluster'] = 6
df7 = pd.concat([df1,
                 df2,
                 df3,
                 df4,
                 df5,
                 df6
                ])
df7.sort_index(inplace=True)

In [36]:
df7.head()

Unnamed: 0,A1,A2,A3,A4,B1,B2,B3,B4,C1,C2,C3,C4,D1,D2,D3,D4,cluster
0,21.751701,-12.87055,0.538552,14.688498,44.407587,4.669135,-2.31856,-21.845627,-1.531246,-22.509139,-57.12481,-74.990276,26.500765,41.033842,51.223837,11.255346,1
1,86.040724,90.018303,74.391032,66.397246,1.826236,-33.025685,3.179111,-15.475434,-7.107471,19.45047,-44.596385,-10.818451,89.524527,83.429168,48.009009,93.441485,5
2,67.703797,64.170636,15.395659,18.860609,-4.501475,4.677963,-57.286223,-20.365638,27.851811,-50.188605,-59.49099,-38.873982,6.688051,-59.897008,-52.237145,-31.524701,3
3,-3.414774,-7.568285,-37.699731,-8.989124,-13.425445,-79.308952,-37.85395,-46.698618,-72.714769,-25.546261,-93.735383,-90.750163,-20.307151,1.256502,-15.608468,-32.250583,3
4,-11.427247,-77.014917,-54.392476,-72.349339,21.059173,73.217016,19.827406,18.753339,5.367129,-83.64953,-32.425278,-1.291061,-27.986172,-42.09426,4.237379,-19.125068,4
5,-66.469395,76.287657,-19.908992,13.399164,-94.324425,-53.42648,-58.926241,-11.312814,-117.962528,-36.644122,-121.167616,-41.603151,2.065873,23.886621,-29.463709,-23.558054,5
6,56.304838,109.304246,71.40893,95.373036,94.96104,95.898578,45.689466,70.750382,-45.140882,-26.413441,37.647949,14.881531,63.037723,89.801978,39.553371,108.352111,1
7,103.480972,55.04426,84.412996,124.128883,57.783096,-10.816572,70.37954,86.722506,62.680024,62.826412,59.127439,91.514027,-12.18166,-19.314715,-46.14172,-54.488157,6
8,10.42297,54.939178,39.763331,13.097985,-22.678286,-41.94821,40.918155,-0.009732,83.274645,7.563944,30.345781,50.848388,22.252332,-62.461084,12.962583,-31.969742,6
9,0.903521,-40.705518,-36.933576,-22.913264,-40.261687,-35.132836,-75.700864,-53.292261,-45.85203,22.132751,-48.61694,-14.569994,26.592877,38.94078,43.088716,-0.460904,5


In [37]:
corr_distances = pd.DataFrame(distance.cdist(scores, scores, 'correlation'))

In [38]:
silhouette_score(corr_distances, df7['cluster'], metric='precomputed')

0.14479592098584712

In [81]:
silhouette_score(corr_mat, df7['cluster'], metric='precomputed')

0.14479592098584715

In [76]:
corr_mat = 1-scores.T.corr()