In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from math import sqrt, log10, ceil, pi
from copy import deepcopy

### Loading data from pickle file

In [None]:
# Load the organization profiles from the pickled file.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df_out = pd.read_pickle('../results/organizations_profiles.bin')

### Pre analysis filters

* Filter out banks without any transfers or with fewer than 2 members; the banks that remain are the *functional* banks
* Replace Null values with mean values or zeroes for functional banks (depending on the variable)
* Filter out inactive banks

In [None]:
# Flip the sign of the transfer amounts and divide them by 3600.
# NOTE(review): if the raw amounts are in seconds, dividing by 3600 yields
# hours (not minutes) — confirm the intended unit.
# The columns of df_out are overwritten, so running this cell twice applies
# the conversion twice.
for amount_col in ('amount_tot', 'amount_per_member'):
    df_out[amount_col] = -1*df_out[amount_col]/3600

In [None]:
# Functional banks: at least one transfer and more than one member.
# .copy() yields an independent frame, so the fills below never write into a
# slice of df_out (SettingWithCopyWarning, and a silent no-op under pandas
# copy-on-write when done through `df[col].fillna(..., inplace=True)`).
df_functional = df_out[(df_out.n_transf_tot > 0.0) & (df_out.n_members > 1)].copy()

# Active banks are selected before any missing-value handling.
df_active = df_functional[(df_functional.pc_active > 0.0)]

# Treat +/-inf as missing by converting them to NaN explicitly. This replaces
# the global pandas option 'use_inf_as_na', which is deprecated.
df_functional = df_functional.replace([float('inf'), float('-inf')], float('nan'))

# Averages are imputed with the column mean over functional banks;
# bank_age_months and threshold get the sentinel value -999.
fill_values = {
    col: df_functional[col].mean()
    for col in ('avg_delay', 'avg_seniority', 'avg_age')
}
fill_values.update({'bank_age_months': -999, 'threshold': -999})
df_functional = df_functional.fillna(value=fill_values)

# df_func_clean is deliberately the same object as df_functional, so cells
# that read df_functional later on see the filled values as well.
df_func_clean = df_functional

# Every remaining missing value becomes 0.
df_func_zeroes = df_func_clean.fillna(0)

In [None]:
# Profile variables used for the histograms and the per-cluster summaries.
# 'amount_tot' and 'n_popular_members' are currently excluded.
variables_desired = [
    'n_members',
    'n_active_members',
    'avg_delay',
    'n_transf_tot',
    'ntransf_per_member',
    'amount_per_member',
    'pc_active',
    'avg_seniority',
    'density',
    'median_centrality',
    'pc_inert',
    'npost_per_member',
]

In [None]:
def plot_histograms(df, variables_histogram, color='#298ba1', alpha=1.0):
    """Draw one 20-bin histogram per variable on a grid with three columns.

    Parameters
    ----------
    df : DataFrame
        Frame indexed by column name; must contain every requested variable.
    variables_histogram : sequence of str
        Column names to plot, one panel each, in the given order.
    color : Matplotlib color
        Bar color passed to ``Axes.hist``.
    alpha : float
        Bar opacity passed to ``Axes.hist``.
    """
    if len(variables_histogram) == 0:
        # Nothing to draw; a grid with zero rows cannot be created.
        return
    n_cols = 3
    rows = ceil(len(variables_histogram)/n_cols)
    fig, ax = plt.subplots(rows, n_cols, figsize=(14, ceil(3.5*rows)))
    ax = ax.ravel()
    for i, key in enumerate(variables_histogram):
        ax[i].hist(df[key], bins=20, color=color, alpha=alpha)
        ax[i].set_title(key)
    # Hide the grid slots left over when the number of variables is not a
    # multiple of three, instead of showing empty axes.
    for unused in ax[len(variables_histogram):]:
        unused.set_visible(False)
        
def plot_histogram_comparisons(dfs, variables_histogram, colors, alpha=0.7, labels=None):
    """Overlay density-normalized histograms of two frames, one panel per variable.

    Parameters
    ----------
    dfs : pair of DataFrame
        The two frames to compare.
    variables_histogram : sequence of str
        Column names to plot, one panel each.
    colors : sequence
        ``colors[0]`` is used for the first frame, ``colors[1]`` for the second.
    alpha : float
        Bar opacity passed to ``Axes.hist``.
    labels : sequence of two ints, optional
        Values used in the output file name
        ``histogram_compare_<labels[0]>_<labels[1]>``. When omitted, the
        notebook-level ``clusters`` list is used, which keeps existing calls
        working.

    The figure is saved to the current working directory.
    """
    df1, df2 = dfs
    if labels is None:
        # Backward compatibility: earlier callers set the global `clusters`
        # before calling and did not pass any labels.
        labels = clusters
    n_cols = 3
    rows = ceil(len(variables_histogram)/n_cols)
    fig, ax = plt.subplots(rows, n_cols, figsize=(14, ceil(3.5*rows)))
    ax = ax.ravel()
    for i, key in enumerate(variables_histogram):
        # Shared bin range so both histograms use identical bin edges.
        # (Named value_range to avoid shadowing the builtin `range`.)
        value_range = [min(df1[key].min(), df2[key].min()),
                       max(df1[key].max(), df2[key].max())]
        ax[i].hist(df1[key], bins=20, range=value_range, color=colors[0], alpha=alpha, density=True)
        ax[i].hist(df2[key], bins=20, range=value_range, color=colors[1], alpha=alpha, density=True)
        ax[i].set_title(key)
    # Hide grid slots that have no variable assigned.
    for unused in ax[len(variables_histogram):]:
        unused.set_visible(False)
    fig.savefig('histogram_compare_%i_%i'%(labels[0],labels[1]))

In [None]:
def abslog(x):
    """Signed base-10 logarithm.

    Returns ``log10(x)`` for ``x > 1``, ``-log10(-x)`` for ``x < -1`` and
    ``0`` for everything else (including values in [-1, 1]).
    """
    magnitude = abs(x)
    if not magnitude > 1:
        return 0
    return log10(magnitude) if x > 0 else -log10(magnitude)

In [None]:
# Variables that get the signed-log transform.
log_normal_vars = ['n_members',
                   'n_active_members',
                   'n_transf_tot',
                   'amount_tot',
                   'ntransf_per_member',
                   'amount_per_member',
                   'avg_seniority']
# Variables that get the square-root transform.
to_sqrt_vars = ['avg_delay', 'pc_active', 'pc_inert', 'npost_per_member']

# Desired variables that receive no transform. A list comprehension keeps the
# order of variables_desired; going through set.difference made the order (and
# therefore the column order downstream) vary between kernel sessions.
transformed_vars = set(log_normal_vars + to_sqrt_vars)
rest_vars = [v for v in variables_desired if v not in transformed_vars]

sqrt_vars = ['sqrt_'+v for v in to_sqrt_vars]
abslog_vars = ['abslog_'+v for v in log_normal_vars]
mixed_vars = sqrt_vars + abslog_vars + rest_vars

In [None]:
# Assemble the transformed feature table column by column.
df_mixed = pd.DataFrame()

# NOTE(review): the author flagged both transforms below as "wrong" without
# recording the reason — revisit before relying on the results.
for name in log_normal_vars:
    print(name)
    # signed log10 of the zero-filled values
    df_mixed['abslog_'+name] = df_func_zeroes[name].apply(abslog)

for name in to_sqrt_vars:
    print(name)
    # square root of the absolute value (the sign is discarded)
    df_mixed['sqrt_'+name] = df_func_zeroes[name].apply(lambda value: sqrt(abs(value)))

# Remaining variables are copied over untransformed.
for name in rest_vars:
    df_mixed[name] = df_func_zeroes[name]

In [None]:
# Histograms of every column of the transformed feature table.
plot_histograms(df_mixed, mixed_vars)

## Clustering banks

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

First use PCA to reduce dimensionality to 3

In [None]:
def use_pca(df, dim=3):
    """Project the rows of ``df`` onto their first ``dim`` principal components.

    Parameters
    ----------
    df : DataFrame
        Numeric feature table; every column is used as a PCA input.
    dim : int
        Number of principal components to keep.

    Returns
    -------
    ndarray
        The transformed data returned by ``PCA.transform``.

    As a side effect, scatter plots of the component pairs (0, 1), (1, 2) and
    (0, 2) are drawn, limited to the pairs that exist for the chosen ``dim``.
    """
    # Use the frame that was passed in; reading the global df_mixed here made
    # the `df` argument a no-op.
    X = df.values
    pca = PCA(n_components=dim, svd_solver='full')
    pca.fit(X)
    X_trans = pca.transform(X)

    # Only plot component pairs available for this `dim` (all three for dim >= 3).
    pairs = [(a, b) for a, b in ((0, 1), (1, 2), (0, 2)) if b < dim]
    if pairs:
        fig, ax = plt.subplots(len(pairs), 1, figsize=(7, 4*len(pairs)), squeeze=False)
        for panel, (a, b) in zip(ax.ravel(), pairs):
            panel.scatter(X_trans[:, a], X_trans[:, b])
    return X_trans

def cluster_three_d(X, n_clusters=3, random_state=0):
    """Cluster the rows of ``X`` with K-means and plot the labelled points.

    Parameters
    ----------
    X : ndarray
        Data to cluster; the plots read columns 0, 1 and 2, so at least three
        columns are required.
    n_clusters : int
        Number of clusters requested from K-means.
    random_state : int or None
        Seed for the K-means initialisation. A fixed default keeps the label
        assigned to each cluster stable between runs, which later cells rely
        on when they refer to clusters by number. Pass ``None`` for an
        unseeded run.

    Returns
    -------
    ndarray
        Cluster label of every row of ``X``.
    """
    y_pred = KMeans(n_clusters=n_clusters, random_state=random_state).fit_predict(X)
    fig, ax = plt.subplots(3, 1, figsize=(7, 12))
    ax = ax.ravel()

    # Same three projections as the PCA scatter plots, colored by label.
    for panel, (a, b) in zip(ax, ((0, 1), (1, 2), (0, 2))):
        panel.scatter(X[:, a], X[:, b], c=y_pred)
    return y_pred

In [None]:
# Reduce the transformed feature table with PCA (default: 3 components).
X_new = use_pca(df_mixed)

## Use Kmeans to cluster
First for 4 clusters, then for 3 clusters

In [None]:
# K-means labels for a 4-cluster solution on the PCA-reduced data.
y_pred_4clusters = cluster_three_d(X_new, n_clusters=4)

In [None]:
# K-means labels for a 3-cluster solution on the PCA-reduced data.
y_pred_3clusters = cluster_three_d(X_new, n_clusters=3)

## Checking the resulting clusters
First for 3 clusters

In [None]:
# Attach the 3-cluster labels to the transformed table and summarise each
# cluster by mean and median.
# NOTE(review): this adds a 'cluster' column to df_mixed itself, so re-running
# the PCA cell after this one would feed the labels into PCA as a feature.
df_mixed['cluster'] = y_pred_3clusters
df_mixed.groupby("cluster").agg(['mean','median'])

In [None]:
# Untransformed desired variables. .copy() makes df_check an independent
# frame, so adding the label column does not write into a slice of
# df_func_zeroes (SettingWithCopyWarning).
df_check = df_func_zeroes[variables_desired].copy()

df_check['cluster'] = y_pred_3clusters
# Per-cluster mean, median and standard deviation, one row per (variable, statistic).
df_check.groupby("cluster").agg(['mean','median','std']).T

Then for 4 clusters

In [None]:
# Swap in the 4-cluster labels. assign() builds a new frame instead of
# writing into df_check in place, which avoids SettingWithCopyWarning when
# df_check is a slice of another frame.
df_check = df_check.assign(cluster=y_pred_4clusters)
# Per-cluster mean, median and standard deviation, one row per (variable, statistic).
df_check.groupby("cluster").agg(['mean','median','std']).T

Normalize each variable to the range [0, 1] with min-max scaling; the log-normal variables and `avg_delay` are first transformed with log10(1 + x)

In [None]:
# Independent copy so that adding the label column does not write into a
# slice of df_func_zeroes (SettingWithCopyWarning).
df_check = df_func_zeroes[variables_desired].copy()
df_check['cluster'] = y_pred_3clusters

# Variables compressed with log10(1 + x) before min-max scaling.
# Hoisted out of the loop; it does not change between iterations.
log_scaled_vars = set(log_normal_vars + ['avg_delay'])

df_normalized = pd.DataFrame()
for v in df_check.columns.drop('cluster'):
    column = df_check[v]
    if v in log_scaled_vars:
        # NOTE: log10(1 + x) raises ValueError for any value <= -1.
        column = column.apply(lambda x: log10(1+x))
    # Min-max scaling to [0, 1]; a constant column yields NaN (0/0).
    df_normalized[v] = (column-column.min())/(column.max()-column.min())

df_normalized['cluster'] = y_pred_3clusters

In [None]:
# Per-cluster summary of the normalized variables.
# NOTE: despite its name, `means` holds the per-cluster medians in this cell.
means = df_normalized.groupby("cluster").median()
# reset_index() turns the 'cluster' index back into a regular first column.
df = means.reset_index()
df

Draw the means in a radar graph

In [None]:
def make_spider(row, no_cluster, title, color, data=None):
    """Draw one radar (spider) panel for a single row of a per-cluster summary.

    Parameters
    ----------
    row : int
        Index label of the row to draw; the panel is placed in subplot slot
        ``row + 1`` of the current figure.
    no_cluster : int
        Number of slots in the ``1 x no_cluster`` subplot grid.
    title : str
        Panel title.
    color : Matplotlib color
        Used for the outline, the fill and the title.
    data : DataFrame, optional
        Summary frame with a 'cluster' column and one column per variable.
        When omitted, the notebook-level ``df`` is used, which keeps existing
        calls working.
    """
    if data is None:
        # Backward compatibility: earlier callers relied on the global `df`.
        data = df

    # One axis per variable; the label column is not a variable.
    categories = [c for c in data.columns if c != 'cluster']
    N = len(categories)

    # Angle of each axis (full circle divided by the number of variables);
    # the first angle is repeated to close the polygon.
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    # Initialise the spider plot
    ax = plt.subplot(1, no_cluster, row+1, polar=True)

    # Put the first axis on top and go clockwise.
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # One labelled spoke per variable.
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, color='black', size=12)

    # Radial labels and limit.
    # NOTE: values above 0.75 fall outside the visible radial range.
    ax.set_rlabel_position(0)
    ax.set_yticks([0,0.25,0.75])
    ax.set_yticklabels(["0","0.25","0.75"], color="grey", size=9)
    ax.set_ylim(0,0.75)

    # Values of the selected row, closed back onto the first point.
    values = data.loc[row].drop('cluster').values.flatten().tolist()
    values += values[:1]
    ax.plot(angles, values, color=color, linewidth=2, linestyle='solid')
    ax.fill(angles, values, color=color, alpha=0.4)

    # Add a title
    ax.set_title(title, size=15, color=color, y=1.1)

In [None]:
# Per-cluster means of the normalized variables, one row per cluster.
means = df_normalized.groupby("cluster").mean()
df = means.reset_index()

n_groups = len(df.index)

# initialize the figure
plt.figure(figsize=(20,10))

# One color per cluster. plt.get_cmap is used because plt.cm.get_cmap was
# removed in Matplotlib 3.9.
my_palette = plt.get_cmap("Set2", n_groups)

# One radar panel per cluster; the grid has exactly as many slots as there
# are clusters (a hard-coded 4 left an empty slot for the 3-cluster solution).
for row in range(n_groups):
    make_spider(row=row, no_cluster=n_groups, title='group '+str(df['cluster'][row]), color=my_palette(row))
plt.subplots_adjust(wspace=0.4)
plt.savefig('cluster_means.png')

In [None]:
# Per-cluster medians of the normalized variables, one row per cluster.
# NOTE: the name `means` is kept for compatibility although it holds medians.
means = df_normalized.groupby("cluster").median()
df = means.reset_index()

n_groups = len(df.index)

# initialize the figure
plt.figure(figsize=(20,10))

# One color per cluster. plt.get_cmap is used because plt.cm.get_cmap was
# removed in Matplotlib 3.9.
my_palette = plt.get_cmap("Set2", n_groups)

# One radar panel per cluster; the grid has exactly as many slots as there
# are clusters (a hard-coded 4 left an empty slot for the 3-cluster solution).
for row in range(n_groups):
    make_spider(row=row, no_cluster=n_groups, title='group '+str(df['cluster'][row]), color=my_palette(row))
plt.subplots_adjust(wspace=0.4)
# NOTE(review): file name looks like a typo for 'cluster_median.png'; left
# unchanged in case something else reads this file.
plt.savefig('cluster_media.png')

In [None]:
# Label every functional bank with its cluster from the 3-cluster solution.
df_functional['cluster'] = y_pred_3clusters

# Persist both tables as pickles in the current working directory.
for target_path, frame in (
    ('functional_kmeans_3clusters.bin', df_functional),
    ('spiderplot_3clusters.bin', df_check),
):
    with open(target_path, 'wb') as out:
        frame.to_pickle(out)

In [None]:
# Compare the variable distributions of clusters 1 and 2.
# `clusters` is also read by plot_histogram_comparisons to build the file name.
clusters = [1, 2]
dfs = [df_functional.loc[df_functional['cluster'] == label] for label in clusters]
plot_histogram_comparisons(
    dfs,
    variables_desired,
    [my_palette(label) for label in clusters],
    alpha=0.8,
)

In [None]:
# Compare the variable distributions of clusters 0 and 2.
# `clusters` is also read by plot_histogram_comparisons to build the file name.
clusters = [0, 2]
dfs = [df_functional.loc[df_functional['cluster'] == label] for label in clusters]
plot_histogram_comparisons(
    dfs,
    variables_desired,
    [my_palette(label) for label in clusters],
    alpha=0.8,
)

In [None]:
def get_typical(df_functional, cluster, variables_desired):
    """Return the banks of one cluster that are close to the cluster mean.

    Starting from all rows with ``df_functional['cluster'] == cluster``, the
    rows are narrowed one variable at a time: for each variable only rows
    within one standard deviation of the mean are kept. Mean and standard
    deviation are computed on the rows that survived the previous variables,
    so the result depends on the order of ``variables_desired``.

    Parameters
    ----------
    df_functional : DataFrame
        Must contain 'cluster', 'bank_name', 'avg_age' and every variable in
        ``variables_desired``.
    cluster : int
        Cluster label to select.
    variables_desired : sequence of str
        Variables used for the filtering and shown in the result.

    Returns
    -------
    DataFrame
        Transposed table: one column per remaining bank, with rows
        'bank_name', the desired variables, and 'avg_age'.
    """
    df_cluster = df_functional[df_functional['cluster'] == cluster]
    for key in variables_desired:
        mean = df_cluster[key].mean()
        std = df_cluster[key].std()
        # Build the mask from the frame being filtered. Masks taken from the
        # global df_out had a different index and different (unfilled) values.
        # between() is inclusive on both ends.
        df_cluster = df_cluster[df_cluster[key].between(mean-std, mean+std)]
    return df_cluster[['bank_name']+list(variables_desired)+['avg_age']].T

In [None]:
# Typical banks of cluster 2: rows that survive get_typical's per-variable
# mean +/- one standard deviation filter.
get_typical(df_functional, 2, variables_desired)

In [None]:
# Atypical banks of cluster 2: at most 10 members, no active members
# (pc_active == 0) and fewer than 5 transfers.
cluster = 2
df_cluster = df_functional[df_functional['cluster'] == cluster]
# Masks are built from df_cluster itself so that their index matches the
# frame being filtered (masks taken from df_out had to be re-aligned).
df_cluster = df_cluster[
    (df_cluster['n_members'] <= 10)
    & (df_cluster['pc_active'] == 0)
    & (df_cluster['n_transf_tot'] < 5)
]

df_cluster[['bank_name']+variables_desired+['avg_age']].T

In [None]:
# Typical banks of cluster 1: rows that survive get_typical's per-variable
# mean +/- one standard deviation filter.
get_typical(df_functional, 1, variables_desired)

In [None]:
# Atypical banks of cluster 1: every bank with a non-zero pc_active, plus the
# bank with bank_id 148.
# NOTE(review): 148 is a hand-picked id; the reason for picking it is not
# recorded here.
cluster = 1
df_cluster = df_functional[df_functional['cluster'] == cluster]
# NOTE: rebinding `df` replaces the per-cluster summary frame that make_spider
# reads by default; re-run the summary cell before drawing radar plots again.
df = df_cluster[(df_cluster['pc_active'] != 0)]
# Mask built from df_cluster (not df_out) so its index matches the frame.
df2 = pd.concat([df, df_cluster[df_cluster['bank_id'] == 148]])

df2[['bank_name']+variables_desired+['avg_age']].T

In [None]:
# Typical banks of cluster 0: rows that survive get_typical's per-variable
# mean +/- one standard deviation filter.
get_typical(df_functional, 0, variables_desired)

In [None]:
# Atypical banks of cluster 0: pc_inert of at most 10.
cluster = 0
df_cluster = df_functional[df_functional['cluster'] == cluster]
# Mask built from df_cluster (not df_out) so its index matches the frame.
df_cluster = df_cluster[df_cluster['pc_inert'] <= 10]
df_cluster[['bank_name']+variables_desired].T


In [None]:
# Particular banks of cluster 0: more than 200 members.
cluster = 0
df_cluster = df_functional[df_functional['cluster'] == cluster]
# Mask built from df_cluster (not df_out) so its index matches the frame.
df_cluster = df_cluster[df_cluster['n_members'] > 200]
df_cluster[['bank_name']+variables_desired].T
