In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
import os
import shutil
import csv
import sys

### Functions

In [2]:
def plot_curves(sample_ls, df1):
    """Overlay one curve per requested sample on a single 15x15 figure.

    Parameters
    ----------
    sample_ls : iterable
        Row labels of ``df1``; each one is looked up with ``df1.loc``.
    df1 : pandas.DataFrame
        Frame whose column labels are used as the x values and whose rows
        supply the y values.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    x_labels = np.array(df1.columns)
    plt.figure(figsize=[15, 15])
    for sample in sample_ls:
        # Each curve is labelled with its row label so the legend identifies it.
        plt.plot(x_labels, np.array(df1.loc[sample]), label=sample)
    plt.xticks(rotation=90)
    plt.legend()
    plt.show()

### Setting up directory

In [3]:
# Which dataset to cluster.
mode = 'competition' # either competition or neut
# When True, a later cell restricts the analysis to a hand-picked list of samples.
selection = False

# Input CSV and output directory for every supported mode.
mode_settings = {
    'competition': (
        '/Users/gl2411/Dropbox (EinsteinMed)/Elisa_platform/megan_pcdh1_gngc/10_27_21/compet_elisa_processed_wt4norm/all_predicted_readouts.csv',
        'cluster_competitionElisa_wt4norm',
    ),
    'neut': (
        '/Users/gl2411/Dropbox (EinsteinMed)/Elisa_platform/megan_pcdh1_gngc/10_27_21/neut_processed_wt4norm/all_predicted_readouts.csv',
        'cluster_neut_wt4norm',
    ),
}
if mode not in mode_settings:
    # Fail with an explanatory error instead of a bare, message-less sys.exit().
    raise ValueError(
        'Unknown mode ' + repr(mode) + '; expected one of ' + str(sorted(mode_settings))
    )
infile, outdir = mode_settings[mode]

# Start from an empty output directory: anything left by a previous run is deleted.
if os.path.exists(outdir):
    shutil.rmtree(outdir)
os.mkdir(outdir)

### Loading and Processing input 

In [4]:
# Column stride: only every `everyxcol`-th column of the input table is kept.
everyxcol = 20

# Read the table, strip the 'LONG_' prefix wherever it occurs (regex replace over
# the whole frame), and index the rows by sample name.
df = (
    pd.read_csv(infile)
    .replace('LONG_', '', regex=True)
    .set_index('sample_name')
)
df_short = df.loc[:, df.columns[::everyxcol]]

# Largest value in the thinned table plus 10; used later as the upper y-limit
# of the per-cluster curve plots.
ymax = df_short.max().max() + 10
outpngfile = f'{outdir}/clustering_every{everyxcol}.png'

In [5]:
if selection:
    # Restrict the analysis to a hand-picked subset of samples, in this order.
    sele_ls = ['F83L', 'F83A', 'D85A', 'D85R', 'V86A' , 'F83A-D85A-V86A',
               'D102A', 'D102R', 'K104A', 'K104E', 'T105A', 'D102A-K104A-T105A',
               'I140A', 'T141A', 'D142A', 'D142R', 'L143A', 'V144A', 'Q145A', 
               'I140A-T141A-D142A', 'L143A-V144A-Q145A', 'WT']
    # A list passed to .loc returns the rows in list order and raises KeyError
    # if a requested sample is missing. This replaces the row-by-row
    # DataFrame.append loop; DataFrame.append was removed in pandas 2.0.
    df_short = df_short.loc[sele_ls].copy()
display(df_short)

Unnamed: 0_level_0,-3.12000,-2.73608,-2.35216,-1.96824,-1.58432,-1.20040,-0.81648,-0.43256,-0.04864,0.33528
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A78R,93.3168,93.25232,93.06124,92.49861,90.87368,86.43041,75.89913,57.60303,38.13657,25.7591
A79S,92.29147,92.20232,91.88318,90.75539,86.94349,75.77808,53.23658,29.41307,16.58001,12.02672
D102A,97.18564,96.13713,94.1174,90.33596,83.61905,72.7328,57.46423,39.8846,23.70764,11.61636
D102R,88.12319,88.12198,88.11604,88.08712,87.94628,87.26609,84.10436,71.66499,43.10962,18.26842
D107R,86.18221,86.17252,86.13801,86.01541,85.58232,84.08246,79.22512,66.38063,45.0746,27.27539
D142A,87.31854,87.25658,87.08258,86.59657,85.25867,81.71879,73.25792,57.15284,36.53263,20.267
D142R,88.07775,88.0546,87.9789,87.73195,86.93346,84.42272,77.17399,60.52902,36.50289,17.78517
D80A,90.91268,90.91251,90.91188,90.90945,90.90009,90.86415,90.72647,90.20432,88.29762,82.19399
D85A,111.06938,111.03573,110.93167,110.61096,109.63326,106.74949,99.00921,82.592,60.31479,43.17885
D85R,100.46744,100.45676,100.42533,100.33298,100.06272,99.28147,97.10051,91.56127,80.3225,65.23015


### Clustering

In [6]:
%%capture
sns.set(font_scale=1.4)
g = sns.clustermap(df_short, col_cluster=False, figsize=(10,35), cmap='vlag', \
                   cbar_pos=(0.06, 0.8, 0.05, 0.05), dendrogram_ratio=0.4,
                  method='average')
for a in g.ax_row_dendrogram.collections:
    a.set_linewidth(3)
g.savefig(outpngfile, dpi=300)
# Getting tree
Z = g.dendrogram_row.linkage
hierarchy.dendrogram(Z)

### Cut tree at multiple points, save clustered curves

In [7]:
# Create one output sub-directory per cluster count (2 through 10).
for maxclusct in range(2, 11):
    clusterdir = outdir + '/clusters_' + str(maxclusct)
    # exist_ok replaces the separate exists() check, so re-running the cell is
    # safe and there is no gap between the check and the creation.
    os.makedirs(clusterdir, exist_ok=True)

In [8]:
# Cut the row dendrogram into 2..10 flat clusters. For every cut, save one
# curve plot per cluster and a CSV mapping each sample name to its cluster id.
# `ymax` (shared upper y-limit) and `Z` (row linkage) come from earlier cells.

# The x values are the column labels of df_short parsed as floats; they are the
# same for every cluster, so they are computed once. The builtin `float` is
# used because the `np.float` alias was removed in NumPy 1.24.
x_val = np.array(df_short.columns).astype(float)
x_ticks = np.arange(np.min(x_val), np.max(x_val) + 0.6, 0.2)

for maxclusct in range(2, 11):
    print('Cutting tree into' , maxclusct, 'clusters')
    clusterdir = outdir + '/clusters_' + str(maxclusct)
    # One cluster id per observation of the linkage; positions are used below
    # to select rows of df_short, which assumes Z was built from df_short's
    # rows in their current order.
    clusters = hierarchy.fcluster(Z, maxclusct, criterion='maxclust')
    label_dc = {}
    # np.unique yields the ids in ascending order, so the files and the CSV
    # rows are produced in a deterministic order.
    for clusterid in np.unique(clusters):
        df_x = df_short.iloc[np.flatnonzero(clusters == clusterid)]
        fig, ax = plt.subplots(figsize=(10, 5))
        for labelnm, row in df_x.iterrows():
            label_dc[labelnm] = clusterid
            ax.plot(x_val, row.to_numpy(dtype=float), label=labelnm)
        ax.legend(fontsize=3.)
        ax.set_ylim(0, ymax)
        # Set the tick locations first, then style them through tick_params so
        # the rotation and size apply to every tick label.
        ax.set_xticks(x_ticks)
        ax.tick_params(axis='x', labelrotation=90, labelsize=10.)
        ax.tick_params(axis='y', labelsize=10.)
        fig.savefig(clusterdir + '/clusterid_'+ str(clusterid) + '.png', dpi=300)
        # Close explicitly: many figures are created in this loop.
        plt.close(fig)

    # Save the sample -> cluster id mapping. The context manager closes the
    # file even if writing fails; newline='' is what the csv module expects.
    with open(clusterdir + '/cluster_assignment.csv', 'w', newline='') as f:
        csv.writer(f).writerows(label_dc.items())

Cutting tree into 2 clusters
Cutting tree into 3 clusters
Cutting tree into 4 clusters
Cutting tree into 5 clusters
Cutting tree into 6 clusters
Cutting tree into 7 clusters
Cutting tree into 8 clusters
Cutting tree into 9 clusters
Cutting tree into 10 clusters


In [9]:
# ls = ['WT', 'D85R' , 'Y62A']
# plot_curves(ls, df)

In [10]:
# display(df_short)

In [11]:
# df_short