# Cluster morphological profiles from the training set 

In [1]:
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import figure_2 as f2
from scipy import stats
from scipy.spatial import distance

import cpmolgan.utils as utils
import cpmolgan.nearest_neighbors as nn
import cpmolgan.visualization as vi


## Inputs


In [2]:
# Input / output locations used throughout the notebook.
# NOTE(review): the key 'ouput_filename_model' is misspelled ("ouput"), but the
# model-saving cell reads it under exactly this spelling, so it is kept as is.
args = {
    # Normalized training-set profiles (gzip-compressed CSV)
    'filename_train_profiles': "../../data/train_set_30kcpds_normalized_profiles.csv.gz",
    # Destination of the per-SMILES median profiles annotated with clusters
    'output_filename_cluster_profiles': "results/train_set_cluster_profiles_median_per_smiles.csv",
    # Destination of the pickled KMeans model
    'ouput_filename_model': 'results/Kmeans_model.pkl',
}


### 1. Read data

In [3]:
# Load the training profiles (first CSV column becomes the index) and shorten
# the plate / well metadata column names.
train = pd.read_csv(args['filename_train_profiles'], index_col=0).rename(
    columns={"Metadata_Plate": 'Plate', "Metadata_Well": "Well"}
)

# Only the first element returned by the helper is used here
# (presumably the list of feature column names -- confirm in cpmolgan.utils).
feature_cols, _ = utils.get_feature_cols(train)

# Boolean mask selecting the DMSO rows.
dmso_idx = train["Metadata_broad_sample"] == "DMSO"
data_dmso = train.loc[dmso_idx].reset_index(drop=True)

# Non-DMSO rows: drop any row containing a missing value, then order the
# samples by compound, plate and well.
data_cpd = (
    train.loc[~dmso_idx]
    .dropna(axis=0, how="any")
    .sort_values(by=["SMILES_standard", "Plate", "Well"])
    .reset_index(drop=True)
)
unique_smiles = pd.Series(data_cpd.SMILES_standard.unique())

print("No compound samples: %i"% len(data_cpd))
print("No DMSO samples: %i"% len(data_dmso))

No compound samples: 126779
No DMSO samples: 26572


### 2. Calculate median profile per SMILES_standard

In [4]:
# Collapse replicate samples into one median profile per standardized SMILES.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer skips
# non-numeric columns by default, so the median would raise a TypeError on
# string columns such as "Well".
data_cpd_med = (
    data_cpd
    .groupby(by="SMILES_standard")
    .median(numeric_only=True)
    .reset_index()
)
# The median of the plate identifiers carries no meaning, so it is removed.
data_cpd_med = data_cpd_med.drop(columns=["Plate"])
print("No of compound median profiles: %i"%len(data_cpd_med))

No of compound median profiles: 15413


### 3. Enrich compound set with 1% DMSO samples

In [5]:
# Number of DMSO samples to add: 1% of the number of compound median profiles.
N = int(0.01 * len(data_cpd_med))

# Tag the origin of every row so both groups can be told apart after merging.
data_cpd_med["label"] = "Cpd"
dmso_subsample = data_dmso.sample(N, random_state=10).assign(label="DMSO")

# Stack compounds and the DMSO subsample, keeping only the columns present in
# the compound median table, and renumber the rows from 0.
data = pd.concat(
    [data_cpd_med, dmso_subsample.loc[:, data_cpd_med.columns]],
    ignore_index=True,
)

### 4. Fit and save K-means model

In [6]:
Nclusters = 20
metric = "euclidean"

# Fit model
# n_init is pinned to 10, the scikit-learn default before version 1.4 (newer
# releases default to 'auto'), so the fitted clusters do not silently change
# with the installed scikit-learn version.
model = KMeans(n_clusters=Nclusters, n_init=10, random_state=0)
features = data[feature_cols].values
model.fit(features)

# Control cluster = the cluster that contains most of the DMSO samples.
# np.bincount(...).argmax() returns the most frequent label (the smallest one
# on ties, like scipy.stats.mode) and, unlike `stats.mode(...)[0][0]`, does not
# depend on the SciPy version: since SciPy 1.11 `mode` returns a scalar for
# 1-D input and the double indexing raises an error.
clusters = model.predict(features)
is_dmso = (data["label"] == "DMSO").to_numpy()
ctrl_cluster = np.bincount(clusters[is_dmso]).argmax()

# Sort centroids by increasing distance to the control cluster, which is
# expected to become cluster 0 (behavior of figure_2.sort_centroids -- confirm there).
model = f2.sort_centroids(model, ctrl_cluster, metric=metric)

# Save results
model_path = args['ouput_filename_model']
# Create the output folder if needed; `or "."` covers a bare filename.
os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
# The context manager guarantees the file is flushed and closed.
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)

### 5. Run model and save median profiles with annotated clusters

In [7]:
# Assign every profile to its cluster and store it as a "Cluster<k>" label.
cluster_ids = model.predict(data[feature_cols].values)
data["cluster"] = ["Cluster" + str(cluster_id) for cluster_id in cluster_ids]

# Put the non-feature (metadata) columns first, followed by the features.
# The metadata columns keep their DataFrame order: building them through a
# set difference made the saved column order vary from run to run because
# string hashing is randomized per Python process.
feature_set = set(feature_cols)
metadata_cols = [col for col in data.columns if col not in feature_set]
ordered_cols = metadata_cols + list(feature_cols)

output_path = args['output_filename_cluster_profiles']
# Create the output folder if needed; `or "."` covers a bare filename.
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
data[ordered_cols].to_csv(output_path)
print("Saved file %s"%args['output_filename_cluster_profiles'])

Saved file results/train_set_cluster_profiles_median_per_smiles.csv
