# Performing Dimensionality Reduction

### Code

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import sklearn.cluster as cluster
import numpy as np
import math

import matplotlib.cm as cm
import matplotlib
from matplotlib.widgets import Cursor
import mplcursors
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import matplotlib.patches as mpatches

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram

from sklearn.multioutput import MultiOutputRegressor

# Read the 'LKBPP screen dataset' sheet of the comparison workbook.
LKB = pd.read_excel(
    r'DR_Comparison.xlsx',
    sheet_name='LKBPP screen dataset',
)

# Names of the 28 descriptor columns that are selected from LKB for scaling.
descriptors = [
    'homo', 'lumo', 'pa', 'homo2', 'lumo2', 'pa2', 'he.w.pn', 'nhe',
    'be.gla', 'ml.gla', 'dmd.gla', 'd.d1rgla', 'd.d2rgla', 'drd1rgla',
    'drd2rgla', 'nbo.glaf',
    'be.pd', 'ml.pd', 'dmd.pd', 'd.d1r.pd', 'd.d2r.pd', 'd.rd1rpd',
    'd.rd2rpd', 'nbo.pd.f',
    'dppd.d1', 'dppd.d2', 'dpzn.d1', 'dpzn.d2',
]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Standardise the descriptor columns (StandardScaler: per-column centring and scaling).
descriptor_values = LKB[descriptors]
scaler = StandardScaler().fit(descriptor_values)  # fitted scaler stays available as `scaler`
data_std = scaler.transform(descriptor_values)

In [3]:
# Definition of each dimensionality reduction technique
# (this cell takes a while to run because of the exact t-SNE).

# Import the estimator class directly. The name `umap` is rebound further down
# to the embedding array (name kept so existing uses of it keep working), which
# means `umap.UMAP(...)` raised AttributeError whenever this cell was executed a
# second time in the same kernel. `from umap import UMAP` does not depend on
# what the global name `umap` currently points to, so the cell is re-runnable.
from umap import UMAP

# PCA: keep the first 7 principal components.
pca = PCA(n_components=7).fit_transform(data_std)
pca_df_scale = pd.DataFrame(pca, columns=[f'PC{i}' for i in range(1, 8)])

# t-SNE: 10-dimensional embedding computed with the exact (non Barnes-Hut) method.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=10, verbose=0, perplexity=12, n_iter=5000, learning_rate="auto",
            method='exact', random_state=42).fit_transform(data_std)
tsne_df_scale = pd.DataFrame(tsne, columns=[f'tsne{i}' for i in range(1, 11)])

# UMAP: 10-dimensional embedding.
umap = UMAP(n_neighbors=15, min_dist=0.5, n_components=10, random_state=42).fit_transform(data_std)
umap_df_scale = pd.DataFrame(umap, columns=[f'umap{i}' for i in range(1, 11)])

# One entry per technique; clustering results are added to these dicts later.
DRs = {'PCA': {'DRdf': pca_df_scale},
       'tSNE': {'DRdf': tsne_df_scale},
       'UMAP': {'DRdf': umap_df_scale}}

# Flip the sign of the PC1 scores. DRs['PCA']['DRdf'] is the same object as
# pca_df_scale, so both views see the flipped column; the raw `pca` array is
# left untouched because the column is replaced rather than modified in place.
DRs['PCA']['DRdf']['PC1'] = DRs['PCA']['DRdf']['PC1'] * -1



In [4]:
# Fit a separate 7-component PCA to inspect the loadings (components_).
pca_test = PCA(n_components=7).fit(data_std)

# Loadings table: one row per principal component, one column per descriptor.
df = pd.DataFrame(
    pca_test.components_,
    index=[f'PC{i}' for i in range(1, 8)],
    columns=descriptors,
)

# For each PC of interest, pick the 3 descriptors with the largest absolute loading.
PCs_to_lookat = ['PC1', 'PC2', 'PC3']
abs_loadings = df.abs().T
descr_list = [
    name
    for pc_name in PCs_to_lookat
    for name in abs_loadings.nlargest(3, pc_name).index.to_list()
]

# NOTE(review): the PC1 *scores* stored in DRs are multiplied by -1 in an earlier
# cell, but the PC1 loadings shown here are not - confirm the sign convention.
df.T[PCs_to_lookat].reindex(descr_list)

Unnamed: 0,PC1,PC2,PC3
nbo.pd.f,0.331987,-0.01314,0.000458
be.pd,-0.289376,-0.022994,0.018926
d.d2r.pd,0.252294,0.096756,0.269805
nhe,0.14462,0.290118,0.070537
dmd.gla,0.136769,0.278354,-0.107875
dppd.d1,0.179927,0.27686,-0.038435
drd1rgla,-0.042124,-0.028222,0.372106
d.d2rgla,0.198004,0.034708,0.333314
d.rd1rpd,-0.083706,-0.206554,0.326908


In [5]:
# k-means clustering on a selected DR embedding
def kmeans(k, DR_df_scale):
    """Run k-means on an embedding and score the resulting partition.

    Parameters
    ----------
    k : int
        Number of clusters.
    DR_df_scale : array-like or DataFrame
        Coordinates produced by the selected dimensionality reduction technique.

    Returns
    -------
    tuple
        ``(silhouette_score, labels)`` - the Euclidean silhouette coefficient
        of the clustering and the cluster label assigned to each row.
    """
    model = KMeans(init='k-means++', n_clusters=k, n_init=100, max_iter=400, random_state=42)
    model.fit(DR_df_scale)
    labels = model.labels_
    score = silhouette_score(DR_df_scale, labels, metric='euclidean')
    return score, labels

# NOTE: kmeans() returns a (silhouette score, labels) tuple, not the silhouette score alone.

In [6]:
# Definition of hierarchical clustering for the selected DR:
def hierarchical(n, DR_df_scale):
    """Run Ward agglomerative clustering on an embedding and score it.

    Parameters
    ----------
    n : int
        Number of clusters.
    DR_df_scale : array-like or DataFrame
        Coordinates produced by the selected dimensionality reduction technique.

    Returns
    -------
    tuple
        ``(silhouette_score, labels)`` - the Euclidean silhouette coefficient
        of the clustering and the cluster label assigned to each row.
    """
    # Ward linkage works on Euclidean distances, which is also the estimator's
    # default, so the explicit `affinity='euclidean'` is dropped: that keyword
    # was renamed to `metric` and removed in newer scikit-learn releases, where
    # passing it raises TypeError. Omitting it behaves the same on all versions.
    cluster_hie = AgglomerativeClustering(n_clusters=n, linkage='ward')
    hierarchical_labels = cluster_hie.fit_predict(DR_df_scale)
    hierarchical_scaled_Silhouette_Score = silhouette_score(
        DR_df_scale, hierarchical_labels, metric='euclidean'
    )
    return hierarchical_scaled_Silhouette_Score, hierarchical_labels

In [7]:
# For every embedding, cluster with k = 2..6 using both methods and store the
# silhouette score and labels back into DRs[DR] under
# "<DR>_<method>_<k>_silhouette" / "<DR>_<method>_<k>_labels" keys.
for DR in DRs:
    df_ = DRs[DR]['DRdf']  # same embedding for every k

    for k in range(2, 7):
        kmeans_silhouette, kmeans_labels = kmeans(k, df_)
        hierarchical_silhouette, hierarchical_labels = hierarchical(k, df_)

        DRs[DR].update({
            f"{DR}_kmeans_{k}_silhouette": kmeans_silhouette,
            f"{DR}_hierarchical_{k}_silhouette": hierarchical_silhouette,
            f"{DR}_kmeans_{k}_labels": kmeans_labels,
            f"{DR}_hierarchical_{k}_labels": hierarchical_labels,
        })

In [8]:
# Rows of the summary table, in display order: (clustering method, number of clusters).
silhouette_rows = [('kmeans', 6), ('hierarchical', 6), ('kmeans', 3), ('hierarchical', 3)]

df_silhouette = pd.DataFrame(
    columns=['PCA', 'tSNE', 'UMAP'],
    index=[f"{method} {n_clusters}" for method, n_clusters in silhouette_rows],
)

# Fill one column per DR technique from the scores stored by the clustering sweep.
for DR in DRs:
    stored = DRs[DR]
    df_silhouette[DR] = [
        stored[f"{DR}_{method}_{n_clusters}_silhouette"]
        for method, n_clusters in silhouette_rows
    ]

df_silhouette.round(2)

Unnamed: 0,PCA,tSNE,UMAP
kmeans 6,0.25,0.17,0.39
hierarchical 6,0.22,0.15,0.37
kmeans 3,0.29,0.2,0.41
hierarchical 3,0.24,0.16,0.38
