In [None]:
# Import modules

import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Load the myotoxicity dataset from its Excel workbook into a dataframe.
# NOTE: the location is an absolute, machine-specific path.

DATA_FILE = r'/home/clodaghboland/MA5118/myotoxicity_data_final.xlsx'
df = pd.read_excel(DATA_FILE)

In [None]:
# Tidy the dataframe: discard the 'Unnamed: 0' and 'Name' columns and use the
# 'Smiles' column as the row index, then preview the first rows.
# This rebinds df, so the cell is meant to run once after the data is loaded.

df = df.drop(columns=['Unnamed: 0', 'Name']).set_index('Smiles')
df.head()

In [None]:
# Separate the 'toxic' output column (y) from the remaining descriptor columns (x)

y = df[['toxic']]
x = df.drop(columns=['toxic'])

In [None]:
# Low-variance filter: drop variables using a variance threshold of 1
# NOTE(review): the threshold is 1, not 0, so this removes more than just the
# zero-variance variables -- confirm that this is the intended cut-off

VARIANCE_CUTOFF = 1
selector = VarianceThreshold(threshold=VARIANCE_CUTOFF)
fitted = selector.fit(x)  # fit() returns the selector itself
new_variables = fitted.get_support()  # boolean mask over the columns of x
reduced = x.loc[:, new_variables]

In [None]:
# Dimensions of the variance-filtered data
reduced.shape

In [None]:
# Preview the first rows of the variance-filtered data
reduced.head()

In [None]:
# Project the variance-filtered data onto its first two principal components
# and store the scores in a dataframe with one column per component.
# NOTE(review): no scaling step is visible before the PCA -- confirm the
# variables are already on comparable scales

pc_columns = ['principal component 1', 'principal component 2']
pca = PCA(n_components=len(pc_columns))
principalComponents = pca.fit_transform(reduced)
principalDf = pd.DataFrame(principalComponents, columns=pc_columns)

In [None]:
# Attach the 'toxic' labels to the principal component scores.
# principalDf carries a default integer index while y is indexed by 'Smiles',
# so the 'Smiles' index is discarded to let concat pair the rows by position.
# reset_index(drop=True) does this in one step without mutating y in place,
# which keeps the cell safe to re-run (an in-place reset followed by
# drop(['Smiles']) raises a KeyError the second time it is executed).

y = y.reset_index(drop=True)
finalDf = pd.concat([principalDf, y], axis=1)
finalDf.head()

In [None]:
# Show up to the first 215 rows of the combined dataframe
# NOTE(review): the plot titles mention 265 chemicals -- confirm 215 is intended
finalDf.head(215)

In [None]:
# Scatter plot of principal component 1 against principal component 2,
# with each point coloured by its 'toxic' value

fig, ax = plt.subplots()
ax.scatter(
    finalDf['principal component 1'],
    finalDf['principal component 2'],
    c=finalDf['toxic'],
    cmap='coolwarm',
)
ax.set_title("2 component PCA: 478 genes, 265 chemicals ", fontsize=15)
ax.set_ylabel("Principal Component 2", fontsize=15)
ax.set_xlabel("Principal Component 1", fontsize=15)
fig.savefig("PCA_all_LVF.png")  # save as png

In [None]:
# Read the explained variance ratio of each of the 2 components from the
# fitted 2-component PCA

explained_variance_ratio = pca.explained_variance_ratio_

In [None]:
# Display the explained variance ratios of the 2 components
explained_variance_ratio

In [None]:
# Fit a second PCA that keeps as many components as are needed to explain
# 95% of the variance, and record how many components that is

pca1 = PCA(n_components=.95).fit(reduced)  # fit() returns the estimator
no_components = pca1.n_components_

In [None]:
# Display the number of components needed for 95% of the variance
no_components

In [None]:
# Plot a dendrogram of the variance-filtered data using Ward's method.
# The y-axis of a dendrogram is the linkage distance at which clusters are
# merged (larger means less similar), so it is labelled as a distance.
# The dendrogram is drawn on its own axes so it can never be overlaid on a
# figure left open by an earlier plot.

fig, ax = plt.subplots()
ward_links = sch.linkage(reduced, method='ward')
dendrogram = sch.dendrogram(ward_links, ax=ax)
ax.set_title("Dendrogram: 478 genes, 265 chemicals", fontsize=15)
ax.set_ylabel("Distance", fontsize=15)
ax.set_xlabel("Chemicals", fontsize=15)
fig.savefig('dendrogram_all_LVF.jpg', format='jpg', bbox_inches='tight')

In [None]:
# Perform agglomerative clustering with Ward linkage, cutting into 3 clusters.
# The distance is left at its default (Euclidean), which is the only distance
# Ward linkage supports. The old 'affinity' keyword is deliberately not
# passed: it was deprecated in scikit-learn 1.2 and removed in 1.4, where
# passing it raises a TypeError.

model = AgglomerativeClustering(n_clusters=3, linkage='ward')
model.fit(reduced)
labels = model.labels_  # cluster id (0, 1 or 2) for each row of `reduced`

In [None]:
# Plot the 3 clusters found by the agglomerative clustering, one colour each.
# Points are positioned using the first two columns of the filtered data.
# NOTE(review): these are the first two raw variables, not the principal
# components -- confirm this is the intended projection for the figure

X = np.array(reduced)
fig, ax = plt.subplots()
for cluster_id, colour in enumerate(['red', 'blue', 'green']):
    members = labels == cluster_id  # boolean mask of rows in this cluster
    ax.scatter(X[members, 0], X[members, 1], s=50, marker='o', color=colour)

ax.set_title("Hieracrhicial Clustering: 478 genes, 265 chemicals", fontsize=15)
# Save before showing: once plt.show() has run the figure may already be
# closed, and a savefig call made afterwards writes a blank image.
fig.savefig("hiercluster_all_final.png")
plt.show()