In [None]:
# Import required modules

import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from sklearn.feature_selection import VarianceThreshold


In [None]:
# Load the dataset from the Excel workbook into a DataFrame.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on a machine where this exact file exists.

excel_path = r'/home/clodaghboland/MA5118/muscle_kidney_final.xlsx'
df = pd.read_excel(excel_path)

In [None]:
# Format dataframe: move the 'Unnamed: 0' column into the row index.
# (presumably this column holds the chemical names; confirm against the workbook)
#
# The guard keeps the cell re-runnable: once 'Unnamed: 0' has become the index
# it is no longer a column, so a second set_index call would raise KeyError.
# On a first run with the column genuinely missing, KeyError is still raised.

if df.index.name != 'Unnamed: 0':
    df = df.set_index('Unnamed: 0')
df.head()

In [None]:
# Separate the descriptors from the output variable.
#   x: every column of df except 'Muscle_Cells'
#   y: a one-column DataFrame containing only 'Muscle_Cells'

x = df.drop(columns=['Muscle_Cells'])
y = df.loc[:, ['Muscle_Cells']]

In [None]:
# Remove low-variance variables.
# The filter uses a variance threshold of 0.5, so it discards more than just
# the zero-variance (constant) columns.

selector = VarianceThreshold(threshold=0.5)
fitted = selector.fit(x)                 # fit() hands back the fitted selector
new_variables = fitted.get_support()     # mask marking the columns that are kept
reduced = x.loc[:, new_variables]
reduced.shape

In [None]:
# Project the variance-filtered data onto its first two principal components.

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(reduced)

# Wrap the component scores in a DataFrame; no index is passed, so the rows
# get a default 0..n-1 index rather than the index of `reduced`.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
# Format the PCA dataframe and append the 'Muscle_Cells' output variable.
#
# principalDf carries a default 0..n-1 index, so y's own index is discarded to
# let concat line the rows up by position.
# reset_index(drop=True) leaves y in the same final state as an in-place
# reset_index followed by dropping 'Unnamed: 0', but it does not mutate y in
# place and the cell can be re-run without raising KeyError.

y = y.reset_index(drop=True)
finalDf = pd.concat([principalDf, y], axis=1)
finalDf.head()

In [None]:
# Scatter plot of the two principal components, coloured by 'Muscle_Cells',
# saved to a PNG file.

plt.scatter(
    finalDf['principal component 1'],
    finalDf['principal component 2'],
    c=finalDf['Muscle_Cells'],
    cmap='coolwarm',
)
plt.title("2 component PCA: 688 genes, 24 chemicals", fontsize=15)
plt.xlabel("Principal Component 1", fontsize=15)
plt.ylabel("Principal Component 2", fontsize=15)
plt.savefig("PCA_muscle_LVF.png")  # write the figure to disk as a PNG

In [None]:
# Read the explained variance ratio of each component from the fitted
# 2-component PCA and display it.

explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio

In [None]:
# Compute the number of principal components required to represent 95% of the
# variance in the variance-filtered data.

pca2 = PCA(n_components=.95).fit(reduced)   # fit() returns the fitted estimator
no_components = pca2.n_components_
no_components

In [None]:
# Plot a dendrogram of the variance-filtered data using hierarchical
# clustering (Ward linkage) and save it as a PNG.

linkage_matrix = sch.linkage(reduced, method='ward')
dendrogram = sch.dendrogram(linkage_matrix)
plt.title("Dendrogram: 688 genes, 24 chemicals", fontsize=15)
# The vertical axis of a dendrogram is the linkage distance at which clusters
# merge (taller link = less alike), so it is labelled as a distance rather
# than a similarity.
plt.ylabel("Distance", fontsize=15)
plt.xlabel("Chemicals", fontsize=10)
plt.savefig("dendrogram_muscle_LVF.png")

In [None]:
# Perform agglomerative hierarchical clustering (Ward linkage, 2 clusters).
# NOTE(review): the model is fitted on `reduced` (all variance-filtered
# variables), not on the two PCA components -- confirm this is intended.
#
# `affinity` is deliberately not passed: the parameter was deprecated in
# scikit-learn 1.2 and removed in 1.4 (renamed `metric`), so passing it raises
# TypeError on current versions. Ward linkage uses Euclidean distance by
# default, so the clustering is unchanged.

model = AgglomerativeClustering(n_clusters=2, linkage='ward')
model.fit(reduced)
labels = model.labels_

In [None]:
# Plot the clusters found by hierarchical clustering, one colour per cluster.
# NOTE(review): the plot axes are the first two columns of `reduced`, not the
# principal components -- confirm this is the intended projection.

X = np.array(reduced)

# One colour per cluster id; only as many colours as there are clusters are
# used (red/blue for the 2-cluster model), and up to five are supported.
cluster_colors = ['red', 'blue', 'green', 'brown', 'black']
for cluster_id, color in zip(sorted(set(labels)), cluster_colors):
    members = labels == cluster_id
    plt.scatter(X[members, 0], X[members, 1], s=50, marker='o', color=color)

plt.title("Hierarchical Clustering: 688 genes, 24 chemicals", fontsize=15)

# Save BEFORE show(): once show() has rendered the figure it is released, and
# a later savefig() would write a blank image.
plt.savefig("hier_muscle_final.png")
plt.show()


In [None]:
# Copy of df with its index moved back into an ordinary column.
df2 = df.reset_index(drop=False)

In [None]:
# Preview the first five rows of df2.
df2.head(n=5)

In [None]:
# Pull the 'Unnamed: 0' column out of df2 as a Series.
df3 = df2.loc[:, 'Unnamed: 0']

In [None]:
# Display the 'Unnamed: 0' Series extracted from df2.
df3