Explore the Biofacquim database against other compound libraries ("the world").
Perform a PCA on the selected numerical descriptors and save the results to a CSV file.

In [1]:
"""Import Libraries"""
import pandas as pd
import numpy as np

import sklearn
from sklearn import datasets, decomposition
from sklearn.preprocessing import StandardScaler

In [2]:
"""Open Database"""
# Load the compound table from a CSV file located relative to the working
# directory, then preview its first five rows.
database_path = "Databases_CABANA_2.csv"
Data = pd.read_csv(database_path, sep=",")
Data.head(5)

Unnamed: 0.1,Unnamed: 0,ID Database,Name,SMILES,HBA,HBD,RB,LogP,TPSA,MW,Heavy Atom,Ring Count,Fraction CSP3,subLibrary,Library
0,0,AfroDb.564,NPR_00036,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC(OC...,7.0,3.0,51.0,22.4139,105.45,1194.003,85.0,85.0,85.0,AFRODB,AFRODB
1,1,AfroDb.71,ABD_UD_004,C[C@H](CCC(O[C@H](C[C@@H]([C@@H]1CC2)[C@H]3[C@...,27.0,14.0,18.0,-3.5356,418.89,1195.309,83.0,83.0,83.0,AFRODB,AFRODB
2,2,AfroDb.70,ABD_UD_003,C[C@H](CC[C@@]([C@H]1C)(OC)O[C@H](C2)[C@H]1[C@...,26.0,14.0,17.0,-2.6588,393.98,1195.353,83.0,83.0,83.0,AFRODB,AFRODB
3,3,AfroDb.937,WA_0086,COc(cc(/C=C/C(OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,11.0,3.0,61.0,16.9936,201.42,1117.597,79.0,79.0,79.0,AFRODB,AFRODB
4,4,AfroDb.936,WA_0085,COc(cc(/C=C/C(OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,8.0,3.0,52.0,17.018,122.52,943.489,67.0,67.0,67.0,AFRODB,AFRODB


In [3]:
# Inspect the column labels of the loaded table; the descriptor columns
# selected in the following cells are picked from this list.
Data.columns

Index(['Unnamed: 0', 'ID Database', 'Name', 'SMILES', 'HBA', 'HBD', 'RB',
       'LogP', 'TPSA', 'MW', 'Heavy Atom', 'Ring Count', 'Fraction CSP3',
       'subLibrary', 'Library'],
      dtype='object')

In [4]:
# Column labels of the numerical descriptors that feed the PCA.
# They are a subset of the labels reported by Data.columns.
numerical_descriptors = [
    "HBA",
    "HBD",
    "RB",
    "LogP",
    "TPSA",
    "MW",
]

In [5]:
# Keep every row but only the descriptor columns listed in numerical_descriptors.
numerical_data = Data.loc[:, numerical_descriptors]

In [6]:
"""Normalize Data"""
# Standardize every descriptor to zero mean and unit variance so that no single
# descriptor dominates the PCA merely because of its scale.
# fit_transform returns a bare NumPy array, so the descriptor names and the row
# index are re-attached to keep the displayed table readable.
numerical_data = pd.DataFrame(
    StandardScaler().fit_transform(numerical_data),
    columns=numerical_data.columns,
    index=numerical_data.index,
)
# No PCA is fitted here: the next cell builds and fits the model that is
# actually used, so a throw-away fit in this cell would only repeat that work.
numerical_data.head()

Unnamed: 0,0,1,2,3,4,5
0,0.450761,0.216244,7.552274,8.573121,0.158183,4.218968
1,6.259774,4.114898,1.956077,-3.033163,4.287426,4.225989
2,5.969324,4.114898,1.786496,-2.641002,3.959263,4.226225
3,1.612563,0.216244,9.248091,6.148815,1.422487,3.808267
4,0.741211,0.216244,7.721856,6.159728,0.383062,2.872394


In [7]:
"""Perform PCA"""
# Fit the PCA on the standardized descriptors, keeping all six components
# (one per input descriptor). svd_solver="full" selects the exact full SVD
# solver, and whiten=True rescales the component scores to unit variance.
n_components = 6
sklearn_pca = sklearn.decomposition.PCA(
    n_components=n_components, svd_solver="full", whiten=True
)
sklearn_pca.fit(numerical_data)

# Project the data onto the fitted components; columns are labelled
# "PC 1" ... "PC 6" to match the component order returned by scikit-learn.
pc_labels = [f"PC {i}" for i in range(1, n_components + 1)]
pca_result = pd.DataFrame(sklearn_pca.transform(numerical_data), columns=pc_labels)
pca_result.head()

Unnamed: 0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6
0,1.727453,-10.481176,-5.764133,5.01082,4.673651,0.762974
1,4.698225,1.221082,4.772969,-1.260738,0.375039,4.067114
2,4.491176,0.917275,4.549075,-1.782514,0.259125,5.05009
3,2.750055,-8.639492,-5.919068,9.498894,5.840558,-2.650968
4,1.833588,-7.986786,-5.872903,7.344716,6.014168,0.041645


In [8]:
"""Merge Names and SMILES"""
# Attach the identifying columns from the source table to the PCA scores.
# The assignment aligns on the row index of both frames.
pca_result["Library"] = Data["Library"]
pca_result["SMILES"] = Data["SMILES"]
pca_result["Name"] = Data["Name"]

# Share of the total variance explained by each component; a and b hold the
# percentages (rounded to two decimals) for the first and second component.
variance = list(sklearn_pca.explained_variance_ratio_)
a, b = [round(ratio * 100, 2) for ratio in variance[:2]]
print(pca_result.head())

       PC 1       PC 2      PC 3      PC 4      PC 5      PC 6 Library  \
0  1.727453 -10.481176 -5.764133  5.010820  4.673651  0.762974  AFRODB   
1  4.698225   1.221082  4.772969 -1.260738  0.375039  4.067114  AFRODB   
2  4.491176   0.917275  4.549075 -1.782514  0.259125  5.050090  AFRODB   
3  2.750055  -8.639492 -5.919068  9.498894  5.840558 -2.650968  AFRODB   
4  1.833588  -7.986786 -5.872903  7.344716  6.014168  0.041645  AFRODB   

                                              SMILES        Name  
0  CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC(OC...   NPR_00036  
1  C[C@H](CCC(O[C@H](C[C@@H]([C@@H]1CC2)[C@H]3[C@...  ABD_UD_004  
2  C[C@H](CC[C@@]([C@H]1C)(OC)O[C@H](C2)[C@H]1[C@...  ABD_UD_003  
3  COc(cc(/C=C/C(OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...     WA_0086  
4  COc(cc(/C=C/C(OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...     WA_0085  


In [9]:
# Write the PCA scores together with the Library, SMILES and Name columns to a
# comma-separated file. The DataFrame index is written too (to_csv default),
# so the file starts with an unnamed index column.
output_path = "Results_PCA.csv"
pca_result.to_csv(output_path, sep=",")