In [None]:
!pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:


import rdkit
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Show the installed RDKit version as the cell output.
rdkit.__version__

'2023.03.1'

In [None]:
# Load the table of tested molecules; later cells read its 'SMILES' column.
# NOTE(review): the path is relative to the kernel's working directory, and the saved run
# of this cell ended in FileNotFoundError — confirm where the CSV actually lives
# (e.g. somewhere under the mounted /content/drive) and adjust the path accordingly.
df_smiles = pd.read_csv('tested_molecules-1.csv')
# Preview the first rows with the rich DataFrame display instead of printing the frame as text.
df_smiles.head()

FileNotFoundError: ignored

In [None]:
# Parse one example molecule (the row at position 566) from its SMILES string.
# NOTE(review): MolFromSmiles presumably yields None for an unparsable SMILES — confirm
# and guard here if the input file can contain invalid entries.
mol = Chem.MolFromSmiles(df_smiles.iloc[566]['SMILES'])
print(mol)
# Bare last expression so the notebook displays the molecule object itself.
mol

In [None]:
def getMolDescriptors(mol, missingVal=None):
    ''' Calculate the full list of RDKit descriptors for a molecule.

        Parameters
        ----------
        mol : the molecule object handed to every descriptor function
        missingVal : value stored for a descriptor whose function raised (default None)

        Returns
        -------
        dict mapping descriptor name -> value, with one entry per item
        in Descriptors._descList
    '''
    # imported once per call rather than inside the failure path of every loop iteration
    import traceback

    res = {}
    for nm, fn in Descriptors._descList:
        # some of the descriptor functions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except Exception:
            # `Exception` instead of a bare except, so KeyboardInterrupt / SystemExit
            # still abort a long descriptor run instead of being swallowed.
            # print the error message:
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

In [None]:
# Compute and display the descriptor dictionary for the single molecule held in `mol`.
getMolDescriptors(mol)

In [None]:
# Compute the descriptor dictionary for every SMILES string, one row per molecule.
# Iterating the column directly avoids building a Series per row with iterrows(), and the
# comprehension no longer rebinds the notebook-level `mol` to a plain SMILES string
# (which made earlier cells that expect a molecule object fail when re-run).
allDescrs = [
    getMolDescriptors(Chem.MolFromSmiles(smiles))
    for smiles in df_smiles['SMILES']
]

# One column per descriptor name, rows in the same order as df_smiles.
df_descr = pd.DataFrame(allDescrs)
df_descr.head()

Delete the columns that have only one unique value, i.e. columns in which every value equals the mean and the variance is 0.

In [None]:
# Drop descriptors that carry no information: columns with a single distinct value, and
# columns that are entirely missing (nunique() ignores NaN/None, so those count as 0 and
# were not caught by the previous `== 1` check).
unique_counts = df_descr.nunique()
bad_descr = unique_counts[unique_counts <= 1].index.tolist()
print(bad_descr)
df_nonzero = df_descr.drop(columns=bad_descr)

Remove variables so that no remaining variable has a correlation with another variable higher than the set correlation threshold.

In [None]:
# Correlation filter: a column is dropped when its absolute correlation with any column
# to its left in df_nonzero exceeds the threshold.
corrThreshold = 0.90

correlation_mat = df_nonzero.corr()
# Boolean mask of the strict upper triangle (k=1 leaves out the diagonal), so every
# feature pair is looked at exactly once.
strict_upper = np.triu(np.ones(correlation_mat.shape), k=1).astype(bool)
upper_tri = correlation_mat.where(strict_upper)
# Per column: does any remaining entry exceed the threshold in absolute value?
exceeds = (upper_tri.abs() > corrThreshold).any()
to_drop = exceeds[exceeds].index.tolist()
df_noCorr = df_nonzero.drop(columns=to_drop)
df_noCorr.head()

In [None]:
# Standardise the features before PCA so that descriptors on large numeric scales
# do not dominate the components.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_noCorr)

# PCA cannot extract more components than min(n_samples, n_features); clamp the requested
# 50 so the cell also runs on a small subset or after aggressive feature filtering.
n_components = min(50, *df_scaled.shape)
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(df_scaled)
principalDf = pd.DataFrame(data = principalComponents)
# Fraction of the total variance explained by each component, and its running total.
exp_var_pca = pca.explained_variance_ratio_
cum_sum_eigenvalues = np.cumsum(exp_var_pca)

In [None]:
# Cumulative explained variance: how much variance the first k components capture together.
fig, ax = plt.subplots()
ax.bar(range(1, len(cum_sum_eigenvalues) + 1), cum_sum_eigenvalues, align='center', label='Cumulative explained variance')
ax.set_ylabel('Cumulative variance ratio')
ax.set_xlabel('Number of Components')
ax.legend(loc='best')
fig.tight_layout()
plt.show()

# Scree plot used to decide how many principal components to keep.
fig, ax = plt.subplots()
ax.bar(range(1, len(exp_var_pca) + 1), exp_var_pca, alpha=0.5, align='center', label='Individual explained variance')
ax.set_ylabel('Explained variance ratio')
ax.set_xlabel('Number of Components')
ax.legend(loc='best')
# Finish the second figure the same way as the first, so the cell does not end on a Legend repr.
fig.tight_layout()
plt.show()

In [None]:
def good_correlation(df1, treshold=0.3):
    '''Find feature pairs whose absolute correlation exceeds a threshold.

    Parameters
    ----------
    df1 : DataFrame whose columns are the features to compare
    treshold : cut-off applied to the absolute correlation (default 0.3)

    Returns
    -------
    cm : correlation matrix of df1 with the diagonal set to 0
    corr : list of (feature1, feature2, value) tuples, one for each pair in the
           lower triangle of cm with abs(value) > treshold
    '''
    raw = df1.corr()
    # Work on an explicit copy: writing through `raw.values` relies on it being a writable
    # view, which does not hold when pandas Copy-on-Write is enabled.
    values = raw.to_numpy(copy=True)
    np.fill_diagonal(values, 0)  # a feature's correlation with itself is not of interest
    cm = pd.DataFrame(values, index=raw.index, columns=raw.columns)

    # Only the lower triangle is scanned so each pair is reported once.
    rows, cols = np.where(np.abs(np.tril(values)) > treshold)
    corr = [(cm.index[x], cm.columns[y], values[x, y]) for x, y in zip(rows, cols)]
    return cm, corr


In [None]:
# List the feature pairs in df_noCorr whose absolute correlation exceeds 0.9.
good_correlation(df_noCorr, treshold=0.9)[1]

In [None]:
# Remove features that have low variance.
# The input matrix holds all features (here: df_noCorr).
# NOTE(review): this rebinds df_noCorr from the DataFrame built earlier to whatever
# transform() returns — presumably a plain NumPy array without column names; confirm
# that later cells expect that.
from sklearn.feature_selection import VarianceThreshold

selection = VarianceThreshold(threshold=0.8 * (1 - 0.8))
df_noCorr = selection.fit(df_noCorr).transform(df_noCorr)
