In [124]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [105]:
df = pd.read_csv('IC50.csv')

In [106]:
process_df = df[df.standard_value.notna()]
process_df =  process_df[process_df.canonical_smiles.notna()]

In [107]:
# dropping rows where the chemical compound is same.
process_df = process_df.drop_duplicates(['canonical_smiles'])

In [121]:
bioactivity_threshold = []
for i in process_df.standard_value:
  if float(i) >= 10000:
    bioactivity_threshold.append("inactive")
  elif float(i) <= 1000:
    bioactivity_threshold.append("active")
  else:
    bioactivity_threshold.append("intermediate")
process_df['class'] = bioactivity_threshold

In [108]:
col_to_drop = ['activity_properties', 'ligand_efficiency', 'bao_endpoint', 'assay_variant_accession', 'assay_variant_mutation',
               'standard_upper_value', 'target_chembl_id', 'target_organism', 'target_pref_name', 'target_tax_id', 'text_value', 'toid',
               'qudt_units', 'src_id', 'upper_value', 'uo_units', 'bao_format', 'relation', 'standard_relation', 'record_id', 'standard_units', 'units', 'value', 'document_year',
               'potential_duplicate', 'pchembl_value', 'standard_text_value', 'standard_type', 'activity_comment', 'data_validity_comment', 'data_validity_description']
df_final = process_df.drop(col_to_drop, axis = 1) 

In [109]:
df_final.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,bao_label,canonical_smiles,document_chembl_id,document_journal,document_year,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,standard_flag,standard_value,type
0,33969,CHEMBL643384,Inhibitory concentration against acetylcholine...,B,single protein format,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,CHEMBL1148382,J. Med. Chem.,2004.0,CHEMBL133897,,CHEMBL133897,1,750.0,IC50
1,37563,CHEMBL643384,Inhibitory concentration against acetylcholine...,B,single protein format,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,CHEMBL1148382,J. Med. Chem.,2004.0,CHEMBL336398,,CHEMBL336398,1,100.0,IC50
2,37565,CHEMBL643384,Inhibitory concentration against acetylcholine...,B,single protein format,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,CHEMBL1148382,J. Med. Chem.,2004.0,CHEMBL131588,,CHEMBL131588,1,50000.0,IC50
3,38902,CHEMBL643384,Inhibitory concentration against acetylcholine...,B,single protein format,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,CHEMBL1148382,J. Med. Chem.,2004.0,CHEMBL130628,,CHEMBL130628,1,300.0,IC50
4,41170,CHEMBL643384,Inhibitory concentration against acetylcholine...,B,single protein format,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,CHEMBL1148382,J. Med. Chem.,2004.0,CHEMBL130478,,CHEMBL130478,1,800.0,IC50


In [112]:
df_final['standard_flag'] = df_final['standard_flag'].astype(float)
df_final['standard_value'] = df_final['standard_value'].astype(float)

In [118]:
input_features = list(df_final.columns)
input_features.remove('standard_value')
output_features = ['standard_value']
for single_col in input_features:
  if single_col in set(df_final.select_dtypes('object').columns):
    df_final[single_col] = LabelEncoder().fit_transform(df_final[single_col])

In [114]:
df_final = df_final.dropna()

In [122]:
def print_stats(df, *args, **kwargs):
    print('*' * 40)
    print('Our Dataset has {} Rows and {} Columns'.format(df.shape[0], df.shape[1]))
    print("-" * 85)
    display(df.info())
    print("-" * 85)
    print('Total Missing Value in each Columns')
    display(df.isna().sum())
    print('Common Stats of each columns')
    display(df.describe(include=['int', 'float']))

In [123]:
print_stats(df_final)

****************************************
Our Dataset has 5799 Rows and 15 Columns
-------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5799 entries, 0 to 8388
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   activity_id                5799 non-null   int64  
 1   assay_chembl_id            5799 non-null   int64  
 2   assay_description          5799 non-null   int64  
 3   assay_type                 5799 non-null   int64  
 4   bao_label                  5799 non-null   int64  
 5   canonical_smiles           5799 non-null   int64  
 6   document_chembl_id         5799 non-null   int64  
 7   document_journal           5799 non-null   int64  
 8   document_year              5799 non-null   float64
 9   molecule_chembl_id         5799 non-null   int64  
 10  molecule_pref_name         5799 non-null   int64

None

-------------------------------------------------------------------------------------
Total Missing Value in each Columns


activity_id                  0
assay_chembl_id              0
assay_description            0
assay_type                   0
bao_label                    0
canonical_smiles             0
document_chembl_id           0
document_journal             0
document_year                0
molecule_chembl_id           0
molecule_pref_name           0
parent_molecule_chembl_id    0
standard_flag                0
standard_value               0
type                         0
dtype: int64

Common Stats of each columns


Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,bao_label,canonical_smiles,document_chembl_id,document_journal,document_year,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,standard_flag,standard_value,type
count,5799.0,5799.0,5799.0,5799.0,5799.0,5799.0,5799.0,5799.0,5799.0,5799.0,5799.0,5799.0,5799.0,5799.0,5799.0
mean,11501630.0,274.949991,291.077944,0.980169,3.559924,2913.014313,230.621142,7.086567,2011.615451,2910.819969,257.722193,2855.97603,0.998793,2599642000000.0,0.135196
std,8136197.0,183.66483,143.066457,0.150152,1.14843,1680.607997,147.150006,2.906753,7.725951,1678.413419,29.775478,1660.035359,0.034725,107042500000000.0,0.589959
min,33969.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1986.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2389036.0,113.0,169.0,1.0,4.0,1458.5,107.0,5.0,2008.0,1458.5,263.0,1416.5,1.0,143.85,0.0
50%,14553010.0,253.0,285.0,1.0,4.0,2914.0,220.0,6.0,2014.0,2912.0,263.0,2851.0,1.0,2350.0,0.0
75%,18385660.0,411.0,407.0,1.0,4.0,4366.5,352.0,10.0,2017.0,4363.5,263.0,4293.5,1.0,17000.0,0.0
max,23311840.0,625.0,538.0,2.0,5.0,5823.0,484.0,16.0,2021.0,5823.0,263.0,5733.0,1.0,5888437000000000.0,3.0


In [126]:
model_metric = {}
X = df_final[input_features]
y = df_final[output_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [132]:
def fit_predict(model, X_train, X_test, y_train, y_test, model_name, *args, **kwargs):
    global model_metric
    print(f'Starting Training of Model {model_name}')
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    model_metric[model_name] = {'Number of Features': len(X_train.columns),
                                'Features Used': list(X_train.columns),
                                'R2 Score': r2_score(y_test, y_predicted),
                                'Mean Square Error': mean_squared_error(y_test, y_predicted),
                                'Mean Absolute Error': mean_absolute_error(y_test, y_predicted)}
    print(f'\nCalculated Regression Metric for model {model_name}')
    print('R2 Score -->', model_metric[model_name]['R2 Score'])
    print('Mean Square Error -->', model_metric[model_name]['Mean Square Error'])
    print('Mean Absolute Error -->', model_metric[model_name]['Mean Absolute Error'])
    print('\n')
    print(f'Completed Training of the Model {model_name}')
    print('*' * 40)
    return model

In [133]:
lr_model = LinearRegression()
lr_model = fit_predict(lr_model, X_train, X_test, y_train, y_test, 'Linear Regression')

Starting Training of Model Linear Regression

Calculated Regression Metric for model Linear Regression
R2 Score --> 0.0027043890111021485
Mean Square Error --> 1.2131678127948133e+28
Mean Absolute Error --> 11810834703566.807


Completed Training of the Model Linear Regression
****************************************


In [134]:
support_vector = SVR()
support_vector = fit_predict(support_vector, X_train, X_test, y_train, y_test, 'Support Vector Regression')

Starting Training of Model Support Vector Regression


  y = column_or_1d(y, warn=True)



Calculated Regression Metric for model Support Vector Regression
R2 Score --> -0.0007089749484465724
Mean Square Error --> 1.2173200252817132e+28
Mean Absolute Error --> 2936729399099.173


Completed Training of the Model Support Vector Regression
****************************************


In [135]:
random_forest = RandomForestRegressor()
random_forest = fit_predict(random_forest, X_train, X_test, y_train, y_test, f'Random Forest Regression')

Starting Training of Model Random Forest Regression


  model.fit(X_train, y_train)



Calculated Regression Metric for model Random Forest Regression
R2 Score --> 0.1356949410122269
Mean Square Error --> 1.0513904467703123e+28
Mean Absolute Error --> 4436967889779.154


Completed Training of the Model Random Forest Regression
****************************************
