In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from decimal import Decimal, ROUND_HALF_UP

***1) Ki section***

In [None]:
# Load the Ki export; the file is a semicolon-delimited CSV.
imported_db_Ki = pd.read_csv(
    filepath_or_buffer=r'/filepath1/file.csv',
    delimiter=';',
)

In [None]:
# Select the columns of interest and normalise their names (spaces -> underscores).
list_of_columns = ['Molecule ChEMBL ID', 'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
                   'Standard Units', 'Document ChEMBL ID', 'Target Name', 'Action Type']

# Plain column indexing raises a KeyError when the import lacks an expected
# column. Building the frame with pd.DataFrame(data=..., columns=...) would
# instead create an all-NaN column, and the later dropna() would then silently
# discard every row.
# rename() returns a new frame, so nothing is modified in place and the cell
# can be re-run safely.
df_Ki = imported_db_Ki[list_of_columns].rename(
    columns={
        'Molecule ChEMBL ID': 'Molecule_ChEMBL_ID',
        'Standard Type': 'Standard_Type',
        'Standard Relation': 'Standard_Relation',
        'Standard Value': 'Standard_Value',
        'Standard Units': 'Standard_Units',
        'Document ChEMBL ID': 'Document_ChEMBL_ID',
        'Action Type': 'Action_Type',
        'Target Name': 'Target_Name',
    }
)

In [None]:
# Keep only the Ki records that have a value in every selected column
df_dropped_Ki = df_Ki.dropna(axis='index', how='any')

In [None]:
# Narrow 'df_dropped_Ki' in two successive steps:
#   1. keep records whose 'Action_Type' is 'ANTAGONIST' or 'BLOCKER';
#   2. of those, keep records whose 'Standard_Value' is less than or equal to 50.
# NOTE(review): 'Standard_Units' and 'Standard_Relation' are not checked here -
# confirm that every remaining value is an exact measurement in the same unit.
df_dropped_Ki = (
    df_dropped_Ki
    .loc[lambda frame: frame['Action_Type'].isin(['ANTAGONIST', 'BLOCKER'])]
    .loc[lambda frame: frame['Standard_Value'].le(50)]
)

***2) IC50 section***

In [None]:
# Load the IC50 export; the file is a semicolon-delimited CSV.
# NOTE(review): this path literal is identical to the one used for the Ki
# import - confirm that it points at the IC50 file.
imported_db_IC50 = pd.read_csv(
    filepath_or_buffer=r'/filepath1/file.csv',
    delimiter=';',
)

In [None]:
# Select the columns of interest and normalise their names (spaces -> underscores).
list_of_columns = ['Molecule ChEMBL ID', 'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
                   'Standard Units', 'Document ChEMBL ID', 'Target Name', 'Action Type']

# Plain column indexing raises a KeyError when the import lacks an expected
# column. Building the frame with pd.DataFrame(data=..., columns=...) would
# instead create an all-NaN column, and the later dropna() would then silently
# discard every row.
# rename() returns a new frame, so nothing is modified in place and the cell
# can be re-run safely.
df_IC50 = imported_db_IC50[list_of_columns].rename(
    columns={
        'Molecule ChEMBL ID': 'Molecule_ChEMBL_ID',
        'Standard Type': 'Standard_Type',
        'Standard Relation': 'Standard_Relation',
        'Standard Value': 'Standard_Value',
        'Standard Units': 'Standard_Units',
        'Document ChEMBL ID': 'Document_ChEMBL_ID',
        'Action Type': 'Action_Type',
        'Target Name': 'Target_Name',
    }
)

In [None]:
# Keep only the IC50 records that have a value in every selected column
df_dropped_IC50 = df_IC50.dropna(axis='index', how='any')

In [None]:
# Narrow 'df_dropped_IC50' in two successive steps:
#   1. keep records whose 'Action_Type' is 'ANTAGONIST' or 'BLOCKER';
#   2. of those, keep records whose 'Standard_Value' is less than or equal to 50.
# NOTE(review): 'Standard_Units' and 'Standard_Relation' are not checked here -
# confirm that every remaining value is an exact measurement in the same unit.
df_dropped_IC50 = (
    df_dropped_IC50
    .loc[lambda frame: frame['Action_Type'].isin(['ANTAGONIST', 'BLOCKER'])]
    .loc[lambda frame: frame['Standard_Value'].le(50)]
)

In [None]:
# Relabel the IC50 records as Ki and halve their values, using the Ki-IC50
# conversion factor of 2 proposed by Kalliokoski et al. in
# "Comparability of Mixed IC50 Data – A Statistical Analysis".
#
# assign() returns a new frame. The previous attribute-style assignment
# (df.Standard_Type = ...) wrote onto a frame produced by boolean filtering,
# which triggers SettingWithCopyWarning, and it would silently create a plain
# Python attribute if the column name were ever misspelled.
#
# NOTE: this cell is not idempotent - re-running it without re-running the
# filtering cell halves 'Standard_Value' again.
df_dropped_IC50 = df_dropped_IC50.assign(
    Standard_Type='Ki',
    Standard_Value=lambda frame: frame['Standard_Value'] / 2,
)

***3) Merging dataframes***

In [None]:
# Stack the IC50-derived records on top of the Ki records (row-wise).
# The original row labels of both frames are kept.
output_df = pd.concat(objs=[df_dropped_IC50, df_dropped_Ki], axis=0)

In [None]:
# Collapse duplicate molecules (same 'Molecule_ChEMBL_ID') into a single record:
# 'Standard_Value' becomes the arithmetic mean of the group, and every other
# column takes the value from the first record of the group.
features = [
    column
    for column in output_df.columns
    if column != 'Standard_Value' and column != 'Molecule_ChEMBL_ID'
]

agg_dict = {'Standard_Value': 'mean', **dict.fromkeys(features, 'first')}

optimized_output_df = (
    output_df
    .groupby('Molecule_ChEMBL_ID')
    .agg(agg_dict)
    .reset_index()  # turn the group key back into an ordinary column
)


In [None]:
# Round 'Standard_Value' to three decimal places, with ties rounded half up.
# Each value goes through its string form into Decimal, so after this cell the
# column holds Decimal objects rather than floats.
optimized_output_df['Standard_Value'] = optimized_output_df['Standard_Value'].map(
    lambda value: Decimal(str(value)).quantize(
        Decimal('0.001'),
        rounding=ROUND_HALF_UP,
    )
)

In [166]:
# Put the columns into their final order, then display the resulting frame
optimized_output_df = optimized_output_df.reindex(
    columns=[
        'Molecule_ChEMBL_ID',
        'Smiles',
        'Standard_Type',
        'Standard_Relation',
        'Standard_Value',
        'Standard_Units',
        'Document_ChEMBL_ID',
        'Target_Name',
        'Action_Type',
    ],
)
optimized_output_df

Unnamed: 0,Molecule_ChEMBL_ID,Smiles,Standard_Type,Standard_Relation,Standard_Value,Standard_Units,Document_ChEMBL_ID,Target_Name,Action_Type
0,CHEMBL111,Cc1c(C(=O)NN2CCCCC2)nn(-c2ccc(Cl)cc2Cl)c1-c1cc...,Ki,'=',10.8,nM,CHEMBL1140299,Cannabinoid CB1 receptor,ANTAGONIST
1,CHEMBL334533,CCCCCCC(C)(C)c1cc(O)c2c(c1)OC(C)(C)[C@H]1CC=C(...,Ki,'=',19.0,nM,CHEMBL1140299,Cannabinoid CB1 receptor,ANTAGONIST
2,CHEMBL4781965,COc1ccc(-c2cc(C(=O)NC3CCCCCC3)c(=O)n(Cc3ccc(F)...,Ki,'=',9.8,nM,CHEMBL4706601,Cannabinoid CB1 receptor,ANTAGONIST
3,CHEMBL4856192,CCCCCn1cc(C(=O)c2ccc3c(c2)CC(C)(C)C3)c2ccccc21,Ki,'=',38.8,nM,CHEMBL4825669,Cannabinoid CB1 receptor,ANTAGONIST
4,CHEMBL4860950,CCCCCn1cc(C(=O)c2cccc3c2CC(C)(C)C3)c2ccccc21,Ki,'=',1.46,nM,CHEMBL4825669,Cannabinoid CB1 receptor,ANTAGONIST
5,CHEMBL5070349,CS/C(N)=N/C(=N/S(=O)(=O)N1CCC(C(F)(F)F)CC1)N1C...,Ki,'=',0.85,nM,CHEMBL5046268,Cannabinoid CB1 receptor,ANTAGONIST
6,CHEMBL5071344,N/C(=N\C(=N\S(=O)(=O)N1CCC(C(F)(F)F)CC1)N1CC(c...,Ki,'=',4.1,nM,CHEMBL5046268,Cannabinoid CB1 receptor,ANTAGONIST
7,CHEMBL5073465,C/C(N)=N/C(=N/S(=O)(=O)N1CCC(C(F)(F)F)CC1)N1CC...,Ki,'=',3.9,nM,CHEMBL5046268,Cannabinoid CB1 receptor,ANTAGONIST
8,CHEMBL5075684,CCN(CC)S(=O)(=O)/N=C(/N=C(\N)c1ccc(F)cc1)N1CC(...,Ki,'=',5.5,nM,CHEMBL5046268,Cannabinoid CB1 receptor,ANTAGONIST
9,CHEMBL5076422,CS/C(N)=N\C(=N\S(=O)(=O)N1CCC(F)(F)CC1)N1CC(c2...,Ki,'=',14.0,nM,CHEMBL5046268,Cannabinoid CB1 receptor,ANTAGONIST


In [None]:
# Output dataframe saving
# optimized_output_df.to_csv('/output_directory_path/Filtered_complete_output_df.csv', index=False)