# **1) Initial Instructions**

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the input CSV
# files read below are located under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change the working directory to the project folder on Drive. The output files
# written later with relative names (df_final.csv, df_to_dock.csv) end up here.
%cd /content/drive/MyDrive/path

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# **2) Ki**

In [None]:
# Load the Ki export. The file is semicolon-separated and only the ten columns
# at the listed positions are kept; later cells access 'Molecule ChEMBL ID',
# 'Smiles', 'Standard Type', 'Standard Value' and 'Action Type' by name, so
# those must be among the selected positions.
data_ki = pd.read_csv(
    '/content/drive/MyDrive/path_ki.csv',
    sep=';',
    usecols=[0, 7, 8, 9, 10, 11, 35, 36, 38, 45],
)

# Working frame used by the following cells.
df_ki = pd.DataFrame(data_ki)

# Displayed as the cell output: (rows, columns) of the loaded table.
df_ki.shape

In [None]:
# Keep only the Ki records whose 'Action Type' is ANTAGONIST or INHIBITOR.
# The explicit copy detaches the result from the unfiltered frame.
df_ki = (
    df_ki
    .loc[lambda frame: frame['Action Type'].isin(['ANTAGONIST', 'INHIBITOR'])]
    .copy()
)

# Displayed as the cell output: shape after filtering.
df_ki.shape

# **3) IC50**

In [None]:
# Load the IC50 export with the same layout as the Ki file: semicolon-separated,
# keeping only the ten columns at the listed positions.
data_ic50 = pd.read_csv(
    '/content/drive/MyDrive/path_ic50.csv',
    sep=';',
    usecols=[0, 7, 8, 9, 10, 11, 35, 36, 38, 45],
)

# Working frame used by the following cells.
df_ic50 = pd.DataFrame(data_ic50)

In [None]:
# Keep only the IC50 records whose 'Action Type' is ANTAGONIST or INHIBITOR.
# The filter starts from the untouched `data_ic50` loaded in the previous cell
# (not from `df_ic50` itself) so that re-running this cell always produces the
# same result instead of halving 'Standard Value' again on every execution.
df_ic50 = data_ic50[data_ic50['Action Type'].isin(['ANTAGONIST', 'INHIBITOR'])].copy()

# Halve the IC50 values. The merge cell below relabels these rows as 'Ki', so
# this presumably applies a Ki ~ IC50 / 2 approximation -- TODO confirm.
df_ic50['Standard Value'] = df_ic50['Standard Value'] / 2

# Displayed as the cell output: shape after filtering.
df_ic50.shape

# **4) Merging the Datasets**

In [None]:
# Stack the Ki rows and the IC50-derived rows into a single table.
# pd.concat appends rows; the outer pd.merge used before joined on every shared
# column, which collapses rows that are identical in both frames and raises
# when a shared column has different dtypes in the two files.
# Row order is now: all Ki rows, followed by all IC50 rows.
df_merge = pd.concat([df_ki, df_ic50], ignore_index=True)

# The IC50 values were already halved in the IC50 section, so those rows are
# relabelled as 'Ki' to give the merged table a single measurement type.
df_merge['Standard Type'] = df_merge['Standard Type'].replace('IC50', 'Ki')

# Records without a measured value cannot be aggregated or labelled later.
df_merge = df_merge.dropna(subset=['Standard Value'])
print(f'Merged df: {df_merge.shape}')

In [None]:
# All rows whose molecule ID occurs more than once (keep=False marks every
# occurrence), i.e. molecules with several measurements.
# The sort must be stable: the default quicksort may reorder rows inside a
# molecule, which would make the 'first' aggregation below pick an arbitrary
# record. 'mergesort' preserves the original row order within each ID.
df_merge_duplicated = (
    df_merge[df_merge.duplicated(subset=['Molecule ChEMBL ID'], keep=False)]
    .sort_values(by='Molecule ChEMBL ID', kind='mergesort')
)

# Collapse every repeated molecule to one row: arithmetic mean of
# 'Standard Value', and the first non-null entry for each remaining column
# (GroupBy 'first' skips missing values, so fields can come from different
# records of the same molecule).
df_aggregated = (
    df_merge_duplicated
    .groupby('Molecule ChEMBL ID', as_index=False)
    .agg({
        'Standard Value': 'mean',
        **{
            col: 'first'
            for col in df_merge_duplicated.columns
            if col not in ['Molecule ChEMBL ID', 'Standard Value']
        },
    })
)

# NOTE: df_merge is overwritten here and from now on holds only the molecules
# with a single measurement (the next section combines it with df_aggregated).
# Re-running this cell without first re-running the merge cell above would find
# no duplicates and leave df_aggregated empty.
df_merge = df_merge.drop_duplicates(subset=['Molecule ChEMBL ID'], keep=False)


print(f'Duplicates count: {df_merge_duplicated.shape[0]}')
print(f'Number of rows after aggregation: {df_aggregated.shape[0]}')
print(f'Number of records after removing duplicates: {df_merge.shape[0]}')

# **5) Dataset Save**

In [None]:
#Preprocessed DataFrame (with labels)

# 'Standard Value' cut-off for the binary label: values <= threshold are class 1.
# The unit is not checked here -- presumably nM, TODO confirm against the
# 'Standard Units' column.
ACTIVITY_THRESHOLD = 50

# Combine the single-measurement molecules (df_merge) with the aggregated
# multi-measurement molecules (df_aggregated). pd.concat appends the rows; the
# outer pd.merge used before joined on every shared column and only worked as a
# concatenation by accident. Sorting by molecule ID keeps the saved file in a
# deterministic order (the outer merge also returned rows sorted by its keys).
df_final = (
    pd.concat([df_merge, df_aggregated], ignore_index=True)
    .sort_values('Molecule ChEMBL ID')
    .reset_index(drop=True)
)

# Each molecule must appear exactly once at this point; a failure means the
# previous cells were run out of order.
assert df_final['Molecule ChEMBL ID'].is_unique, 'duplicate molecule IDs in df_final'

df_final['Bin_Activity'] = np.where(df_final['Standard Value'] <= ACTIVITY_THRESHOLD, 1, 0)

df_final.to_csv('df_final.csv', index=False, sep=',')
df_final.shape

In [None]:
#Csv file for docking (only IDs and smiles)
# Two-column projection of the final table: molecule identifier and SMILES.
df_to_dock = df_final.loc[:, ['Molecule ChEMBL ID', 'Smiles']]

# Written relative to the current working directory, without the index column.
df_to_dock.to_csv(
    'df_to_dock.csv',
    index=False,
    sep=',',
)

# Displayed as the cell output: (rows, 2).
df_to_dock.shape

# **6) Class Distribution Visualization**

In [None]:
# Number of molecules per class, ordered by class label. value_counts() alone
# orders by frequency, so colours assigned by position would switch between the
# classes depending on which one happens to be larger.
classes_distribution = df_final['Bin_Activity'].value_counts().sort_index()

# Fixed colour per class label, independent of the class sizes and of whether
# both classes are present.
class_colors = {0: 'green', 1: 'orange'}
bars_colors = [class_colors[label] for label in classes_distribution.index]

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(classes_distribution.index, classes_distribution.values, color=bars_colors)
# Show ticks only at the class labels instead of a continuous numeric axis.
ax.set_xticks(classes_distribution.index)
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_title('Classes Distribution')
ax.grid()
ax.set_facecolor('#f0f0f0')
plt.show()