In [2]:
import pandas as pd
from papyrus import Papyrus, PapyrusDataset
from chemutils import generate_ecfp, generate_mol_descriptors
from sklearn.model_selection import train_test_split

from papyrus_scripts.modelling import pcm, qsar
import xgboost
from torch.utils.data import DataLoader
from models.baselines import train_ensemble, train_model, test_model

# Import Uncertainty Toolbox
import uncertainty_toolbox as uct

In [2]:
# RUN FOR THE FIRST TIME ONLY
# papyrus_xc50 = Papyrus(
#     path="data/",
#     chunksize=1000000,
#     accession=None,
#     activity_type=["IC50", "EC50"],
#     protein_class=None,
#     verbose_files=True
# )
# df_xc50 = papyrus_xc50()
# df_xc50.to_csv("data/papyrus_filtered_high_quality_xc50.csv", index=False)
# Column -> dtype mapping handed to pd.read_csv below.
dtypes = dict(
    # Activity-type flag columns are read as strings
    # (the original inline notes list 'int32' as an alternative).
    typeIC50='str',
    typeEC50='str',
    typeKi='str',
    typeKd='str',
    relation='str',
    activityClass='str',
    # Read as a string here; it is split on ";" and converted to floats after loading.
    pchemblValue='str',
    # Aggregated pchembl statistics.
    pchemblValueMean='float64',
    pchemblValueStdDev='float64',
    pchemblValueSEM='float64',
    pchemblValueN='int32',
    pchemblValueMedian='float64',
    pchemblValueMAD='float64',
)
# Load the standardized Papyrus export into df_xc50.
# The first CSV column becomes the index, the first row is the header, and
# column types come from the `dtypes` mapping defined above.
# Optional tweaks left disabled by the original author:
#   - read only the column names first:
#       cols = list(pd.read_csv("data/papyrus_filtered_high_quality_01_standardized.csv", nrows =1))
#   - skip a column on load: usecols =[i for i in cols if i != 'activityClass']
#   - read in pieces: chunksize=100000
df_xc50 = pd.read_csv(
    "data/papyrus_filtered_high_quality_01_standardized.csv",
    header=0,
    index_col=0,
    dtype=dtypes,
    low_memory=False,
)

# "pchemblValue" is read as a string; split it on ";" and convert every piece
# to float so each row holds a list of floats.
# Only real lists (the result of a successful split) are converted. Anything
# else - i.e. a missing value, whatever marker pandas uses for it - is passed
# through untouched. The previous exact-type test (`type(x) != float`) only
# recognised plain-float NaN and would try to iterate other missing markers.
df_xc50["pchemblValue"] = (
    df_xc50["pchemblValue"]
    .str.split(";")
    .apply(
        lambda parts: [float(value) for value in parts]
        if isinstance(parts, list)
        else parts
    )
)

In [3]:
# df_xc50 = pd.read_csv("data/papyrus_filtered_high_quality_xc50.csv")
# df_xc50 = generate_ecfp(df_xc50, 2, 1024, False, False)
# df_xc50.to_csv("data/papyrus_filtered_high_quality_xc50_04_with_ECFP.csv", index=False)

# df_xc50 = generate_mol_descriptors(df_xc50, 'smiles', None)
# df_xc50.to_csv("data/papyrus_filtered_high_quality_xc50_05_with_molecular_descriptors.csv", index=False)

In [4]:
# dtypes = {
#     "typeIC50": 'str', #'int32',
#     "typeEC50": 'str', # 'int32',
#     "typeKi": 'str', # 'int32',
#     "typeKd": 'str', # 'int32',
#     "relation": 'str',
#     'activityClass': 'str',
#     "pchemblValue": 'str',
#     "pchemblValueMean": 'float64',
#     "pchemblValueStdDev": 'float64',
#     "pchemblValueSEM": 'float64',
#     "pchemblValueN": 'int32',
#     "pchemblValueMedian": 'float64',
#     "pchemblValueMAD": 'float64',
# }
#
# import pandas as pd
# df_xc50 = pd.read_csv(
#     "data/papyrus_filtered_high_quality_xc50_04_with_ECFP.csv",
#     index_col=0,
#     header=0,
#     dtype=dtypes,
#     converters={"ECFP": eval,
#                 "pchemblValue": eval}
# )
#     # chunksize=100000,

In [5]:
# Keep only the protein targets with the largest number of datapoints.
N_TOP_TARGETS = 25  # how many of the most-populated targets to retain

# step 1: group the dataframe by protein target (accession)
print(df_xc50.shape)
grouped = df_xc50.groupby('accession')

# step 2: count the number of measurements for each protein target
counts = grouped['accession'].count()

# step 3: sort the counts in descending order
sorted_counts = counts.sort_values(ascending=False)

# step 4: select the N_TOP_TARGETS protein targets with the highest counts
top_targets = sorted_counts.head(N_TOP_TARGETS)

# step 5: filter the original dataframe to only include rows corresponding
# to these N_TOP_TARGETS protein targets
filtered_df = df_xc50[df_xc50['accession'].isin(top_targets.index)]
print(filtered_df.shape)

(452115, 33)
(111574, 33)


In [None]:
# step 6: iterate over the selected protein targets and train one ensemble per target.
# (train_ensemble, PapyrusDataset, DataLoader and train_test_split come from the
# notebook's import cell; they are not re-imported here.)
batch_size = 32  # mini-batch size shared by every DataLoader below
all_ensembles = {}  # accession -> object returned by train_ensemble

for protein_target in top_targets.index:
    # create a new dataframe containing only the rows corresponding to the current protein target
    target_df = df_xc50[df_xc50['accession'] == protein_target]

    # 70% of the rows go to training; the remaining 30% is split in half into
    # validation and test sets. Fixed random_state keeps the split reproducible.
    train, holdout = train_test_split(target_df, test_size=0.3, random_state=42)
    valid, test = train_test_split(holdout, test_size=0.5, random_state=42)

    train_dataset = PapyrusDataset(train)
    valid_dataset = PapyrusDataset(valid)
    test_dataset = PapyrusDataset(test)

    # Only the training data is shuffled. Validation and test loaders iterate in
    # a fixed order so evaluation is deterministic and outputs stay aligned with
    # the dataset rows.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # NOTE(review): input_size=1024 presumably matches the fingerprint length
    # produced upstream - confirm against PapyrusDataset.
    ensemble = train_ensemble(
        train_loader=train_loader,
        val_loader=valid_loader,
        test_loader=test_loader,
        input_size=1024,
        hidden_size1=512,
        hidden_size2=128,
        hidden_size3=32,
        output_size=1,
        num_epochs=3000
    )
    all_ensembles[protein_target] = ensemble

Epoch 100/3000 -- Training Loss: 0.0003, Validation Loss: 0.2830
Epoch 200/3000 -- Training Loss: 0.0001, Validation Loss: 0.2841
Stopped early after 229 epochs
Epoch 100/3000 -- Training Loss: 0.0004, Validation Loss: 0.2843
Epoch 200/3000 -- Training Loss: 0.0001, Validation Loss: 0.2823
Stopped early after 227 epochs
Epoch 100/3000 -- Training Loss: 0.0004, Validation Loss: 0.2887


# Kx

In [3]:
# Build the Papyrus query for the Ki / Kd activity types (accession and
# protein_class are left as None), run it, and save the result as CSV.
# NOTE(review): the recorded output of this cell shows a multi-gigabyte download
# being started - consider running it once and reusing the saved CSV afterwards.
papyrus_kx = Papyrus(
    activity_type=["Ki", "Kd"],
    accession=None,
    protein_class=None,
    path="data/",
    chunksize=1_000_000,
    verbose_files=True,
)
df_kx = papyrus_kx()
df_kx.to_csv("data/papyrus_filtered_high_quality_kx.csv", index=False)

Latest version: 05.6
Number of files to be downloaded: 10
Total size: 8.25GB


Downloading version 05.6:   0%|          | 0.00/8.25G [00:00<?, ?B/s]

  0%|          | 0/60 [00:00<?, ?it/s]



In [None]:
# df_kx = pd.read_csv("data/papyrus_filtered_high_quality_kx.csv")
#
# df_kx = generate_ecfp(df_kx, 2, 1024, False, False)
# df_kx.to_csv("data/papyrus_filtered_high_quality_kx_04_with_ECFP.csv", index=False)
#
# df_kx = generate_mol_descriptors(df_kx, 'smiles', None)
# df_kx.to_csv("data/papyrus_filtered_high_quality_kx_05_with_molecular_descriptors.csv", index=False)

In [None]:
# Keep only the protein targets with the largest number of datapoints (Kx data).
N_TOP_TARGETS = 25  # how many of the most-populated targets to retain

# step 1: group the dataframe by protein target (accession)
grouped = df_kx.groupby('accession')

# step 2: count the number of measurements for each protein target
counts = grouped['accession'].count()

# step 3: sort the counts in descending order
sorted_counts = counts.sort_values(ascending=False)

# step 4: select the N_TOP_TARGETS protein targets with the highest counts
top_targets = sorted_counts.head(N_TOP_TARGETS)

# step 5: filter the original dataframe to only include rows corresponding
# to these N_TOP_TARGETS protein targets
filtered_df = df_kx[df_kx['accession'].isin(top_targets.index)]


In [None]:
# step 6: iterate over the 25 protein targets
for protein_target in top_targets.index:
    # create a new dataframe containing only the rows corresponding to the current protein target
    # This is the Kx section: top_targets was computed from df_kx grouped by
    # 'accession', so rows must be selected from df_kx on that same column
    # (not from df_xc50 / a 'protein_target' column).
    target_df = df_kx[df_kx['accession'] == protein_target]