In [None]:
# First, install deepchem
!pip install --pre deepchem

Collecting deepchem
  Downloading deepchem-2.7.2.dev20231011182505-py3-none-any.whl (879 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/879.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/879.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m879.7/879.7 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit (from deepchem)
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit, deepchem
Successfully installed deepchem-2.7.2.dev20231011182505 rdkit-2023.3.3


In [None]:
# Sanity check: the import fails if the install above did not work, and the
# displayed version string records which deepchem build this notebook ran with.
import deepchem
deepchem.__version__



'2.7.2.dev'

In [None]:
from rdkit import Chem

We want to install NVIDIA's Apex tool for the training pipeline used by `simple-transformers` and Weights and Biases. This package enables us to use 16-bit training, mixed precision, and distributed training without any changes to our code. Generally, GPUs are good at doing 32-bit (single-precision) math, but not at 16-bit (half-precision) or 64-bit (double-precision) math. Therefore, deep learning models are traditionally trained in 32-bit. By switching to 16-bit, we’ll be using half the memory and theoretically less computation, at the expense of the available number range and precision. However, pure 16-bit training creates a lot of problems for us (imprecise weight updates, gradient underflow and overflow). **Mixed precision training, with Apex, alleviates these problems**.

We will be installing `simple-transformers`, a library which builds on top of HuggingFace's `transformers` package, specifically for fine-tuning ChemBERTa.

In [None]:
!git clone https://github.com/NVIDIA/apex
!cd /content/apex
!pip install -v --no-cache-dir /content/apex
!pip install transformers
!pip install simpletransformers
!pip install wandb
!cd ..

Cloning into 'apex'...
remote: Enumerating objects: 11363, done.[K
remote: Counting objects: 100% (3431/3431), done.[K
remote: Compressing objects: 100% (424/424), done.[K
remote: Total 11363 (delta 3159), reused 3068 (delta 3006), pack-reused 7932[K
Receiving objects: 100% (11363/11363), 15.37 MiB | 16.78 MiB/s, done.
Resolving deltas: 100% (7988/7988), done.
Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Processing ./apex
  Running command pip subprocess to install build dependencies
  Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
  Collecting setuptools
    Downloading setuptools-68.2.2-py3-none-any.whl (807 kB)
       ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 807.9/807.9 kB 19.4 MB/s eta 0:00:00
  Collecting wheel
    Downloading wheel-0.41.2-py3-none-any.whl (64 kB)
       ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.8/64.8 kB 9.6 MB/s eta 0:00:00
  Installing collected packages: wheel, setuptools
    Creating /t

In [None]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']
!pip install regex

Cloning into 'bertviz_repo'...
remote: Enumerating objects: 1633, done.[K
remote: Counting objects: 100% (329/329), done.[K
remote: Compressing objects: 100% (118/118), done.[K
remote: Total 1633 (delta 231), reused 225 (delta 211), pack-reused 1304[K
Receiving objects: 100% (1633/1633), 198.37 MiB | 16.22 MiB/s, done.
Resolving deltas: 100% (1073/1073), done.


We're going to clone an auxiliary repository, bert-loves-chemistry, which will enable us to use the MolNet dataloader for ChemBERTa, which automatically generates scaffold splits on any MoleculeNet dataset!

In [None]:
!git clone https://github.com/seyonechithrananda/bert-loves-chemistry.git

Cloning into 'bert-loves-chemistry'...
remote: Enumerating objects: 1566, done.[K
remote: Counting objects: 100% (202/202), done.[K
remote: Compressing objects: 100% (111/111), done.[K
remote: Total 1566 (delta 96), reused 91 (delta 91), pack-reused 1364[K
Receiving objects: 100% (1566/1566), 55.35 MiB | 14.11 MiB/s, done.
Resolving deltas: 100% (1000/1000), done.


First, let's load the model's Byte-Pair Encoding tokenizer and the model itself, and set up a Hugging Face pipeline for masked token prediction.

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline, RobertaModel, RobertaTokenizer
from bertviz import head_view

# Load the pretrained masked-language-model checkpoint and the tokenizer that
# belongs to it; both are identified by the same checkpoint name.
model = AutoModelForMaskedLM.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")

# Bundle model and tokenizer into a 'fill-mask' pipeline object.
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)


Downloading (…)lve/main/config.json:   0%|          | 0.00/515 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/101k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The visualization below shows the attention induced by a sample input SMILES. This view visualizes attention as lines connecting the tokens being updated (left) with the tokens being attended to (right), following the design of the figures above. Color intensity reflects the attention weight; weights close to one show as very dark lines, while weights close to zero appear as faint lines or are not visible at all. The user may highlight a particular SMILES character to see the attention from that token only. This visualization is called the attention-head view. It is based on the excellent Tensor2Tensor visualization tool, and all of the visualizations are generated by the [Bertviz](https://github.com/jessevig/bertviz) library.


In [None]:
from transformers import RobertaModel, RobertaTokenizer
from bertviz import head_view

# Reload the same checkpoint as a RobertaModel, this time requesting attention
# weights via output_attentions=True.
# Note: this rebinds `model` and `tokenizer`, replacing the objects created for
# the fill-mask pipeline in the previous code cell.
model_version = 'seyonec/PubChem10M_SMILES_BPE_450k'
model = RobertaModel.from_pretrained(model_version, output_attentions=True)
tokenizer = RobertaTokenizer.from_pretrained(model_version)

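Let's use the MolNet dataloader to generate scaffold splits from the ClinTox dataset. (Note: the cells that follow instead mount Google Drive and load a JAK3 pIC50 CSV together with ready-made `train.csv` / `test.csv` splits.)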

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every dataset path used below lives
# underneath this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Read the processed JAK3 pIC50 table from the mounted shared drive.
# NOTE(review): the path is an absolute location on one specific shared drive,
# so this cell only runs for users who have access to it.
dataset = pd.read_csv('/content/drive/Shareddrives/1:1 Awani Gadre/Dataset/JAK3_processed_PIC50.csv')
# Preview the first rows as the cell output.
dataset.head()

Unnamed: 0,canonical_smiles,pIC50
0,O=C1NCc2c(-c3ccc(F)cc3F)cc(C3CCNCC3)cc2N1c1c(C...,5.0
1,CC(C)(C)c1nc2c3ccc(F)cc3c3c(=O)[nH]ccc3c2[nH]1,8.30103
2,Cc1cccc(Cl)c1NC(=O)c1cnc(NC(=O)C2CC2)s1,7.356547
3,O=C1Nc2ccc(NC(=O)c3ccccc3)cc2/C1=C/c1ccc[nH]1,6.060481
4,O=C1Nc2ccc(C(=O)O)cc2/C1=C/c1ccc[nH]1,6.67162


In [None]:
from torch.utils.data import DataLoader, TensorDataset
import torch
from torch.nn import functional as F
from transformers import AutoTokenizer, RobertaModel
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Earlier approach, kept for reference: split the in-memory `dataset` frame
# (features X = SMILES, labels y = pIC50) into 70 % train / 30 % test.
# X = dataset['canonical_smiles']
# y = dataset['pIC50']

# Initial train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Current approach: read the train and test splits from separate CSV files on
# the mounted drive instead of splitting here.
train = pd.read_csv('/content/drive/Shareddrives/1:1 Awani Gadre/Dataset/train.csv')
test = pd.read_csv('/content/drive/Shareddrives/1:1 Awani Gadre/Dataset/test.csv')

# SMILES strings are the model inputs; pIC50 is the regression target.
X_train = train['canonical_smiles']
y_train = train['pIC50']
X_test = test['canonical_smiles']
y_test = test['pIC50']

# Define the regression model
class ChemBERTaForRegression(torch.nn.Module):
    """Encoder followed by a small MLP head that predicts one scalar per input.

    The head is ``Linear -> BatchNorm1d -> ReLU -> Dropout -> Linear`` and is
    applied to the encoder's ``pooler_output``.

    Args:
        roberta_model: Encoder module. It is called as
            ``roberta_model(input_ids=..., attention_mask=...)`` and must return
            an object exposing a ``pooler_output`` tensor.
        hidden_dim: Width of the head's hidden layer (default 128).
        dropout: Dropout probability applied after the ReLU (default 0.5).
        freeze_encoder: When True, gradients are disabled for every encoder
            parameter so that only the head is trained (default False).
    """

    def __init__(self, roberta_model, hidden_dim=128, dropout=0.5, freeze_encoder=False):
        super().__init__()
        self.roberta = roberta_model

        if freeze_encoder:
            for param in self.roberta.parameters():
                param.requires_grad = False

        # Take the encoder width from its config rather than hard-coding it, so
        # encoders of other sizes work too; fall back to the previous fixed
        # value of 768 when the encoder carries no config.
        encoder_config = getattr(roberta_model, "config", None)
        encoder_dim = getattr(encoder_config, "hidden_size", 768)

        self.hidden1 = torch.nn.Linear(encoder_dim, hidden_dim)
        self.batch_norm1 = torch.nn.BatchNorm1d(hidden_dim)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(dropout)
        self.regressor = torch.nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and the regression head.

        Args:
            input_ids: Token-ID tensor passed through to the encoder.
            attention_mask: Attention-mask tensor passed through to the encoder.

        Returns:
            Output of the final ``Linear(hidden_dim, 1)`` layer, i.e. one value
            per row of the pooled encoder output.

        Raises:
            ValueError: If the encoder returns no ``pooler_output``.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooler_output = outputs.pooler_output
        if pooler_output is None:
            # Fail with an explicit message instead of an opaque error inside
            # the first Linear layer.
            raise ValueError(
                "The encoder returned no pooler_output; "
                "ChemBERTaForRegression needs an encoder with a pooling layer."
            )
        x = self.hidden1(pooler_output)
        x = self.batch_norm1(x)
        x = self.relu(x)
        x = self.dropout(x)
        regression_output = self.regressor(x)
        return regression_output



# Set up tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")

# Define loss function and learning parameters
loss_fn = torch.nn.MSELoss()
learning_rate = 0.00001
num_epochs = 200
# NOTE(review): the targets are regressed in raw pIC50 units with a very small
# learning rate; if the validation MSE stays high, standardising the targets
# per fold may help -- TODO confirm experimentally.

# Resolve the device once instead of re-evaluating the CUDA check per batch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch so that weight initialisation of the head, dropout and batch
# shuffling are repeatable between runs (the KFold split is already seeded).
torch.manual_seed(42)


def make_dataloader(smiles, targets, shuffle):
    """Tokenize SMILES strings and pair them with their targets in a DataLoader.

    Args:
        smiles: pandas Series of SMILES strings.
        targets: pandas Series of numeric targets, aligned with ``smiles``.
        shuffle: Whether the DataLoader reshuffles the samples every epoch.

    Returns:
        DataLoader over (input_ids, attention_mask, target) with batch size 64;
        targets are float32 with a trailing dimension of size 1 so that they
        match the model output in the loss.
    """
    encodings = tokenizer(list(smiles), return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Explicit float32 keeps the labels compatible with the model output
    # regardless of the dtype pandas inferred for the column.
    labels = torch.tensor(targets.to_numpy(), dtype=torch.float32).unsqueeze(-1)
    tensor_dataset = TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
    return DataLoader(tensor_dataset, batch_size=64, shuffle=shuffle)


# KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = []

for train_index, valid_index in kf.split(X_train):
    train_dataloader = make_dataloader(X_train.iloc[train_index], y_train.iloc[train_index], shuffle=True)
    # Sample order does not matter for evaluation, so no shuffling here.
    valid_dataloader = make_dataloader(X_train.iloc[valid_index], y_train.iloc[valid_index], shuffle=False)

    # Start every fold from the pretrained encoder and a fresh head/optimizer
    # so that nothing learned on one fold leaks into the next.
    model = ChemBERTaForRegression(RobertaModel.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            input_ids_batch, attention_masks_batch, labels_batch = (t.to(device) for t in batch)
            outputs = model(input_ids_batch, attention_mask=attention_masks_batch)
            loss = loss_fn(outputs, labels_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Validation loop
    model.eval()
    fold_preds = []
    fold_labels = []
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids_batch, attention_masks_batch, labels_batch = (t.to(device) for t in batch)
            outputs = model(input_ids_batch, attention_mask=attention_masks_batch)
            fold_preds.append(outputs.cpu())
            fold_labels.append(labels_batch.cpu())

    all_preds = torch.cat(fold_preds).numpy()
    all_labels = torch.cat(fold_labels).numpy()
    mse_scores.append(mean_squared_error(all_labels, all_preds))

print("MSE scores from 5-fold cross validation:", mse_scores)
print("Mean Square Error: ", np.mean(mse_scores))

# Next step (not enabled yet): train on the entire training set.
# Kept as `#` comments instead of a bare triple-quoted string, because a string
# literal that is the last expression of a cell is echoed as the cell's output.
# NOTE(review): before enabling, confirm that the label tensor has the same
# dtype as the model output.
# input_encodings_train = tokenizer(list(X_train), return_tensors="pt", padding=True, truncation=True, max_length=512)
# train_dataset = TensorDataset(input_encodings_train["input_ids"], input_encodings_train["attention_mask"], torch.tensor(y_train).unsqueeze(-1))
# train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)



MSE scores from 5-fold cross validation: [4.8604307, 6.826259, 4.7729263, 5.6686225, 6.764546]
Mean Square Error:  5.778557


'\n# Now train on the entire training set\ninput_encodings_train = tokenizer(list(X_train), return_tensors="pt", padding=True, truncation=True, max_length=512)\ntrain_dataset = TensorDataset(input_encodings_train["input_ids"], input_encodings_train["attention_mask"], torch.tensor(y_train).unsqueeze(-1))\ntrain_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n'

In [None]:
import pandas as pd
from transformers import AutoTokenizer

# Load the train/test splits from the mounted drive.
train = pd.read_csv('/content/drive/Shareddrives/1:1 Awani Gadre/Dataset/train.csv')
test = pd.read_csv('/content/drive/Shareddrives/1:1 Awani Gadre/Dataset/test.csv')

# Inputs are the SMILES strings, targets are the pIC50 values.
X_train, y_train = train['canonical_smiles'], train['pIC50']
X_test, y_test = test['canonical_smiles'], test['pIC50']

# Tokenizer belonging to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")


def _encode_all(smiles_column):
    """Return one list of token IDs (special tokens included, at most 512) per SMILES string."""
    return [
        tokenizer.encode(smile, add_special_tokens=True, max_length=512, truncation=True)
        for smile in smiles_column
    ]


def _ids_to_text(id_lists):
    """Render each list of token IDs as a single space-separated string."""
    return [' '.join(str(token_id) for token_id in ids) for ids in id_lists]


# Token IDs for every molecule.
train_token_ids = _encode_all(X_train)
test_token_ids = _encode_all(X_test)

# Space-separated text form, so that each molecule fits into one CSV field.
train_token_strings = _ids_to_text(train_token_ids)
test_token_strings = _ids_to_text(test_token_ids)

# Pair the token strings with their targets.
train_tokenized_df = pd.DataFrame({'tokens': train_token_strings, 'pIC50': y_train})
test_tokenized_df = pd.DataFrame({'tokens': test_token_strings, 'pIC50': y_test})

# Persist both frames next to the source data, without the index column.
train_tokenized_df.to_csv('/content/drive/Shareddrives/1:1 Awani Gadre/Dataset/train_tokenized.csv', index=False)
test_tokenized_df.to_csv('/content/drive/Shareddrives/1:1 Awani Gadre/Dataset/test_tokenized.csv', index=False)

print("Tokenization and saving to CSV completed!")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Local import: cross_validate scores several metrics in one cross-validation
# pass, so the forest is fitted once per fold instead of once per fold per metric.
from sklearn.model_selection import cross_validate

# Inputs are the space-separated token-ID strings built in the previous cell.
X_train_tokens = train_tokenized_df['tokens']
y_train = train_tokenized_df['pIC50']

X_test_tokens = test_tokenized_df['tokens']
y_test = test_tokenized_df['pIC50']

# Bag-of-tokens features, limited to the 5000 most frequent tokens.
# `str.split` performs the same whitespace split as the former lambda but keeps
# the vectorizer picklable; `token_pattern=None` states explicitly that the
# default regex is not used when a custom tokenizer is supplied, which avoids
# scikit-learn's warning about the unused parameter.
vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None, max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train_tokens)  # vocabulary is learned on the training set only
X_test_bow = vectorizer.transform(X_test_tokens)

# Random Forest regressor on the transformed data
regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# 5-fold cross-validation for MSE and R^2 in a single pass. The folds and the
# fitted estimators are the same for both metrics.
cv_results = cross_validate(
    regressor,
    X_train_bow,
    y_train,
    cv=5,
    scoring=('neg_mean_squared_error', 'r2'),
)
cv_mse_scores = cv_results['test_neg_mean_squared_error']
cv_r2_scores = cv_results['test_r2']

# scikit-learn reports the negated MSE so that "greater is better"; flip the sign back.
mean_cv_mse = -cv_mse_scores.mean()
mean_cv_r2 = cv_r2_scores.mean()

print(f"Cross-validated MSE: {mean_cv_mse}")
print(f"Cross-validated R^2: {mean_cv_r2}")

# Fit the model on the full training data
regressor.fit(X_train_bow, y_train)

# Predictions
y_pred_train = regressor.predict(X_train_bow)
y_pred_test = regressor.predict(X_test_bow)

# MSE and R^2 on the training set (an optimistic estimate: the model has seen these samples)
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)

# MSE and R^2 on the held-out test set
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

print(f"\nTrain MSE: {mse_train}")
print(f"Train R^2: {r2_train}")
print(f"\nTest MSE: {mse_test}")
print(f"Test R^2: {r2_test}")
