In [2]:
#standard package
import csv
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances


import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import math
from tqdm.auto import tqdm



In [3]:
def create_dict_from_csv(file_path):
    result_dict = {}
    All_id = []
    All_features =[]
    with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if len(row) >= 2:
                cid = row[0].strip()
                feature = row[1].strip()
                result_dict[cid] = feature
                All_id.append(cid)
                All_features.append(feature)
    return result_dict, All_id, All_features


In [4]:
Drug_Data, Drug_cid,Drug_Smiles=create_dict_from_csv('drug_info.csv')
Drug_cid = Drug_cid[1:]
Drug_Smiles =Drug_Smiles[1:]
NUM_Drugs= len(Drug_Smiles)
print(Drug_cid[0],':',Drug_Smiles[0])


Protein_Data, Protein_name, Protein_AA=create_dict_from_csv('prot_info.csv')
Protein_name = Protein_name[1:]
Protein_AA =Protein_AA[1:]
NUM_Protein= len(Protein_AA)
print(Protein_name[0],':',Protein_AA[0])

print('Number of Drugs=', NUM_Drugs, 'Number of Proteins=', NUM_Protein)


11485656 : CC1=CC(=C(C=C1)F)NC(=O)NC2=CC=C(C=C2)C3=C4C(=CC=C3)NN=C4N
AAA61480.1|CLK1|CLK1 : MRHSKRTYCPDWDDKDWDYGKWRSSSSHKRRKRSHSSAQENKRCKYNHSKMCDSHYLESRSINEKDYHSRRYIDEYRNDYTQGCEPGHRQRDHESRYQNHSSKSSGRSGRSSYKSKHRIHHSTSHRRSHGKSHRRKRTRSVEDDEEGHLICQSGDVLSARYEIVDTLGEGAFGKVVECIDHKAGGRHVAVKIVKNVDRYCEAARSEIQVLEHLNTTDPNSTFRCVQMLEWFEHHGHICIVFELLGLSTYDFIKENGFLPFRLDHIRKMAYQICKSVNFLHSNKLTHTDLKPENILFVQSDYTEAYNPKIKRDERTLINPDIKVVDFGSATYDDEHHSTLVSTRHYRAPEVILALGWSQPCDVWSIGCILIEYYLGFTVFPTHDSKEHLAMMERILGPLPKHMIQKTRKRKYFHHDRLDWDEHSSAGRYVSRACKPLKEFMLSQDVEHERLFDLIQKMLEYDPAKRITLREALKHPFFDLLKKSI
Number of Drugs= 72 Number of Proteins= 442


In [5]:
def protein_embedding(protein_sequences):
    """
    Embed a list of protein sequences (strings of amino acids) using ProtBert.

    Args:
        protein_sequences (list of str): List of protein amino acid sequences.

    Returns:
        embeddings (torch.Tensor): Embeddings of shape (batch_size, sequence_length, hidden_size).
    """

    torch.cuda.empty_cache()
    # Load the ProtBert tokenizer and mode
    model_name = "Rostlab/prot_bert" #"facebook/esm2_t6_8M_UR50D"
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
    model = AutoModel.from_pretrained(model_name)


    # Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    #model.eval()

    max_seq_len = 512
    processed_sequences = []
    for seq in protein_sequences:
        seq_truncated = seq[:max_seq_len] # Apply truncation here
        processed_sequences.append(" ".join(list(seq_truncated)))

    batch_size = 8
    all_embeddings =[]
    num_Batches = math.ceil(NUM_Protein/batch_size)
    print(num_Batches)
    # Iterate through sequences in batches
    for i in tqdm(range(0, len(processed_sequences), batch_size), desc="Processing Batches"):
         #print(i)
         batch_sequences = processed_sequences[i:i + batch_size]
         # Tokenize the entire batch
         # padding='longest': Pads to the length of the longest sequence in the current batch.
         # truncation=True: Truncates sequences if they exceed the tokenizer's max input length (or max_seq_len if specified).
         inputs = tokenizer(batch_sequences,return_tensors="pt",padding='longest', truncation=True,
                            max_length=max_seq_len).to(device)

         with torch.no_grad():
           outputs = model(**inputs)
           # Get the last hidden states (batch_size, sequence_length, hidden_size)
           batch_embeddings = outputs.last_hidden_state
           #print("Protein embeddings size =", batch_embeddings.shape)

         # Move embeddings to CPU and append to the list
         # Important: Only move to CPU if you don't need them on GPU for subsequent steps.
         # Moving to CPU frees up GPU memory.
         all_embeddings.append(batch_embeddings.cpu())

    # Concatenate all batch embeddings into a single tensor
    # This will result in a tensor of shape (total_sequences, max_length_in_batch, hidden_size)
    # where max_length_in_batch is the maximum length of a sequence in *any* batch,
    # because we padded 'longest' within each batch. If you need consistent length across ALL batches,
    # you'd need to pad to `max_seq_len` for every batch.
    final_embeddings_tensor = torch.cat(all_embeddings, dim=0)

    print(f"\nTotal protein embeddings tensor shape: {final_embeddings_tensor.shape}")
    print(f"Example of the first embedding's shape: {all_embeddings[0].shape}")
    return final_embeddings_tensor

    #---------------------------------------------------------------------------
    '''
    # Prepare the sequences by adding spaces between amino acids as required by ProtBert tokenizer
    sequences_with_spaces = [" ".join(list(seq)) for seq in protein_sequences]

    # Tokenize sequences (pads & truncates automatically)
    inputs = tokenizer(sequences_with_spaces, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the last hidden state (embeddings)
    embeddings = outputs.last_hidden_state '''


Protein_full_embeddings = protein_embedding(Protein_AA)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


56


Processing Batches:   0%|          | 0/56 [00:00<?, ?it/s]


Total protein embeddings tensor shape: torch.Size([442, 512, 1024])
Example of the first embedding's shape: torch.Size([8, 512, 1024])


In [6]:
def drug_embedding(Drug_Smiles):
    tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-10M-MLM")
    model = AutoModel.from_pretrained("DeepChem/ChemBERTa-10M-MLM")
    #move to gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    #tokenizer
    inputs = tokenizer(Drug_Smiles, return_tensors="pt", padding=True, truncation=True).to(model.device)

    with torch.no_grad():
      outputs = model(**inputs)
      #print(outputs)

    embeddings = outputs.last_hidden_state # Or pooler_output depending on your needs
    print("Drug embeding size=",embeddings.shape) # Should be (batch_size, sequence_length, hidden_size)
    return embeddings

Drug_full_embeddings = drug_embedding(Drug_Smiles)



Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-10M-MLM and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Drug embeding size= torch.Size([72, 89, 384])


In [7]:
# --- 1. Read Data ---
csv_file_name ='Davis.csv'
raw_df = pd.read_csv(csv_file_name, header=None)
#print(f"\nLoaded raw CSV with shape: {raw_df.shape}")
#print("Raw CSV head:")
#print(raw_df.head())

# Extract drug CIDs from the first row (starting from the 5th column, index 4)
# Assuming the first 4 columns of the first row are empty or irrelevant for drug CIDs
drug_cids = raw_df.iloc[0, 4:].astype(str).tolist() # Convert to string to handle potential mixed types

# Extract protein identifiers from the 4th column (index 3) of rows 2 onwards (index 2 in 0-based)
# Assuming the combined ID is in the 4th column
protein_identifiers = raw_df.iloc[3:, 3].astype(str).tolist()
# Extract binding data (values start from row 2, column 4)
binding_data_matrix = raw_df.iloc[3:, 4:]

#print("CID=",drug_cids,'\n',"protein names=",protein_identifiers,'\n',binding_data_matrix)

# Convert wide format to long format
long_data = []
for p_idx, p_id in enumerate(protein_identifiers):
    for d_idx, d_cid in enumerate(drug_cids):
        binding_value = binding_data_matrix.iloc[p_idx, d_idx]
        if pd.notna(binding_value): # Only include non-NaN binding values
            long_data.append({
                'drug_id': d_cid,
                'protein_name': p_id, # Using the combined protein identifier as 'protein_name'
                'binding_db': float(binding_value)
            })
        else :
            long_data.append({
                'drug_id': d_cid,
                'protein_name': p_id, # Using the combined protein identifier as 'protein_name'
                'binding_db': float(10000)
            })

#print (long_data)
df_interactions = pd.DataFrame(long_data)
print(f"\nTransformed to long format DataFrame with {len(df_interactions)} interactions.")
print("First 5 rows of transformed interaction data:")
print(df_interactions.head())



Transformed to long format DataFrame with 31824 interactions.
First 5 rows of transformed interaction data:
    drug_id          protein_name  binding_db
0  11314340  AAA61480.1|CLK1|CLK1         1.4
1  10074640  AAA61480.1|CLK1|CLK1     10000.0
2  11485656  AAA61480.1|CLK1|CLK1      8900.0
3  24889392  AAA61480.1|CLK1|CLK1     10000.0
4   6450551  AAA61480.1|CLK1|CLK1     10000.0


In [8]:
# --- 2. Create Mappings from IDs/Names to Embedding Indices ---
# IMPORTANT ASSUMPTION: The order of drug CIDs extracted from the CSV
# matches the order of drugs in drug_full_embeddings (index 0 to 71).
# Similarly for protein identifiers and protein_full_embeddings.

drug_id_to_idx = {cid: i for i, cid in enumerate(drug_cids)}
protein_name_to_idx = {p_id: i for i, p_id in enumerate(protein_identifiers)}

# Map drug_id and protein_name in DataFrame to their respective indices
df_interactions['drug_idx'] = df_interactions['drug_id'].map(drug_id_to_idx)
df_interactions['protein_idx'] = df_interactions['protein_name'].map(protein_name_to_idx)

# Drop rows where mapping failed (if any drug_id or protein_name not found in embeddings)
df_interactions.dropna(subset=['drug_idx', 'protein_idx'], inplace=True)
df_interactions['drug_idx'] = df_interactions['drug_idx'].astype(int)
df_interactions['protein_idx'] = df_interactions['protein_idx'].astype(int)

print(f"\nAfter mapping, {len(df_interactions)} valid interactions remain.")
print("First 5 rows with mapped indices:")
print(df_interactions.head())


After mapping, 31824 valid interactions remain.
First 5 rows with mapped indices:
    drug_id          protein_name  binding_db  drug_idx  protein_idx
0  11314340  AAA61480.1|CLK1|CLK1         1.4         0            0
1  10074640  AAA61480.1|CLK1|CLK1     10000.0         1            0
2  11485656  AAA61480.1|CLK1|CLK1      8900.0         2            0
3  24889392  AAA61480.1|CLK1|CLK1     10000.0         3            0
4   6450551  AAA61480.1|CLK1|CLK1     10000.0         4            0


In [9]:
# --- 3. No Embedding Pooling for 2D CNN Input ---
# We will directly use the full 2D embedding matrices for drugs and proteins.
# Drug_full_embeddings: (num_drugs, drug_seq_len, drug_embed_dim) -> (72, 89, 384)
# Protein_full_embeddings: (num_proteins, protein_seq_len, protein_embed_dim) -> (442, 512, 1024)

print(f"\nUsing full 2D drug embeddings shape: {Drug_full_embeddings.shape}")
print(f"Using full 2D protein embeddings shape: {Protein_full_embeddings.shape}")



Using full 2D drug embeddings shape: torch.Size([72, 89, 384])
Using full 2D protein embeddings shape: torch.Size([442, 512, 1024])


In [10]:
# --- 4. Define a PyTorch Dataset Class ---
class DrugProteinDataset(Dataset):
    def __init__(self, df, drug_full_embeddings, protein_full_embeddings):
        """
        Args:
            df (pd.DataFrame): DataFrame containing 'drug_idx', 'protein_idx', and 'binding_db'.
            drug_full_embeddings (torch.Tensor): Full 2D drug embeddings (num_drugs, seq_len, embed_dim).
            protein_full_embeddings (torch.Tensor): Full 2D protein embeddings (num_proteins, seq_len, embed_dim).
        """
        self.df = df
        self.drug_embeddings_full = drug_full_embeddings
        self.protein_embeddings_full = protein_full_embeddings

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Get the row for the current index
        row = self.df.iloc[idx]

        # Get drug and protein indices
        drug_idx = row['drug_idx']
        protein_idx = row['protein_idx']

        # Retrieve the full 2D embedding matrices
        drug_embed_matrix = self.drug_embeddings_full[drug_idx]
        protein_embed_matrix = self.protein_embeddings_full[protein_idx]

        # Get the binding affinity value
        binding_affinity = torch.tensor(row['binding_db'], dtype=torch.float32)

        # For 2D CNN, you typically need an input channel dimension.
        # If your embeddings are already (Height, Width), a CNN expects (Channels, Height, Width).
        # We add a channel dimension of 1 if treating each as a single-channel image.
        drug_embed_matrix = drug_embed_matrix.unsqueeze(0) # Adds a channel dimension: (1, 89, 384)
        protein_embed_matrix = protein_embed_matrix.unsqueeze(0) # Adds a channel dimension: (1, 512, 1024)

        return drug_embed_matrix, protein_embed_matrix, binding_affinity


DTI_Dataset = DrugProteinDataset(df_interactions, Drug_full_embeddings, Protein_full_embeddings)
print(f"DTI Dataset have :{len(DTI_Dataset)} samples")
print(f"Example drug embedding matrix shape: {DTI_Dataset[0][0].shape}") # (1, 89, 384)
print(f"Example protein embedding matrix shape: {DTI_Dataset[0][1].shape}") # (1, 512, 1024)
print(f"Example binding affinity: {DTI_Dataset[0][2]}")

DTI Dataset have :31824 samples
Example drug embedding matrix shape: torch.Size([1, 89, 384])
Example protein embedding matrix shape: torch.Size([1, 512, 1024])
Example binding affinity: 1.399999976158142


In [11]:
# --- 5. Split into Train and Test Sets ---
train_size = int(0.8 * len(DTI_Dataset)) # 80% for training
test_size = len(DTI_Dataset) - train_size # Remaining for testing

train_dataset, test_dataset = random_split(DTI_Dataset, [train_size, test_size])

print(f"\nTrain set size: {len(train_dataset)} samples")
print(f"Test set size: {len(test_dataset)} samples")


# --- 6. Create PyTorch DataLoaders ---
batch_size_loader = 8 # Adjust based on your GPU memory. Keep it smaller for larger inputs.

train_loader = DataLoader(train_dataset, batch_size=batch_size_loader, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size_loader, shuffle=False)

print(f"\nTrain DataLoader created with batch size {batch_size_loader}.")
print(f"Test DataLoader created with batch size {batch_size_loader}.")



Train set size: 25459 samples
Test set size: 6365 samples

Train DataLoader created with batch size 8.
Test DataLoader created with batch size 8.


In [12]:
# --- 7. Define the 2D CNN Model in PyTorch ---
class DrugProteinCNN(nn.Module):
  def __init__(self,drug_input_shape, protein_input_shape):
    super(DrugProteinCNN, self).__init__()

    # Drug Branch CNN
    # Input shape: (Batch_size, 1, 89, 384)
    self.drug_cnn = nn.Sequential(
        nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3, 3), padding='same'),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=(2, 2)), # Output: (Batch, 32, 44, 192) approx
        nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3, 3), padding='same'),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=(2, 2)) # Output: (Batch, 64, 22, 96) approx
        )
    # Calculate the flattened size after drug CNN
    # We need to pass a dummy tensor to determine the output size
    dummy_drug_input = torch.randn(1, *drug_input_shape) # (1, 1, 89, 384)
    dummy_drug_output = self.drug_cnn(dummy_drug_input)
    self.drug_flatten_size = dummy_drug_output.shape[1] * dummy_drug_output.shape[2] * dummy_drug_output.shape[3]
    print(f"Drug CNN flattened size: {self.drug_flatten_size}")

    # Protein Branch CNN
    # Input shape: (Batch_size, 1, 512, 1024)
    self.protein_cnn = nn.Sequential(
        nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(5, 5), padding='same'),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=(2, 2)), # Output: (Batch, 32, 256, 512) approx
        nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 3), padding='same'),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=(2, 2)), # Output: (Batch, 64, 128, 256) approx
        #nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3), padding='same'),
        #nn.ReLU(),
        #nn.MaxPool2d(kernel_size=(2, 2)) # Output: (Batch, 256, 64, 128) approx
        )
    # Calculate the flattened size after protein CNN
    dummy_protein_input = torch.randn(1, *protein_input_shape) # (1, 1, 512, 1024)
    dummy_protein_output = self.protein_cnn(dummy_protein_input)
    self.protein_flatten_size = dummy_protein_output.shape[1] * dummy_protein_output.shape[2] * dummy_protein_output.shape[3]
    print(f"Protein CNN flattened size: {self.protein_flatten_size}")

    # Fully Connected Layers for combined features
    self.fc_combined = nn.Sequential(
        nn.Linear(self.drug_flatten_size + self.protein_flatten_size, 256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(128, 1) # Output a single binding affinity value
     )

  def forward(self, drug_matrices, protein_matrices):
      # Process drug matrices
      drug_features = self.drug_cnn(drug_matrices)
      drug_features = torch.flatten(drug_features, 1) # Flatten for FC layers

      # Process protein matrices
      protein_features = self.protein_cnn(protein_matrices)
      protein_features = torch.flatten(protein_features, 1) # Flatten for FC layers

      # Concatenate features from both branches
      combined_features = torch.cat((drug_features, protein_features), dim=1)

      # Pass through fully connected layers
      output = self.fc_combined(combined_features)
      return output




In [14]:
sample_drug_matrix, sample_protein_matrix, _ = DTI_Dataset[0]
drug_input_shape = sample_drug_matrix.shape # (1, 89, 384)
protein_input_shape = sample_protein_matrix.shape # (1, 512, 1024)

model = DrugProteinCNN(drug_input_shape, protein_input_shape)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"\nModel instantiated and moved to {device}.")
print(model)

Drug CNN flattened size: 135168
Protein CNN flattened size: 1048576

Model instantiated and moved to cuda.
DrugProteinCNN(
  (drug_cnn): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (4): ReLU()
    (5): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (protein_cnn): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=same)
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (4): ReLU()
    (5): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (fc_combined): Sequential(
    (0): Linear(in_features=1183744, out_features=256, 

In [1]:
# --- 8. Training Loop ---
# Loss function for regression (Mean Squared Error)
criterion = nn.MSELoss()
# Optimizer (Adam is a good general choice)
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 2

print("\n--- Starting Training ---")
for epoch in range(num_epochs):
    model.train() # Set model to training mode
    running_loss = 0.0
    for drug_matrices, protein_matrices, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} Training"):
        # Move data to the same device as the model
        drug_matrices = drug_matrices.to(device)
        protein_matrices = protein_matrices.to(device)
        targets = targets.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(drug_matrices, protein_matrices)

        # Calculate loss
        loss = criterion(outputs.squeeze(), targets) # .squeeze() to remove singleton dimension from output

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * drug_matrices.size(0) # Accumulate loss

    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}")


NameError: name 'nn' is not defined