In [57]:
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import json

# Set the device to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set a random seed for reproducibility
seed_val = 17
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
np.random.seed(seed_val)


In [58]:
device

device(type='cpu')

In [81]:



# Load your dataset
with open(r'C:\Users\8897p\OneDrive\Desktop\AI\IdeaCompilation\data_split_9.json', encoding='utf-8') as f:
    data = json.load(f)
    # Assuming 'data' is a dictionary that includes a key 'data' which is a list of dictionaries
    df = pd.DataFrame(data['data'])



In [82]:
df.head()


Unnamed: 0,toxicity,snp-adverse-drug-reactions,average-mass,transporters,pharmacodynamics,msds,description,enzymes,type,cas-number,...,monoisotopic-mass,volume-of-distribution,route-of-elimination,name,metabolism,reactions,indication,ahfs-codes,updated,drugbank-id
0,Highest dose tested was 360mg/kg/day in rats r...,,,,Hereditary antithrombin deficiency causes an i...,,Antithrombin Alfa is a recombinant antithrombi...,,biotech,84720-88-7,...,,Dose of:\r\n50IU/kg: 126.2 ml/kg\r\n100IU/kg: ...,Irreversible complexes formed between antithro...,Antithrombin Alfa,Not metabolized.,,Antithrombin alfa is a recombinant antithrombi...,,2021-02-21,DB11166
1,Calcium L-threonate has low oral acute toxicit...,,310.268,,"In a preclinical study, calcium L-theronate in...",//s3-us-west-2.amazonaws.com/drugbank/msds/DB1...,Calcium threonate is a calcium salt of threnoi...,,small molecule,70753-61-6,...,310.021288,The apparent total volume of distribution foll...,The presence of threonic acid in human urine h...,Calcium threonate,,,No approved therapeutic indications.,,2020-06-12,DB11168
2,,,,,,,Ferrous asparto glycinate is an iron-amino aci...,,small molecule,,...,,,,Ferrous asparto glycinate,,,,,2021-09-28,DB11169
3,Ferric sulfate has been proven to be an irrita...,,489.93,,The administration of ferric sulfate as a derm...,,Ferric sulfate has the molecular formula of Fe...,,small molecule,10028-22-5,...,489.777884,Pharmacokinetic studies related to the volume ...,Pharmacokinetic studies related to the elimina...,Ferric sulfate,Pharmacokinetic studies related to the metabol...,,Ferric sulfate was first used in dermatology a...,,2021-02-21,DB11171
4,,,,,,,Sesame oil is a commonly used vegetable oil in...,,biotech,8008-74-0,...,,,,Sesame oil,,,,,2021-02-21,DB11172


In [60]:

# Preprocess your data: Map 'drug-interactions' to a binary variable (1 if there are interactions listed, 0 otherwise)
df['Interaction'] = df['drug-interactions'].apply(lambda x: 0 if x is None else 1)

# Tokenize drug names using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)



In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 41 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   toxicity                    218 non-null    object 
 1   snp-adverse-drug-reactions  0 non-null      object 
 2   average-mass                981 non-null    float64
 3   transporters                83 non-null     object 
 4   pharmacodynamics            245 non-null    object 
 5   msds                        119 non-null    object 
 6   description                 1116 non-null   object 
 7   enzymes                     144 non-null    object 
 8   type                        1196 non-null   object 
 9   cas-number                  1151 non-null   object 
 10  sequences                   23 non-null     object 
 11  targets                     270 non-null    object 
 12  unii                        1187 non-null   object 
 13  protein-binding             180 n

In [61]:
df.head()

Unnamed: 0,toxicity,snp-adverse-drug-reactions,average-mass,transporters,pharmacodynamics,msds,description,enzymes,type,cas-number,...,volume-of-distribution,route-of-elimination,name,metabolism,reactions,indication,ahfs-codes,updated,drugbank-id,Interaction
0,Highest dose tested was 360mg/kg/day in rats r...,,,,Hereditary antithrombin deficiency causes an i...,,Antithrombin Alfa is a recombinant antithrombi...,,biotech,84720-88-7,...,Dose of:\r\n50IU/kg: 126.2 ml/kg\r\n100IU/kg: ...,Irreversible complexes formed between antithro...,Antithrombin Alfa,Not metabolized.,,Antithrombin alfa is a recombinant antithrombi...,,2021-02-21,DB11166,1
1,Calcium L-threonate has low oral acute toxicit...,,310.268,,"In a preclinical study, calcium L-theronate in...",//s3-us-west-2.amazonaws.com/drugbank/msds/DB1...,Calcium threonate is a calcium salt of threnoi...,,small molecule,70753-61-6,...,The apparent total volume of distribution foll...,The presence of threonic acid in human urine h...,Calcium threonate,,,No approved therapeutic indications.,,2020-06-12,DB11168,0
2,,,,,,,Ferrous asparto glycinate is an iron-amino aci...,,small molecule,,...,,,Ferrous asparto glycinate,,,,,2021-09-28,DB11169,0
3,Ferric sulfate has been proven to be an irrita...,,489.93,,The administration of ferric sulfate as a derm...,,Ferric sulfate has the molecular formula of Fe...,,small molecule,10028-22-5,...,Pharmacokinetic studies related to the volume ...,Pharmacokinetic studies related to the elimina...,Ferric sulfate,Pharmacokinetic studies related to the metabol...,,Ferric sulfate was first used in dermatology a...,,2021-02-21,DB11171,1
4,,,,,,,Sesame oil is a commonly used vegetable oil in...,,biotech,8008-74-0,...,,,Sesame oil,,,,,2021-02-21,DB11172,0


In [62]:
# Check for any NaN values in the 'name' column
print(df['name'].isna().sum())

4


In [63]:
df['name'].fillna('unknown', inplace=True)
print(df['name'].isna().sum())

0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['name'].fillna('unknown', inplace=True)


In [64]:
# Tokenize all drug names in the DataFrame
input_ids = []
attention_masks = []

# Encode each row in the DataFrame
for _, row in df.iterrows():
    encoded_dict = tokenizer.encode_plus(
                        row['name'],                      # Text to encode.
                        add_special_tokens=True,          # Add '[CLS]' and '[SEP]'
                        max_length=64,                    # Pad & truncate all sentences.
                        padding='max_length',             # Pad to max_length
                        truncation=True,                  # Truncate to max_length
                        return_attention_mask=True,       # Construct attn. masks.
                        return_tensors='pt',              # Return PyTorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])


In [65]:


# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(df['Interaction'].values)

# Use train_test_split to split our data into train and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=seed_val, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=seed_val, test_size=0.1)

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# Create the DataLoader for our validation set
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)

In [21]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [83]:


# Set up the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
total_steps = len(train_dataloader) * 5 # Assuming 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [84]:


# Define the accuracy measure function
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop
epochs = 5
for epoch in range(epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')
    model.train()

    total_train_loss = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print('Average training loss: {0:.2f}'.format(avg_train_loss))
    print('Running Validation...')
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_eval_loss += loss.item()
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('Accuracy: {0:.2f}'.format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print('Validation Loss: {0:.2f}'.format(avg_val_loss))

print('Training complete!')


Training...
Average training loss: 0.31
Running Validation...
Accuracy: 0.64
Validation Loss: 0.93
Training...
Average training loss: 0.25
Running Validation...
Accuracy: 0.64
Validation Loss: 1.29
Training...
Average training loss: 0.16
Running Validation...
Accuracy: 0.64
Validation Loss: 1.18
Training...
Average training loss: 0.10
Running Validation...
Accuracy: 0.65
Validation Loss: 1.38
Training...
Average training loss: 0.12
Running Validation...
Accuracy: 0.66
Validation Loss: 1.46
Training complete!


In [72]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Function to evaluate the model on the validation set
def evaluate(model, validation_dataloader):
    model.eval()  # Put the model in evaluation mode
    predictions, true_labels = [], []

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    # Flatten the lists
    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)

    # Convert logits to predicted labels
    pred_labels = np.argmax(predictions, axis=1)

    # Calculate precision, recall, and F1 score
    precision = precision_score(true_labels, pred_labels, zero_division=0)
    recall = recall_score(true_labels, pred_labels, zero_division=0)
    f1 = f1_score(true_labels, pred_labels, zero_division=0)


    return precision, recall, f1

# Calculate evaluation metrics
precision, recall, f1 = evaluate(model, validation_dataloader)
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))



Precision: 0.5536
Recall: 0.6078
F1 Score: 0.5794


In [78]:
def predict_interaction(model, drug_name, tokenizer):
    model.eval()  # Put the model in evaluation mode

    # Encode the drug name
    encoded_dict = tokenizer.encode_plus(
        drug_name,                      # Drug name to encode.
        add_special_tokens=True,        # Add '[CLS]' and '[SEP]'
        max_length=64,                  # Pad & truncate all names.
        pad_to_max_length=True,
        return_attention_mask=True,     # Construct attention masks.
        return_tensors='pt',            # Return pytorch tensors.
    )

    # Move tensors to the appropriate device
    input_id = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    # Make prediction
    with torch.no_grad():
        outputs = model(input_id, token_type_ids=None, attention_mask=attention_mask)

    logits = outputs.logits.detach().cpu().numpy()
    predicted_label = np.argmax(logits, axis=1)[0]

    return 'Has Interaction' if predicted_label == 1 else 'No Interaction'

# Example usage
drug_name = "Flurbiprofen axetil"
prediction = predict_interaction(model, drug_name, tokenizer)
print(f"Prediction for {drug_name}: {prediction}")


Prediction for Flurbiprofen axetil: No Interaction




In [77]:
import gradio as gr
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Function to predict the interaction between two drugs
def predict_interaction(drug1, drug2):
    # Load the model and tokenizer
    model = BertForSequenceClassification.from_pretrained("/path/to/your/saved/model")
    tokenizer = BertTokenizer.from_pretrained("/path/to/your/saved/model")

    # Prepare the inputs for the model
    encoded_drug_pair = tokenizer.encode_plus(
        text=drug1,
        text_pair=drug2,
        add_special_tokens=True,
        max_length=256,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoded_drug_pair['input_ids']
    attention_mask = encoded_drug_pair['attention_mask']

    # Make the prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction = outputs[0].argmax().item()

    # Map the prediction to the corresponding label
    label_mapping = {0: "No Interaction", 1: "Interaction"}
    return label_mapping[prediction]

# Create the Gradio interface
iface = gr.Interface(
    fn=predict_interaction,
    inputs=[gr.inputs.Textbox(label="Drug 1"), gr.inputs.Textbox(label="Drug 2")],
    outputs=gr.outputs.Textbox(label="Prediction")
)

# Launch the app
iface.launch()


AttributeError: module 'gradio' has no attribute 'inputs'