In [3]:
import numpy as np
import pandas as pd
import torch
import time
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,
                          TrainerCallback, EarlyStoppingCallback, AdamW)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.nn import CrossEntropyLoss

In [4]:
# Custom Callback for Epoch Timing
class EpochTimeCallback(TrainerCallback):
    """ Custom callback to print epoch training status with time tracking. """
    def __init__(self):
        self.epoch_start_time = None

    def on_epoch_begin(self, args, state, control, **kwargs):
        self.epoch_start_time = time.time()
        print(f"\n🟢 Epoch {int(state.epoch) + 1}/{args.num_train_epochs} started...")

    def on_epoch_end(self, args, state, control, **kwargs):
        elapsed_time = time.time() - self.epoch_start_time
        print(f"✅ Epoch {int(state.epoch) + 1} completed in {elapsed_time:.2f} seconds.")

In [5]:
# Load Data
print("🚀 Loading data...")
df = pd.read_csv('../input/kuc-hackathon-winter-2018/drugsComTrain_raw.csv')
test = pd.read_csv('../input/kuc-hackathon-winter-2018/drugsComTest_raw.csv')
data = pd.concat([df, test])
print("✅ Data loaded successfully!")

# Label sentiment based on ratings
print("📝 Labeling sentiment...")
data.loc[data['rating'] >= 5, 'Review_Sentiment'] = 1
data.loc[data['rating'] < 5, 'Review_Sentiment'] = 0


🚀 Loading data...
✅ Data loaded successfully!
📝 Labeling sentiment...


In [6]:
# Handle missing values
data = data.dropna(subset=['review'])
print(f"🔍 Data cleaned! Remaining samples: {len(data)}")

# Train-test split
print("✂ Splitting data into training and validation sets...")
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['review'].tolist(), data['Review_Sentiment'].tolist(), test_size=0.2, random_state=42, stratify=data['Review_Sentiment']
)
print(f"✅ Data split: {len(train_texts)} training samples, {len(val_texts)} validation samples.")

# Load BioBERT tokenizer
print("🔄 Loading BioBERT tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
print("✅ Tokenizer loaded!")

🔍 Data cleaned! Remaining samples: 215063
✂ Splitting data into training and validation sets...
✅ Data split: 172050 training samples, 43013 validation samples.
🔄 Loading BioBERT tokenizer...


vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

✅ Tokenizer loaded!


In [7]:
# Tokenize data
print("⌛ Tokenizing training and validation data...")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)  # Reduced max_length for efficiency
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256)
print("✅ Tokenization complete!")

⌛ Tokenizing training and validation data...
✅ Tokenization complete!


In [8]:
# Convert to PyTorch Dataset
class DrugReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)  # Fix: Use long dtype for CrossEntropyLoss
        return item

In [9]:
print("📦 Creating dataset class...")
train_dataset = DrugReviewDataset(train_encodings, train_labels)
val_dataset = DrugReviewDataset(val_encodings, val_labels)
print(f"📊 Training dataset: {len(train_dataset)} samples, Validation dataset: {len(val_dataset)} samples.")

# Load BioBERT model
print("🧠 Loading BioBERT model for sequence classification...")
model = AutoModelForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=2)
print("✅ Model loaded!")

# Compute class weights for imbalanced dataset
print("⚖ Computing class weights...")
class_counts = np.bincount(train_labels)
weights = torch.tensor([1.0 / class_counts[0], 1.0 / class_counts[1]], dtype=torch.float32).to(model.device)
loss_fn = CrossEntropyLoss(weight=weights)

📦 Creating dataset class...
📊 Training dataset: 172050 samples, Validation dataset: 43013 samples.
🧠 Loading BioBERT model for sequence classification...


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded!
⚖ Computing class weights...


In [10]:
# Use AdamW optimizer with custom learning rate

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)  # Lower LR and weight decay for better generalization



In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)  # Convert logits to class predictions
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [12]:
# Define training arguments
print("⚙ Setting up training arguments...")
training_args = TrainingArguments(
    output_dir="./biobert_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,  # Keep only 3 best checkpoints
    load_best_model_at_end=True,
    per_device_train_batch_size=16,  # Increased batch size
    per_device_eval_batch_size=16,
    num_train_epochs=5,  # Increased epochs from 3 to 5
    gradient_accumulation_steps=2,  # Helps stabilize training
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Disable W&B logs
    fp16=True,  # Enables mixed-precision training
    weight_decay=0.01,
    learning_rate=2e-5,
    warmup_ratio=0.1,  # Helps with convergence
    metric_for_best_model="accuracy"
)
print("✅ Training arguments set!")

⚙ Setting up training arguments...
✅ Training arguments set!




In [13]:
# Define Trainer with Early Stopping
print("📢 Initializing Trainer with live tracking...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
    callbacks=[EpochTimeCallback(), EarlyStoppingCallback(early_stopping_patience=2)]  # Early stopping prevents overfitting
)
print("✅ Trainer initialized!")

📢 Initializing Trainer with live tracking...
✅ Trainer initialized!


In [14]:
# Train Model
print("🚀 Training started...")
trainer.train()
print("🎉 Training complete!")

# Evaluate Model
print("📊 Evaluating model performance on validation set...")
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)
accuracy = accuracy_score(val_labels, preds)

print(f"✅ Validation Accuracy: {accuracy:.4f}")
print("📋 Classification Report:")
print(classification_report(val_labels, preds))

🚀 Training started...

🟢 Epoch 1/5 started...


Epoch,Training Loss,Validation Loss


✅ Epoch 2 completed in 3957.28 seconds.


NameError: name 'precision_recall_fscore_support' is not defined

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Generate confusion matrix
conf_matrix = confusion_matrix(val_labels, preds)

# Plot confusion matrix
plt.figure(figsize=(6,5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

# Compute ROC curve and AUC
fpr, tpr, _ = roc_curve(val_labels, preds)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, color="blue", lw=2, label=f"ROC Curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="gray", linestyle="--")  # Diagonal line for reference
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()


NameError: name 'preds' is not defined

In [16]:
# Save the trained BioBERT model
model.save_pretrained("./saved_biobert_model")
tokenizer.save_pretrained("./saved_biobert_model")

print("✅ Model and tokenizer saved successfully!")


✅ Model and tokenizer saved successfully!
