In [None]:
import pandas as pd

# Load dataset
N_SAMPLES = 50            # small subset for fast training
SEED = 42                 # fixed seed so the same subset is drawn on every run
POSITIVE_MIN_RATING = 7   # rating >= 7 is treated as positive sentiment

df = pd.read_csv("drugLibTrain_raw.tsv", sep="\t")

# Drop incomplete rows BEFORE sampling. Sampling first and dropping afterwards
# can silently shrink the subset below N_SAMPLES.
# Use 'benefitsReview' and 'sideEffectsReview' as reviews and 'rating' for sentiment
df = df.dropna(subset=["benefitsReview", "sideEffectsReview", "rating"])

# Cap the sample size: DataFrame.sample raises ValueError when n exceeds the
# number of available rows (replace=False is the default).
df = df.sample(n=min(N_SAMPLES, len(df)), random_state=SEED)

# Combine the review columns into a single 'review' column.
# No fillna needed: rows with a missing review were already dropped above.
df['review'] = df['benefitsReview'] + ' ' + df['sideEffectsReview']

# Binary sentiment: rating ≥ 7 is positive (1), else negative (0).
# Vectorized comparison instead of a row-wise apply.
df["label"] = (df["rating"] >= POSITIVE_MIN_RATING).astype(int)

# Take text and labels as plain Python lists for the tokenizer / dataset
texts = df["review"].tolist()
labels = df["label"].tolist()

In [None]:
from transformers import DistilBertTokenizerFast

# Fetch the tokenizer that matches the pretrained uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Encode dataset: tokenize every review with truncation and padding enabled,
# capping each sequence at 256 tokens.
encode_options = {"truncation": True, "padding": True, "max_length": 256}
encodings = tokenizer(texts, **encode_options)


In [None]:
import torch

class DrugReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-example sequence,
    and `labels` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels; nothing is copied or converted here."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with the label under "labels"."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Train/test split
from sklearn.model_selection import train_test_split

# Fix the seed so the same examples land in train and test on every run.
# Without random_state the split (and every reported metric) changes per run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Tokenize each split separately so padding is computed per split.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=256)

# Wrap encodings and labels in Dataset objects for the training cell.
train_dataset = DrugReviewDataset(train_encodings, train_labels)
test_dataset = DrugReviewDataset(test_encodings, test_labels)


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score

NUM_EPOCHS = 3
TRAIN_BATCH_SIZE = 8
MAX_WARMUP_STEPS = 50

# Binary classifier head on top of the pretrained DistilBERT encoder.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# A fixed 50-step warmup is longer than the whole run on a small subset
# (e.g. 40 examples / batch 8 * 3 epochs = 15 steps), so the learning rate
# would never reach its target. Scale warmup to ~10% of the estimated total,
# keeping 50 as the upper bound for large datasets.
# NOTE(review): estimate assumes one device and no gradient accumulation.
steps_per_epoch = -(-len(train_dataset) // TRAIN_BATCH_SIZE)  # ceiling division
total_steps = steps_per_epoch * NUM_EPOCHS
warmup_steps = min(MAX_WARMUP_STEPS, max(1, total_steps // 10))


def compute_metrics(eval_pred):
    """Return accuracy and F1 computed from the evaluation predictions.

    `eval_pred.predictions` holds the raw logits and `eval_pred.label_ids`
    the true labels; the predicted class is the argmax over the last axis.
    """
    predicted = eval_pred.predictions.argmax(-1)
    return {
        "accuracy": (predicted == eval_pred.label_ids).mean(),
        # zero_division=0 keeps F1 at 0.0 without a warning when the model
        # predicts no positives, which is likely on a tiny evaluation set.
        "f1": f1_score(eval_pred.label_ids, predicted, zero_division=0),
    }


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=16,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

In [None]:
# Write the model first, then the tokenizer, into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("sentiment_model")

In [None]:
import streamlit as st
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch.nn.functional as F

# Load model & tokenizer
MODEL_DIR = "sentiment_model"
MAX_LENGTH = 256  # matches the max_length used to tokenize the training data


@st.cache_resource
def _load_model_and_tokenizer(model_dir):
    """Load the fine-tuned model and its tokenizer from `model_dir`.

    Cached with st.cache_resource because Streamlit re-runs the script on
    each interaction; without it the weights are re-read from disk every time.
    """
    loaded_model = DistilBertForSequenceClassification.from_pretrained(model_dir)
    loaded_model.eval()  # inference only: make sure dropout is disabled
    loaded_tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir)
    return loaded_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer(MODEL_DIR)

st.title(" Drug Review Sentiment Analyzer")

user_input = st.text_area("Enter a drug review:")

if st.button("Predict"):
    if not user_input.strip():
        # Don't send an empty string to the model; ask for input instead.
        st.warning("Please enter a review before predicting.")
    else:
        inputs = tokenizer(
            user_input,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        with torch.no_grad():
            outputs = model(**inputs)

        # Single input, so take row 0 of the (1, num_labels) probabilities.
        probs = F.softmax(outputs.logits, dim=1)[0]
        label = int(torch.argmax(probs).item())
        confidence = probs[label].item()
        sentiment = "Positive" if label == 1 else "Negative"

        st.markdown(f"### Sentiment: **{sentiment}**")
        st.markdown(f"Confidence: `{confidence:.2f}`")
