In [20]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score, recall_score, precision_score
from sklearn.metrics import accuracy_score
import torch.nn as nn


# Read the cleaned dataset from CSV

In [21]:
try:
    reddit_data = pd.read_csv("processed_reddit_data.csv")  # Adjust path if necessary
    # Ensure necessary columns are present
    if "cleaned_title" not in reddit_data.columns:
        reddit_data["cleaned_title"] = reddit_data["title"].apply(preprocess_text)
    if "score" not in reddit_data.columns:
        raise ValueError("Dataset must contain a 'score' column for sentiment assignment.")
except FileNotFoundError:
    print("Error: File 'cleaned_reddit_data.csv' not found. Ensure the dataset is available.")
    raise

# Assign sentiment based on the score

In [22]:
reddit_data["sentiment"] = reddit_data["score"].apply(lambda x: 1 if x > 0 else 0)

#Split data into training and validation sets

In [23]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    reddit_data["cleaned_title"], reddit_data["sentiment"], test_size=0.2, random_state=42
)

#Define the dataset class

In [24]:
class RedditDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }


#Load tokenizer and model

In [25]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Prepare DataLoader

In [27]:
train_dataset = RedditDataset(list(train_texts), list(train_labels))
val_dataset = RedditDataset(list(val_texts), list(val_labels))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

#Define optimizer and training loop

In [28]:
optimizer = AdamW(model.parameters(), lr=5e-5)



#Train the model

In [29]:
def train(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in data_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    return total_loss / len(data_loader), correct / total

In [30]:
def evaluate_model_with_metrics(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Generate evaluation metrics
    conf_matrix = confusion_matrix(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    recall = recall_score(all_labels, all_preds, average="weighted")
    precision = precision_score(all_labels, all_preds, average="weighted")
    report = classification_report(all_labels, all_preds, target_names=["positive", "negative"], digits=4)

    return conf_matrix, report, f1, recall, precision


In [31]:
for epoch in range(10):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    conf_matrix, report, f1, recall, precision = evaluate_model_with_metrics(model, val_loader, device)

    print(f"Epoch {epoch+1}:")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
    print("Validation Metrics:")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(report)
    print(f"F1 Score: {f1:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}")
    print("-" * 60)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1:
Train Loss: 0.5323, Train Accuracy: 0.7357
Validation Metrics:
Confusion Matrix:
[[ 0  7]
 [ 0 50]]
Classification Report:
              precision    recall  f1-score   support

    positive     0.0000    0.0000    0.0000         7
    negative     0.8772    1.0000    0.9346        50

    accuracy                         0.8772        57
   macro avg     0.4386    0.5000    0.4673        57
weighted avg     0.7695    0.8772    0.8198        57

F1 Score: 0.8198, Recall: 0.8772, Precision: 0.7695
------------------------------------------------------------
Epoch 2:
Train Loss: 0.4811, Train Accuracy: 0.7753
Validation Metrics:
Confusion Matrix:
[[ 1  6]
 [ 0 50]]
Classification Report:
              precision    recall  f1-score   support

    positive     1.0000    0.1429    0.2500         7
    negative     0.8929    1.0000    0.9434        50

    accuracy                         0.8947        57
   macro avg     0.9464    0.5714    0.5967        57
weighted avg     0.9060 

In [13]:
model.save_pretrained("bert_sentiment_model")
tokenizer.save_pretrained("bert_sentiment_model")

print("Model training complete and saved!")

Model training complete and saved!


In [35]:
def test_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate evaluation metrics
    conf_matrix = confusion_matrix(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    recall = recall_score(all_labels, all_preds, average="weighted")
    precision = precision_score(all_labels, all_preds, average="weighted")
    report = classification_report(all_labels, all_preds, target_names=["positive", "negative"], digits=4)

    print("Test Metrics:")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(report)
    print(f"F1 Score: {f1:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}")
    print("-" * 60)

    return conf_matrix, report, f1, recall, precision


In [36]:
conf_matrix, report, f1, recall, precision = test_model(model, val_loader, device)


Test Metrics:
Confusion Matrix:
[[ 0  7]
 [ 1 49]]
Classification Report:
              precision    recall  f1-score   support

    positive     0.0000    0.0000    0.0000         7
    negative     0.8750    0.9800    0.9245        50

    accuracy                         0.8596        57
   macro avg     0.4375    0.4900    0.4623        57
weighted avg     0.7675    0.8596    0.8110        57

F1 Score: 0.8110, Recall: 0.8596, Precision: 0.7675
------------------------------------------------------------
