In [52]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report



# Read the cleaned dataset from CSV

In [53]:
# Single source of truth for the dataset location, so the error message below
# always reports the file that was actually requested.
DATA_PATH = "processed_reddit_data.csv"  # Adjust path if necessary

try:
    reddit_data = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: File '{DATA_PATH}' not found. Ensure the dataset is available.")
    raise

# Ensure necessary columns are present
if "cleaned_title" not in reddit_data.columns:
    # NOTE(review): preprocess_text is not defined in the cells shown here; it
    # presumably comes from an earlier cell or kernel session - confirm it is
    # defined before this cell on a fresh "Restart & Run All".
    reddit_data["cleaned_title"] = reddit_data["title"].apply(preprocess_text)
if "score" not in reddit_data.columns:
    raise ValueError("Dataset must contain a 'score' column for sentiment assignment.")

# Assign sentiment based on the score

In [54]:
# Binary label: 1 when the post score is strictly positive, 0 otherwise
# (zero, negative and missing scores all map to 0).
reddit_data["sentiment"] = reddit_data["score"].gt(0).astype("int64")

#Split data into training and validation sets

In [55]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
titles = reddit_data["cleaned_title"]
sentiments = reddit_data["sentiment"]
train_texts, val_texts, train_labels, val_labels = train_test_split(
    titles,
    sentiments,
    test_size=0.2,
    random_state=42,
)

#Define the dataset class

In [56]:
class RedditDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Iterable of input strings (list, tuple, pandas Series, ...).
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors=...)`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors. When ``None``
            (the default) the module-level ``tokenizer`` is looked up each time
            an item is accessed, which is the original behaviour.
        max_length: Length every sequence is padded/truncated to (default 512).
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=512):
        # Materialise as lists so integer positions are always valid: indexing a
        # pandas Series with ``series[idx]`` looks up by index label, which fails
        # (or returns the wrong row) once the index is no longer 0..n-1.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (leading batch
            axis of the tokenizer output squeezed away) and ``"label"`` as a
            ``torch.long`` scalar tensor.
        """
        # Fall back to the notebook-level tokenizer only when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = encode(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }


#Load tokenizer and model

In [57]:
# Pick the compute device up front: GPU if CUDA is usable, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained uncased BERT tokenizer plus a model with a two-label
# sequence-classification head.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Last expression of the cell: moves the weights to `device` and displays the module.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

#Load tokenizer and model

In [58]:
# NOTE(review): this cell repeats the tokenizer/model loading done in the
# previous cell. Re-running it replaces `model` with a freshly loaded checkpoint
# whose classifier head is newly initialized (see the warning printed below),
# so the cell is redundant and is a candidate for deletion.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Prepare DataLoader

In [59]:
# Wrap each split in a RedditDataset; the pandas Series are converted to plain
# lists so that items can be addressed by integer position.
train_dataset = RedditDataset(texts=list(train_texts), labels=list(train_labels))
val_dataset = RedditDataset(texts=list(val_texts), labels=list(val_labels))

# Batches of 16: the training loader reshuffles, the validation loader keeps
# the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

# Define the optimizer

In [61]:
# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are passed
# explicitly to mirror the default hyperparameters of the deprecated class
# (torch's own defaults differ).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Define the training and evaluation functions

In [66]:
def train(model, data_loader):
    """Run one optimisation pass over ``data_loader``.

    Uses the notebook-level ``optimizer`` and ``device``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.

    Returns:
        Tuple ``(mean loss per batch, fraction of correctly predicted examples)``.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in data_loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        targets = batch["label"].to(device)
        outputs = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=targets,
        )

        step_loss = outputs.loss
        step_loss.backward()
        optimizer.step()

        # Bookkeeping for the epoch-level metrics.
        running_loss += step_loss.item()
        predicted = outputs.logits.argmax(dim=1)
        n_correct += (predicted == targets).sum().item()
        n_seen += targets.size(0)

    mean_loss = running_loss / len(data_loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

def evaluate_model(model, data_loader):
    """Print accuracy and a per-class classification report for ``model``.

    Inference runs in eval mode with gradients disabled, on the notebook-level
    ``device``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        data_loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.

    Returns:
        None. The metrics are printed.
    """
    model.eval()
    all_preds = []  # Predicted class ids, in loader order
    all_labels = []  # True class ids, in loader order

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)  # Get predicted class

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(batch["label"].cpu().tolist())

    # Calculate performance metrics
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Accuracy: {accuracy:.4f}")

    print("\nClassification Report:")
    # Pin labels to both classes: without this, a split in which only one class
    # appears makes the two target_names mismatch the inferred label set and
    # classification_report raises. zero_division=0 keeps the reported value
    # for an empty class at 0 without emitting a warning.
    print(
        classification_report(
            all_labels,
            all_preds,
            labels=[0, 1],
            target_names=["Negative", "Positive"],
            zero_division=0,
        )
    )


In [67]:
# Fine-tune first, then persist. Saving ahead of this loop would write the
# model to disk before any of these training epochs had run.
for epoch in range(3):
    train_loss, train_acc = train(model, train_loader)
    # Surface the per-epoch metrics instead of silently discarding them.
    print(f"Epoch {epoch + 1}/3 - loss: {train_loss:.4f} - accuracy: {train_acc:.4f}")

# Store the model and tokenizer side by side so the directory can be reloaded
# with from_pretrained("bert_sentiment_model").
model.save_pretrained("bert_sentiment_model")
tokenizer.save_pretrained("bert_sentiment_model")

print("Model training complete and saved!")


In [70]:
# Score the model on the held-out validation split; evaluate_model prints the
# accuracy and the per-class report itself.
print("Evaluating the model on the validation dataset...")
evaluate_model(model=model, data_loader=val_loader)

Evaluating the model on the validation dataset...
Accuracy: 0.7018

Classification Report:
              precision    recall  f1-score   support

    Negative       0.14      0.29      0.19         7
    Positive       0.88      0.76      0.82        50

    accuracy                           0.70        57
   macro avg       0.51      0.52      0.50        57
weighted avg       0.79      0.70      0.74        57

