In [None]:
!pip install datasets -q

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score

In [None]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        model_name: Checkpoint name handed to ``BertModel.from_pretrained``.
        num_classes: Width of the output layer (one logit per class).
    """

    def __init__(self, model_name="bert-base-uncased", num_classes=3):
        super().__init__()
        encoder = BertModel.from_pretrained(model_name)
        self.bert = encoder
        self.drop = nn.Dropout(p=0.3)
        # The head maps the encoder's hidden size to one logit per class.
        self.fc = nn.Linear(encoder.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class scores produced by ``fc`` for a batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output: the encoder's pooled, sequence-level vector
        # (derived from the [CLS] position).
        return self.fc(self.drop(encoded.pooler_output))

# Tokenizer: loaded from the same "bert-base-uncased" checkpoint name that
# SentimentClassifier uses by default; it is passed to SentimentDataset later.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Custom Dataset Class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable applied to each text (see ``__getitem__``).
        max_len: Length each encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The dataset size is driven by the texts, not the labels.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for item ``idx``."""
        text, label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # squeeze(0) drops the leading size-1 dimension of each encoded tensor.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(label, dtype=torch.long)
        return sample


In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder the data-loading cell uses to turn `Sentiment` strings into integer labels.
label_encoder = LabelEncoder()

In [None]:
import pandas as pd
# Locations of the training and validation CSVs.
test_path = '/content/twitter_validation.csv'
train_path = '/content/twitter_training.csv'

# The files are read without a header row, so names are supplied here.
# NOTE: per the `test.head()` output, 'Tweet' actually holds a numeric id and
# the tweet text is in 'Data'; the names are kept because later cells index by them.
column_names = ['Tweet', 'Entity', 'Sentiment', 'Data']

train = pd.read_csv(train_path, header=None, names=column_names)
test = pd.read_csv(test_path, header=None, names=column_names)

# Drop the 'Irrelevant' class and any row with a missing field. `.copy()` makes
# each result an independent frame so the column assignment below is not
# performed on a slice of the raw data.
train = train[train['Sentiment'] != 'Irrelevant'].dropna().copy()
test = test[test['Sentiment'] != 'Irrelevant'].dropna().copy()

# Fit the label mapping on the training labels only and reuse it for the
# validation set. Re-fitting on `test` would build a second, independent mapping
# that matches the first only if both files contain exactly the same classes;
# `transform` raises instead if `test` contains an unseen class.
train['Label'] = label_encoder.fit_transform(train['Sentiment'])
test['Label'] = label_encoder.transform(test['Sentiment'])
test.head()


Unnamed: 0,Tweet,Entity,Sentiment,Data,Label
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,1
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,0
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",0
4,4433,Google,Neutral,Now the President is slapping Americans in the...,1
5,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...,0


In [None]:
# Train on the first 50% of the rows only. This is a positional slice in file
# order, not a random sample.
length = int(0.5 * len(train))
train_texts = train['Data'].iloc[:length].tolist()
train_labels = train['Label'].iloc[:length].tolist()

# The validation set is used in full.
test_texts = test['Data'].tolist()
test_labels = test['Label'].tolist()

# Peek at the first training example.
(train_texts[0], train_labels[0])

('im getting on borderlands and i will murder you all ,', 2)

In [None]:
# import pandas as pd
# df = pd.read_csv('/content/twitter_training.csv')
# df.shape

In [None]:
# Wrap the text/label lists in datasets that tokenize on access.
train_dataset = SentimentDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = SentimentDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

# Batches of 16; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
# Initialize Model, Loss, Optimizer
# Seed torch's global RNG first so that whatever it drives (classifier-head
# initialisation, dropout, DataLoader shuffling) is repeatable on a fresh run.
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentClassifier().to(device)
criterion = nn.CrossEntropyLoss()
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# in favour of it. weight_decay and eps are passed explicitly to mirror the
# deprecated class's defaults (torch's own defaults are 1e-2 and 1e-8), so the
# optimisation itself is unchanged.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0, eps=1e-6)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [None]:
# Training Loop
# NOTE(review): this definition rebinds the notebook-level name `train`, which
# the data cells use for the training DataFrame. Re-run the data-loading cell
# before re-running any cell that reads `train` as a DataFrame.
def train(model, train_loader, criterion, optimizer, epochs):
    """Fine-tune ``model`` on ``train_loader`` for ``epochs`` passes.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of dict batches holding ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        list[float]: Mean batch loss of each epoch, in order.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    # Move batches to wherever the model lives instead of relying on a
    # notebook-global `device`; fall back to CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss, num_batches = 0.0, 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(target)
            attention_mask = batch["attention_mask"].to(target)
            labels = batch["label"].to(target)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Batches are counted while iterating so the average never divides by zero.
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches")
        mean_loss = total_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")
    print("done")
    return epoch_losses

# Evaluation Function
def evaluate(model, test_loader):
    """Print and return the accuracy of ``model`` over ``test_loader``.

    The model is switched to eval mode and left in that mode.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` whose
            output is arg-maxed over dim 1 to obtain the predicted class.
        test_loader: Iterable of dict batches holding ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.

    Returns:
        The value computed by ``accuracy_score(actual_labels, predictions)``.
    """
    # Move batches to wherever the model lives instead of relying on a
    # notebook-global `device`; fall back to CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions, actual_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(target)
            attention_mask = batch["attention_mask"].to(target)
            outputs = model(input_ids, attention_mask)
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            # Labels are only compared on the host, so they never need to
            # visit the model's device.
            actual_labels.extend(batch["label"].tolist())

    acc = accuracy_score(actual_labels, predictions)
    print(f"Test Accuracy: {acc:.4f}")
    return acc

In [None]:
# Train: fine-tune for 3 epochs (evaluation is run in the following cell).
train(model, train_loader, criterion, optimizer, epochs=3)

Epoch 1, Loss: 0.5421
Epoch 2, Loss: 0.1748
Epoch 3, Loss: 0.0914
done


In [None]:
# Accuracy on the validation data loaded from `test_path`.
evaluate(model, test_loader)

Test Accuracy: 0.8309
