In [1]:
!pip install torch torchtext scikit-learn datasets -q

In [2]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [3]:
import pandas as pd
test_path = '/content/twitter_validation.csv'
train_path = '/content/twitter_training.csv'

column_names = ['Tweet', 'Entity', 'Sentiment', 'Data']

train = pd.read_csv(train_path, header=None, names=column_names)
test = pd.read_csv(test_path, header=None, names=column_names)

train = train[train['Sentiment'] != 'Irrelevant']
train = train.dropna()
test = test[test['Sentiment'] != 'Irrelevant']
test = test.dropna()

train['Label'] = label_encoder.fit_transform(train['Sentiment'])
test['Label'] = label_encoder.fit_transform(test['Sentiment'])
test.head()


Unnamed: 0,Tweet,Entity,Sentiment,Data,Label
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,1
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,0
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",0
4,4433,Google,Neutral,Now the President is slapping Americans in the...,1
5,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...,0


In [9]:
length = int(train.shape[0]*.5)
train_texts, train_labels = train['Data'][:length].to_list(), train['Label'][:length].to_list()

test_texts, test_labels = test['Data'].to_list(), test['Label'].to_list()
train_texts[0], train_labels[0]

('im getting on borderlands and i will murder you all ,', 2)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Simple Tokenizer
def simple_tokenizer(text):
    return text.lower().split()

# Build Vocabulary
all_tokens = [simple_tokenizer(text) for text in train_texts+test_texts]
unique_tokens = set(token for tokens in all_tokens for token in tokens)
vocab = {word: idx + 1 for idx, word in enumerate(unique_tokens)}
vocab["<PAD>"] = 0  # Padding Token

# Hyperparameters
embedding_dim = 50  # Dimension of word embeddings
hidden_dim = 64
output_dim = 3
max_len = 20  # Maximum sequence length

# Custom Dataset
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, vocab, tokenizer, max_len=20):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])[:self.max_len]
        input_ids = [self.vocab[token] if token in self.vocab else 0 for token in tokens]
        input_ids += [0] * (self.max_len - len(input_ids))  # Padding
        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

# Split Dataset
# train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load Data
train_dataset = SentimentDataset(train_texts, train_labels, vocab, simple_tokenizer)
test_dataset = SentimentDataset(test_texts, test_labels, vocab, simple_tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)



In [11]:

# Define Simple Neural Network Model
class NN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(NN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)  # Convert words to embeddings
        x = x.mean(dim=1)  # Average pooling
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        return self.fc2(x)  # No Softmax (CrossEntropyLoss applies it)

# Initialize Model
vocab_size = len(vocab)
model = NN(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Function
def train_model(model, train_loader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

# Evaluation Function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, actual_labels = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            actual_labels.extend(labels.cpu().numpy())

    accuracy = np.mean(np.array(predictions) == np.array(actual_labels))
    print(f"Test Accuracy: {accuracy:.4f}")



In [12]:
# Train & Evaluate
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)

Epoch 1/10, Loss: 0.9498
Epoch 2/10, Loss: 0.5930
Epoch 3/10, Loss: 0.3579
Epoch 4/10, Loss: 0.2288
Epoch 5/10, Loss: 0.1697
Epoch 6/10, Loss: 0.1317
Epoch 7/10, Loss: 0.1077
Epoch 8/10, Loss: 0.0860
Epoch 9/10, Loss: 0.0810
Epoch 10/10, Loss: 0.0714
Test Accuracy: 0.5435
