In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import re
import nltk
from nltk.corpus import stopwords
import numpy as np

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))            #LATER

[nltk_data] Downloading package stopwords to /Users/amit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Load dataset
df = pd.read_csv("./datasets/sentimentdataset.csv")
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,728,732,Collaborating on a science project that receiv...,Happy,2017-08-18 18:20:00,ScienceProjectSuccessHighSchool,Facebook,#ScienceFairWinner #HighSchoolScience,20.0,39.0,UK,2017,8,18,18
728,729,733,Attending a surprise birthday party organized ...,Happy,2018-06-22 14:15:00,BirthdayPartyJoyHighSchool,Instagram,#SurpriseCelebration #HighSchoolFriendship,25.0,48.0,USA,2018,6,22,14
729,730,734,Successfully fundraising for a school charity ...,Happy,2019-04-05 17:30:00,CharityFundraisingTriumphHighSchool,Twitter,#CommunityGiving #HighSchoolPhilanthropy,22.0,42.0,Canada,2019,4,5,17
730,731,735,"Participating in a multicultural festival, cel...",Happy,2020-02-29 20:45:00,MulticulturalFestivalJoyHighSchool,Facebook,#CulturalCelebration #HighSchoolUnity,21.0,43.0,UK,2020,2,29,20


In [17]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    #text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

In [18]:
df = df[["Text", "Sentiment"]]
df["Sentiment"] = df["Sentiment"].str.strip()
df["ProcessedText"] = df["Text"].apply(preprocess_text)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sentiment"] = df["Sentiment"].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ProcessedText"] = df["Text"].apply(preprocess_text)


Unnamed: 0,Text,Sentiment,ProcessedText
0,Enjoying a beautiful day at the park! ...,Positive,enjoying beautiful day park!
1,Traffic was terrible this morning. ...,Negative,traffic terrible morning.
2,Just finished an amazing workout! 💪 ...,Positive,finished amazing workout! 💪
3,Excited about the upcoming weekend getaway! ...,Positive,excited upcoming weekend getaway!
4,Trying out a new recipe for dinner tonight. ...,Neutral,trying new recipe dinner tonight.
...,...,...,...
727,Collaborating on a science project that receiv...,Happy,collaborating science project received recogni...
728,Attending a surprise birthday party organized ...,Happy,attending surprise birthday party organized fr...
729,Successfully fundraising for a school charity ...,Happy,successfully fundraising school charity initia...
730,"Participating in a multicultural festival, cel...",Happy,"participating multicultural festival, celebrat..."


In [19]:
# Encode labels
label_encoder = LabelEncoder()
df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])


Unnamed: 0,Text,Sentiment,ProcessedText,SentimentEncoded
0,Enjoying a beautiful day at the park! ...,Positive,enjoying beautiful day park!,146
1,Traffic was terrible this morning. ...,Negative,traffic terrible morning.,134
2,Just finished an amazing workout! 💪 ...,Positive,finished amazing workout! 💪,146
3,Excited about the upcoming weekend getaway! ...,Positive,excited upcoming weekend getaway!,146
4,Trying out a new recipe for dinner tonight. ...,Neutral,trying new recipe dinner tonight.,135
...,...,...,...,...
727,Collaborating on a science project that receiv...,Happy,collaborating science project received recogni...,93
728,Attending a surprise birthday party organized ...,Happy,attending surprise birthday party organized fr...,93
729,Successfully fundraising for a school charity ...,Happy,successfully fundraising school charity initia...,93
730,"Participating in a multicultural festival, cel...",Happy,"participating multicultural festival, celebrat...",93


In [20]:
# Remove rare classes (classes with only 1 sample)
class_counts = df["SentimentEncoded"].value_counts()
rare_classes = class_counts[class_counts < 2].index
df = df[~df["SentimentEncoded"].isin(rare_classes)]
df

Unnamed: 0,Text,Sentiment,ProcessedText,SentimentEncoded
0,Enjoying a beautiful day at the park! ...,Positive,enjoying beautiful day park!,146
1,Traffic was terrible this morning. ...,Negative,traffic terrible morning.,134
2,Just finished an amazing workout! 💪 ...,Positive,finished amazing workout! 💪,146
3,Excited about the upcoming weekend getaway! ...,Positive,excited upcoming weekend getaway!,146
4,Trying out a new recipe for dinner tonight. ...,Neutral,trying new recipe dinner tonight.,135
...,...,...,...,...
727,Collaborating on a science project that receiv...,Happy,collaborating science project received recogni...,93
728,Attending a surprise birthday party organized ...,Happy,attending surprise birthday party organized fr...,93
729,Successfully fundraising for a school charity ...,Happy,successfully fundraising school charity initia...,93
730,"Participating in a multicultural festival, cel...",Happy,"participating multicultural festival, celebrat...",93


In [22]:
# Split dataset
X = df["ProcessedText"]
y = label_encoder.fit_transform(df["Sentiment"])  # Re-encode after removing rare classes
y

array([ 88,  82,  88,  88,  83,  88,  88,  88,  82,  83,  88,  82,  88,
        88,  83,  88,  88,  88,  83,  82,  88,  88,  88,  88,  88,  88,
        88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,
        88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,
         8,  51,  99,  36,  60,  75,  79,   7,  44,   2,   5,  13,  34,
       104,   0,   3,   9,  16,  19,  24,  49,  76,  89, 102,   8,  51,
        99,  36,  60,  75,  79,   7,  44,   2,   5,  13,  34, 104,   0,
         3,   9,  16,  19,  24,  49,  76,  89, 102,   0,  49,  16,  19,
        24,  49,  76,  89, 102,  38,  48,  26, 101,  58,  66,  42,  21,
       105,  12,  45,  56,  97,  38,  48,  26, 101,  58,  66,  42,  21,
       105,  12,  45,  56,  97,  38,  38,  48,  26, 101,  58,  66,  42,
        21, 105,  12,  45,  56,  97,  38,  48,  26, 101,  58,  66,  42,
        21, 105,  12,  45,  56,  97,  38,  31,  59,  77,  74,  95,  55,
        18,  10,  71,  65,  47,  93,  36,  31,  59,  77,  74,  9

### Word representation / Embedding layer

In [24]:
# Convert text to numerical features
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=20000)  # Increase features for better representation
X_tfidf = vectorizer.fit_transform(X).toarray()
X_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(653, 6248))

In [32]:
# Apply SMOTE to balance classes
from imblearn.over_sampling import BorderlineSMOTE
smote = BorderlineSMOTE(random_state=42, k_neighbors=1)
X_tfidf, y = smote.fit_resample(X_tfidf, y)

# Split after SMOTE
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

(3588, 6248)

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
X_test_tensor = torch.FloatTensor(X_test)
y_train_tensor = torch.LongTensor(y_train)
y_test_tensor = torch.LongTensor(y_test)
X_train_tensor

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [27]:
# Create PyTorch datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [28]:
# Define the PyTorch MLP model
class SentimentMLP(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(SentimentMLP, self).__init__()
        
        # First hidden layer
        self.layer1 = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.4)                         #LATER
        )
        
        # Second hidden layer
        self.layer2 = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.4)
        )
        
        # Third hidden layer
        self.layer3 = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU()
        )
        
        # Output layer
        self.output_layer = nn.Linear(256, num_classes)
        
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.output_layer(x)
        return x

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [33]:
# Initialize model, loss function, and optimizer
input_dim = X_train.shape[1]
num_classes = len(set(y_train))
model = SentimentMLP(input_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [None]:
# Training function
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the parameter gradients because we want each batch to function independently of the other
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()             #Compute the gradients wrt the loss above
        optimizer.step()            #Update the model parameters based on the above computed gradients
        
        # Track statistics
        running_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc

# Evaluation function
def evaluate(model, test_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Track statistics
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            # Store predictions and labels for classification report
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc, all_preds, all_labels

In [19]:
# Train the PyTorch model
epochs = 30
best_acc = 0.0

for epoch in range(epochs):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc, _, _ = evaluate(model, test_loader, criterion, device)
    
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
          f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')
    
    # Save best model
    if test_acc > best_acc:
        best_acc = test_acc
        torch.save(model.state_dict(), 'best_sentiment_model.pth')

# Load the best model for final evaluation
model.load_state_dict(torch.load('best_sentiment_model.pth'))
_, pytorch_acc, all_preds, all_labels = evaluate(model, test_loader, criterion, device)

print(f"PyTorch MLP Accuracy: {pytorch_acc:.4f}")
print("PyTorch Classification Report:\n", classification_report(all_labels, all_preds))

Epoch 1/30, Train Loss: 4.7159, Train Acc: 0.0071, Test Loss: 4.7112, Test Acc: 0.0113
Epoch 2/30, Train Loss: 4.7081, Train Acc: 0.0082, Test Loss: 4.7037, Test Acc: 0.0102
Epoch 3/30, Train Loss: 4.7009, Train Acc: 0.0102, Test Loss: 4.6969, Test Acc: 0.0102
Epoch 4/30, Train Loss: 4.6943, Train Acc: 0.0099, Test Loss: 4.6906, Test Acc: 0.0102
Epoch 5/30, Train Loss: 4.6879, Train Acc: 0.0102, Test Loss: 4.6840, Test Acc: 0.0102
Epoch 6/30, Train Loss: 4.6814, Train Acc: 0.0085, Test Loss: 4.6775, Test Acc: 0.0102
Epoch 7/30, Train Loss: 4.6747, Train Acc: 0.0099, Test Loss: 4.6710, Test Acc: 0.0102
Epoch 8/30, Train Loss: 4.6682, Train Acc: 0.0108, Test Loss: 4.6644, Test Acc: 0.0102
Epoch 9/30, Train Loss: 4.6618, Train Acc: 0.0077, Test Loss: 4.6578, Test Acc: 0.0102
Epoch 10/30, Train Loss: 4.6552, Train Acc: 0.0116, Test Loss: 4.6515, Test Acc: 0.0125
Epoch 11/30, Train Loss: 4.6493, Train Acc: 0.0102, Test Loss: 4.6451, Test Acc: 0.0102
Epoch 12/30, Train Loss: 4.6431, Train Ac

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
