In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter
import torchtext
import torchtext.vocab as vocab
import os

# Download necessary resources
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Use smaller dimension for faster processing, options: 50, 100, 200, 300
glove_dim = 100
print(f"Loading GloVe embeddings ({glove_dim}d)...")
# Create cache directory if it doesn't exist
os.makedirs('.vector_cache', exist_ok=True)
# Load GloVe vectors
glove = vocab.GloVe(name='6B', dim=glove_dim, cache='.vector_cache')
print(f"Loaded {len(glove.stoi)} word vectors")

[nltk_data] Downloading package stopwords to /Users/amit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/amit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading GloVe embeddings (100d)...
Loaded 400000 word vectors


In [33]:
# Load dataset
df = pd.read_csv("./datasets/sentimentdataset.csv")
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,728,732,Collaborating on a science project that receiv...,Happy,2017-08-18 18:20:00,ScienceProjectSuccessHighSchool,Facebook,#ScienceFairWinner #HighSchoolScience,20.0,39.0,UK,2017,8,18,18
728,729,733,Attending a surprise birthday party organized ...,Happy,2018-06-22 14:15:00,BirthdayPartyJoyHighSchool,Instagram,#SurpriseCelebration #HighSchoolFriendship,25.0,48.0,USA,2018,6,22,14
729,730,734,Successfully fundraising for a school charity ...,Happy,2019-04-05 17:30:00,CharityFundraisingTriumphHighSchool,Twitter,#CommunityGiving #HighSchoolPhilanthropy,22.0,42.0,Canada,2019,4,5,17
730,731,735,"Participating in a multicultural festival, cel...",Happy,2020-02-29 20:45:00,MulticulturalFestivalJoyHighSchool,Facebook,#CulturalCelebration #HighSchoolUnity,21.0,43.0,UK,2020,2,29,20


Cell 2: Load dataset and define preprocessing

In [34]:
# Define text preprocessing function with improved cleaning
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    # More comprehensive cleaning
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#(\w+)', r'\1', text)  # Keep hashtag content without # symbol
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    
    # More advanced tokenization and lemmatization
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 1]
    return ' '.join(words)

Cell 3: Preprocess text and feature engineering


In [35]:
# Preprocess text data
df["ProcessedText"] = df["Text"].apply(preprocess_text)

# Extract hashtags as a feature
def extract_hashtags(hashtag_str):
    if pd.isna(hashtag_str) or hashtag_str == '':
        return []
    return hashtag_str.split()

df['HashtagsList'] = df['Hashtags'].apply(extract_hashtags)
df['HashtagCount'] = df['HashtagsList'].apply(len)

# Feature engineering from datetime
if 'Timestamp' in df.columns:                                                                                           #LATER
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d-%m-%Y %H:%M', errors='coerce')                         #ERRORS=COERCE MEANS THAT IF CONVERSION FAILS FOR ANY VALUE, IT WILL BE SET TO NAT (NOT A TIME)
    
    # If timestamp conversion failed, try to use the individual date/time columns
    if df['Timestamp'].isna().any() and all(col in df.columns for col in ['Year', 'Month', 'Day', 'Hour']):
        df['Timestamp'] = pd.to_datetime(
            df[['Year', 'Month', 'Day', 'Hour']].fillna(0).astype(int),
            format='%Y%m%d%H',
            errors='coerce'
        )
    
    # Extract time features
    df['Hour'] = df['Timestamp'].dt.hour
    df['DayOfWeek'] = df['Timestamp'].dt.dayofweek
    df['Month'] = df['Timestamp'].dt.month
    df['IsWeekend'] = df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)  # 5,6 = weekend
    df['TimeOfDay'] = df['Hour'].apply(lambda h: 
                                     'Morning' if 5 <= h < 12 else
                                     'Afternoon' if 12 <= h < 17 else
                                     'Evening' if 17 <= h < 21 else
                                     'Night')

# Process engagement metrics
engagement_cols = ['Retweets', 'Likes']
for col in engagement_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Feature: Engagement ratio
if all(col in df.columns for col in engagement_cols):                                       #CHECKS IF BOTH 'RETWEETS' AND 'LIKES' COLUMN EXIST IN THE DATAFRAME
    df['EngagementTotal'] = df['Retweets'] + df['Likes']                                    
    df['RetweetRatio'] = df.apply(lambda row: row['Retweets'] / row['EngagementTotal'] 
                                if row['EngagementTotal'] > 0 else 0, axis=1)

# Feature: Text properties
df['TextLength'] = df['ProcessedText'].apply(len)
df['WordCount'] = df['ProcessedText'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
df['AvgWordLength'] = df.apply(lambda row: np.mean([len(word) for word in row['ProcessedText'].split()]) 
                              if row['WordCount'] > 0 else 0, axis=1)

In [36]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,...,HashtagsList,HashtagCount,DayOfWeek,IsWeekend,TimeOfDay,EngagementTotal,RetweetRatio,TextLength,WordCount,AvgWordLength
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:00:00,User123,Twitter,#Nature #Park,15.0,30.0,...,"[#Nature, #Park]",2,6,1,Afternoon,45.0,0.333333,27,4,6.000000
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:00:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,...,"[#Traffic, #Morning]",2,6,1,Morning,15.0,0.333333,24,3,7.333333
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:00:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,...,"[#Fitness, #Workout]",2,6,1,Afternoon,60.0,0.333333,24,3,7.333333
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:00:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,...,"[#Travel, #Adventure]",2,6,1,Evening,23.0,0.347826,32,4,7.250000
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:00:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,...,"[#Cooking, #Food]",2,6,1,Evening,37.0,0.324324,32,5,5.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,728,732,Collaborating on a science project that receiv...,Happy,2017-08-18 18:00:00,ScienceProjectSuccessHighSchool,Facebook,#ScienceFairWinner #HighSchoolScience,20.0,39.0,...,"[#ScienceFairWinner, #HighSchoolScience]",2,4,0,Evening,59.0,0.338983,86,10,7.700000
728,729,733,Attending a surprise birthday party organized ...,Happy,2018-06-22 14:00:00,BirthdayPartyJoyHighSchool,Instagram,#SurpriseCelebration #HighSchoolFriendship,25.0,48.0,...,"[#SurpriseCelebration, #HighSchoolFriendship]",2,4,0,Afternoon,73.0,0.342466,88,11,7.090909
729,730,734,Successfully fundraising for a school charity ...,Happy,2019-04-05 17:00:00,CharityFundraisingTriumphHighSchool,Twitter,#CommunityGiving #HighSchoolPhilanthropy,22.0,42.0,...,"[#CommunityGiving, #HighSchoolPhilanthropy]",2,4,0,Evening,64.0,0.343750,76,9,7.555556
730,731,735,"Participating in a multicultural festival, cel...",Happy,2020-02-29 20:00:00,MulticulturalFestivalJoyHighSchool,Facebook,#CulturalCelebration #HighSchoolUnity,21.0,43.0,...,"[#CulturalCelebration, #HighSchoolUnity]",2,5,1,Evening,64.0,0.328125,85,9,8.555556


Cell 4: Exploratory data analysis and sentiment grouping


In [37]:
# Print dataset info
print(f"Dataset shape: {df.shape}")
print(f"Number of unique sentiments: {df['Sentiment'].nunique()}")
print(f"Class distribution (top 10): \n{df['Sentiment'].value_counts().head(10)}")

# Remove empty text
df = df[df['ProcessedText'].str.strip() != '']

# Group similar sentiments to reduce class count
# This is a key improvement to handle the high number of classes
sentiment_groups = {
    'Positive': ['Positive', 'Happy', 'Excited', 'Joy', 'Satisfied', 'Grateful', 'Hopeful', 
                'Optimistic', 'Pleased', 'Cheerful', 'Enthusiastic', 'Delighted', 'Good', 'Great'],
    'Negative': ['Negative', 'Sad', 'Angry', 'Frustrated', 'Disappointed', 'Worried', 'Anxious',
                'Annoyed', 'Upset', 'Concerned', 'Unhappy', 'Distressed', 'Irritated', 'Bad'],
    'Neutral': ['Neutral', 'Indifferent', 'Calm', 'Balanced', 'Objective', 'Normal'],
    'Surprised': ['Surprised', 'Shocked', 'Amazed', 'Astonished'],
    'Confused': ['Confused', 'Uncertain', 'Puzzled', 'Perplexed'],
    'Fear': ['Fear', 'Scared', 'Afraid', 'Terrified', 'Nervous'],
    'Disgust': ['Disgust', 'Repulsed', 'Revolted']
}

# # Create a mapping function
# def map_sentiment(sentiment):
#     for group, values in sentiment_groups.items():
#         if sentiment.lower() in [v.lower() for v in values]:
#             return group
#     return sentiment  # Keep original if not in any group, changed from 'Other'

def map_sentiment(sentiment):
    # Add debugging to see what's happening
    sentiment_lower = sentiment.lower().strip()
    for group, values in sentiment_groups.items():
        values_lower = [v.lower().strip() for v in values]
        if sentiment_lower in values_lower:
            return group
    return sentiment  # Keep original if not in any group

# Apply mapping to reduce number of classes
df['GroupedSentiment'] = df['Sentiment'].apply(map_sentiment)
print(f"Number of unique sentiments after grouping: {df['GroupedSentiment'].nunique()}")

# Validate no duplicates in your sentiment groups
all_values = []
for values in sentiment_groups.values():
    all_values.extend(values)
duplicates = [item for item, count in Counter(all_values).items() if count > 1]
if duplicates:
    print(f"Warning! Duplicate values in sentiment groups: {duplicates}")

# Filter classes with few samples (optional)
min_samples_per_class = 5
class_counts = df["GroupedSentiment"].value_counts()
valid_classes = class_counts[class_counts >= min_samples_per_class].index
df = df[df["GroupedSentiment"].isin(valid_classes)]
print(f"Classes remaining after filtering: {df['GroupedSentiment'].nunique()}")
print(f"Distribution after filtering: \n{df['GroupedSentiment'].value_counts()}")

Dataset shape: (732, 26)
Number of unique sentiments: 279
Class distribution (top 10): 
Sentiment
Positive        44
Joy             42
Excitement      32
Neutral         14
Contentment     14
Happy           14
Hopeful          9
Sad              9
Gratitude        9
Curiosity        8
Name: count, dtype: int64
Number of unique sentiments after grouping: 266
Classes remaining after filtering: 31
Distribution after filtering: 
GroupedSentiment
Positive             116
 Excitement           32
Negative              26
Neutral               18
 Contentment          14
 Gratitude             9
 Embarrassed           8
 Curiosity             8
 Loneliness            7
 Despair               6
 Playful               6
 Hate                  6
 Elation               6
 Curiosity             5
 Empowerment           5
Disgust                5
 Bitterness            5
 Contentment           5
 Serenity              5
 Inspired              5
 Gratitude             5
 Serenity              5
 I

Cell 5: Label encoding and feature selection


In [38]:
# IMPORTANT: Encode labels AFTER filtering
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["GroupedSentiment"])
print(f"Number of samples after filtering: {len(df)}")
print(f"Number of unique sentiments after filtering: {len(np.unique(y))}")

# Define features to use
text_feature = 'ProcessedText'
categorical_features = ['Platform', 'Country', 'TimeOfDay']
numerical_features = ['HashtagCount', 'IsWeekend', 'TextLength', 'WordCount', 'AvgWordLength']

# Filter out any features not in the dataframe
categorical_features = [f for f in categorical_features if f in df.columns]
numerical_features = [f for f in numerical_features if f in df.columns]

print("Using features:")
print(f"- Text feature: {text_feature}")
print(f"- Categorical features: {categorical_features}")
print(f"- Numerical features: {numerical_features}")

# Fill missing values in feature columns
for feat in categorical_features:
    df[feat] = df[feat].fillna('unknown')
for feat in numerical_features:
    df[feat] = df[feat].fillna(0)

Number of samples after filtering: 352
Number of unique sentiments after filtering: 31
Using features:
- Text feature: ProcessedText
- Categorical features: ['Platform', 'Country', 'TimeOfDay']
- Numerical features: ['HashtagCount', 'IsWeekend', 'TextLength', 'WordCount', 'AvgWordLength']


Cell 6: Dataset splitting


In [39]:
# Split dataset AFTER encoding labels
X_text = df[text_feature]
X_categorical = df[categorical_features] if categorical_features else pd.DataFrame()
X_numerical = df[numerical_features] if numerical_features else pd.DataFrame()

# Use stratified split if possible, otherwise use regular split
try:
    X_text_train, X_text_test, X_cat_train, X_cat_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
        X_text, X_categorical, X_numerical, y, test_size=0.2, random_state=42, stratify=y
    )
    print("Using stratified split")
except ValueError:
    print("Warning: Stratified split not possible. Using random split.")
    X_text_train, X_text_test, X_cat_train, X_cat_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
        X_text, X_categorical, X_numerical, y, test_size=0.2, random_state=42
    )

print(f"Train set shape: {len(X_text_train)} samples")
print(f"Test set shape: {len(X_text_test)} samples")

Using stratified split
Train set shape: 281 samples
Test set shape: 71 samples


Cell 7: Feature processing


In [40]:
# # Process text features with better TF-IDF parameters
# vectorizer = TfidfVectorizer(
#     ngram_range=(1, 2),
#     max_features=10000,
#     min_df=2,                                                               #HERE DF IN MIN_DF AND MAX_DF STAND FOR "DOCUMENT FREQUENCY". MIN_DF = 2 MEANS IGNORE TERMS THAT APPEAR IN FEWER THAN 2 DOCUMENTS
#     max_df=0.9,                                                                 #MAX_DF = 0.9 means that ignore words that appear in more than 90% of the documents (NOTEThat: MIN_DF USES INTEGER VALUES WHILE MAX_DF USES FRACTIONAL VALUES )
#     sublinear_tf=True  # Apply sublinear tf scaling (logarithmic)           #APPLY LOGARITHMIC SCALING TO TF (TO REDUCE THE IMPACT OF VERY HIGH FREQUENCY TERMS)
# )
# X_text_train_tfidf = vectorizer.fit_transform(X_text_train)
# X_text_test_tfidf = vectorizer.transform(X_text_test)

def text_to_glove_embedding(text, glove_vectors, dim=100):
    words = text.split()
    # Initialize embeddings array
    embedding = np.zeros(dim)
    count = 0
    # Average embeddings of all words in the text
    for word in words:
        if word in glove_vectors.stoi:
            # Get word index and corresponding vector
            idx = glove_vectors.stoi[word]
            embedding += glove_vectors.vectors[idx].numpy()
            count += 1
    # Take average, handle empty texts
    if count > 0:
        embedding /= count
    return embedding

print("Converting text to GloVe embeddings...")
X_text_train_glove = np.array([text_to_glove_embedding(text, glove, glove_dim) 
                              for text in X_text_train])
X_text_test_glove = np.array([text_to_glove_embedding(text, glove, glove_dim)
                            for text in X_text_test])

print(f"Text features shape after GloVe: {X_text_train_glove.shape}")

# Process categorical features
if not X_categorical.empty:
    categorical_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_cat_train_encoded = categorical_encoder.fit_transform(X_cat_train)
    X_cat_test_encoded = categorical_encoder.transform(X_cat_test)
    print(f"Categorical features shape: {X_cat_train_encoded.shape}")
else:
    X_cat_train_encoded = np.empty((X_text_train.shape[0], 0))
    X_cat_test_encoded = np.empty((X_text_test.shape[0], 0))

# Process numerical features
if not X_numerical.empty:
    scaler = StandardScaler()
    X_num_train_scaled = scaler.fit_transform(X_num_train)
    X_num_test_scaled = scaler.transform(X_num_test)
    print(f"Numerical features shape: {X_num_train_scaled.shape}")
else:
    X_num_train_scaled = np.empty((X_text_train.shape[0], 0))
    X_num_test_scaled = np.empty((X_text_test.shape[0], 0))

# Combine all features
X_train_combined = np.hstack((X_text_train_glove, X_cat_train_encoded, X_num_train_scaled))
X_test_combined = np.hstack((X_text_test_glove, X_cat_test_encoded, X_num_test_scaled))
print(f"Combined features shape: {X_train_combined.shape}")
X_train_combined

Converting text to GloVe embeddings...
Text features shape after GloVe: (281, 100)
Categorical features shape: (281, 54)
Numerical features shape: (281, 5)
Combined features shape: (281, 159)


array([[ 0.36447621,  0.3234956 , -0.06897601, ..., -1.21670014,
        -1.1583153 , -0.57603418],
       [ 0.16918682,  0.20915571,  0.30206886, ..., -0.2066176 ,
        -0.47523744,  0.83267706],
       [ 0.067458  ,  0.21078275,  0.29921475, ..., -0.00460109,
        -0.13369851,  0.36655937],
       ...,
       [-0.11144333,  0.49449112, -0.11655111, ...,  1.28830457,
         0.89091828,  0.970786  ],
       [-0.05229475,  0.13861249,  0.32730749, ..., -1.49952325,
        -1.49985423, -0.47935792],
       [-0.1832045 ,  0.06354636,  0.42283182, ...,  0.76306165,
         0.89091828, -0.17175164]], shape=(281, 159))

Cell 8: Dataset preparation for PyTorch


In [41]:
# Apply class weights to handle imbalanced dataset
class_counts = Counter(y_train)
total_samples = len(y_train)
class_weights = {class_idx: total_samples / (len(class_counts) * count) for class_idx, count in class_counts.items()}           #CLASS_WEIGHTS REFER TO THE IMPORTANCE THAT NEED TO BE GIVEN TO EACH CLASS (TO GIVE HIGHER WEIGHTS TO UNDERREPRESENTED CLASSES, AND LOWER WEIGHTS TO OTHERS)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
weights = torch.FloatTensor([class_weights.get(i, 1.0) for i in range(len(np.unique(y)))]).to(device)                           #CREATE A PYTORCH TENSOR FOR CLASS WEIGHTS

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_combined)
X_test_tensor = torch.FloatTensor(X_test_combined)
y_train_tensor = torch.LongTensor(y_train)
y_test_tensor = torch.LongTensor(y_test)

# Create PyTorch datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Increase batch size if classes are reduced
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Cell 9: Define neural network model


In [42]:
# # Define the improved model with better architecture
# class ImprovedSentimentClassifier(nn.Module):
#     def __init__(self, input_dim, num_classes, dropout_rate=0.3):
#         super(ImprovedSentimentClassifier, self).__init__()
        
#         # Improved architecture with better capacity control
#         self.layer1 = nn.Linear(input_dim, 512)
#         self.bn1 = nn.BatchNorm1d(512)
#         self.dropout1 = nn.Dropout(dropout_rate)
        
#         self.layer2 = nn.Linear(512, 256)
#         self.bn2 = nn.BatchNorm1d(256)
#         self.dropout2 = nn.Dropout(dropout_rate)
        
#         self.layer3 = nn.Linear(256, 128)
#         self.bn3 = nn.BatchNorm1d(128)
#         self.dropout3 = nn.Dropout(dropout_rate)
        
#         self.output_layer = nn.Linear(128, num_classes)
    
#     def forward(self, x):
#         # Use ELU activation for better gradient flow
#         x = torch.nn.functional.elu(self.layer1(x))
#         x = self.bn1(x)
#         x = self.dropout1(x)
        
#         x = torch.nn.functional.elu(self.layer2(x))
#         x = self.bn2(x)
#         x = self.dropout2(x)
        
#         x = torch.nn.functional.elu(self.layer3(x))
#         x = self.bn3(x)
#         x = self.dropout3(x)
        
#         return self.output_layer(x)

class GloVeSentimentClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, dropout_rate=0.3):
        super(GloVeSentimentClassifier, self).__init__()
        
        # Adjust architecture for GloVe embeddings
        self.layer1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dropout1 = nn.Dropout(dropout_rate)
        
        self.layer2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(dropout_rate)
        
        self.output_layer = nn.Linear(128, num_classes)
        
    def forward(self, x):
        x = torch.nn.functional.elu(self.layer1(x))
        x = self.bn1(x)
        x = self.dropout1(x)
        
        x = torch.nn.functional.elu(self.layer2(x))
        x = self.bn2(x)
        x = self.dropout2(x)
        
        return self.output_layer(x)

Cell 10: Initialize model and training components


In [43]:
# Set device
print(f"Using device: {device}")

# Initialize model, loss function, and optimizer
input_dim = X_train_combined.shape[1]
num_classes = len(np.unique(y))
print(f"Building model with {input_dim} input features and {num_classes} output classes")

model = GloVeSentimentClassifier(input_dim, num_classes).to(device)

# Use weighted cross entropy loss to handle imbalanced classes
criterion = nn.CrossEntropyLoss(weight=weights)

# Use AdamW optimizer with weight decay for better regularization
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

# Learning rate scheduler with cosine annealing for better convergence
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)

Using device: cpu
Building model with 159 input features and 31 output classes


Cell 11: Define training and evaluation functions


In [44]:
# Training function
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()


        running_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    return running_loss / total, correct / total

# Evaluation function
def evaluate(model, data_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return running_loss / total, correct / total, all_preds, all_labels

Cell 12: Training loop with early stopping


In [45]:
# Training loop with early stopping
num_epochs = 100
best_accuracy = 0.0
patience = 7
no_improvement = 0

try:
    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc, _, _ = evaluate(model, test_loader, criterion, device)

        # Learning rate scheduler step
        scheduler.step()

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

        # Save best model and check for early stopping
        if val_acc > best_accuracy:
            best_accuracy = val_acc
            torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'accuracy': val_acc,
    'label_encoder': label_encoder,
    'glove_dim': glove_dim,
    'glove_name': '6B',  # Adding the specific GloVe model name
    'categorical_encoder': categorical_encoder if not X_categorical.empty else None,
    'scaler': scaler if not X_numerical.empty else None,
    'categorical_features': categorical_features,
    'numerical_features': numerical_features,
}, 'best_sentiment_model.pth')
            print(f'Model saved at epoch {epoch+1} with validation accuracy: {val_acc:.4f}')
            no_improvement = 0
        else:
            no_improvement += 1
            if no_improvement == patience:
                print(f'Early stopping after {patience} epochs without improvement')
                break

except KeyboardInterrupt:
    print("Training interrupted!")

Epoch 1/100, Train Loss: 3.4438, Train Acc: 0.0356, Val Loss: 3.3058, Val Acc: 0.1268


Model saved at epoch 1 with validation accuracy: 0.1268
Epoch 2/100, Train Loss: 2.3829, Train Acc: 0.2349, Val Loss: 3.0377, Val Acc: 0.1972
Model saved at epoch 2 with validation accuracy: 0.1972
Epoch 3/100, Train Loss: 1.8336, Train Acc: 0.3594, Val Loss: 2.6726, Val Acc: 0.2535
Model saved at epoch 3 with validation accuracy: 0.2535
Epoch 4/100, Train Loss: 1.4336, Train Acc: 0.4448, Val Loss: 2.3041, Val Acc: 0.2817
Model saved at epoch 4 with validation accuracy: 0.2817
Epoch 5/100, Train Loss: 1.1054, Train Acc: 0.4875, Val Loss: 2.0093, Val Acc: 0.3239
Model saved at epoch 5 with validation accuracy: 0.3239
Epoch 6/100, Train Loss: 0.8709, Train Acc: 0.5196, Val Loss: 1.7726, Val Acc: 0.3521
Model saved at epoch 6 with validation accuracy: 0.3521
Epoch 7/100, Train Loss: 0.7310, Train Acc: 0.5623, Val Loss: 1.5779, Val Acc: 0.3944
Model saved at epoch 7 with validation accuracy: 0.3944
Epoch 8/100, Train Loss: 0.5990, Train Acc: 0.6085, Val Loss: 1.4668, Val Acc: 0.3944
Epoch 

Cell 13: Final model evaluation


In [46]:
# Load the best model for final evaluation
try:
    # Add these lines to fix the LabelEncoder loading issue
    from sklearn.preprocessing import LabelEncoder
    torch.serialization.add_safe_globals([LabelEncoder])
    
    # Now load with weights_only=False to allow loading Python objects
    checkpoint = torch.load('best_sentiment_model.pth', weights_only=False)
    
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded best model from epoch {checkpoint['epoch']+1} with validation accuracy: {checkpoint['accuracy']:.4f}")
except FileNotFoundError:
    print("Best model file not found. Continuing with current model state.")

# Final evaluation on test set
test_loss, test_acc, all_preds, all_labels = evaluate(model, test_loader, criterion, device)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")
print("Classification Report:\n", classification_report(all_labels, all_preds, target_names=label_encoder.classes_, zero_division=1))

Loaded best model from epoch 16 with validation accuracy: 0.5775

Final Test Accuracy: 0.5775
Classification Report:
                    precision    recall  f1-score   support

 Acceptance             0.50      1.00      0.67         1
 Ambivalence            1.00      1.00      1.00         1
      Bitterness        1.00      1.00      1.00         1
 Confusion              1.00      1.00      1.00         1
     Contentment        0.50      0.33      0.40         3
   Contentment          1.00      0.00      0.00         1
       Curiosity        0.33      0.50      0.40         2
 Curiosity              1.00      1.00      1.00         1
         Despair        0.50      1.00      0.67         1
 Determination          1.00      1.00      1.00         1
   Elation              0.33      1.00      0.50         1
     Embarrassed        0.00      0.00      0.00         2
   Empowerment          1.00      0.00      0.00         1
   Enthusiasm           0.33      1.00      0.50       