In [1]:
# ========== IMPORTS & SETUP ==========
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors
import re
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import optuna
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize NLTK
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [None]:
# Files checking
import os
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/sample_submission.csv
/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/train_dataset.csv
/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/test_dataset.csv
/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/val_dataset.csv


In [None]:
# ========== DATA LOADING ==========
BASE_PATH = '../input'
train_df = pd.read_csv(os.path.join(BASE_PATH, 'train_dataset.csv'))
val_df = pd.read_csv(os.path.join(BASE_PATH, 'val_dataset.csv'))
test_df = pd.read_csv(os.path.join(BASE_PATH, 'test_dataset.csv'))

In [4]:
# ========== CONSTANTS ==========
DEFAULT_BATCH_SIZE = 32  # Used for test loader and initial setup

In [5]:
# ========== PREPROCESSING ==========
stop_words = set(stopwords.words('english'))
w2v_model = KeyedVectors.load_word2vec_format(
    "/kaggle/input/google-word2vec/GoogleNews-vectors-negative300.bin",
    binary=True
)

def preprocess_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token not in stop_words]

for df in [train_df, val_df, test_df]:
    df['tokens'] = df['Text'].apply(preprocess_text)

In [6]:
# ========== VECTORIZATION ==========
def vectorize(tokens_list):
    vectors = []
    for tokens in tokens_list:
        vecs = [w2v_model[token] for token in tokens if token in w2v_model]
        vectors.append(np.mean(vecs, axis=0) if vecs else np.zeros(300))
    return np.array(vectors)

train_vectors = vectorize(train_df['tokens'])
val_vectors = vectorize(val_df['tokens'])
test_vectors = vectorize(test_df['tokens'])

In [7]:
# ========== DATASETS & DATALOADERS ==========
class VectorDataset(Dataset):
    def __init__(self, vectors, labels=None):
        self.vectors = vectors
        self.labels = labels

    def __len__(self):
        return len(self.vectors)

    def __getitem__(self, idx):
        if self.labels is not None:
            return torch.tensor(self.vectors[idx], dtype=torch.float32), \
                   torch.tensor(self.labels[idx], dtype=torch.float32)
        return torch.tensor(self.vectors[idx], dtype=torch.float32)

# Initialize datasets
train_dataset = VectorDataset(train_vectors, train_df['Label'].values)
val_dataset = VectorDataset(val_vectors, val_df['Label'].values)
test_dataset = VectorDataset(test_vectors)  # No labels

# Initialize DEFAULT loaders (before Optuna)
train_loader = DataLoader(train_dataset, batch_size=DEFAULT_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=DEFAULT_BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=DEFAULT_BATCH_SIZE)  # Fixed for submission

In [8]:
# ========== MODEL ARCHITECTURE ==========
class SentimentNet(nn.Module):
    def __init__(self, input_dim, hidden1=256, hidden2=128, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.BatchNorm1d(hidden1),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden1, hidden2),
            nn.BatchNorm1d(hidden2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden2, 1)
        )
        
    def forward(self, x):
        return self.net(x)

In [9]:
# ========== OPTUNA OPTIMIZATION ==========
def objective(trial):
    params = {
        'lr': trial.suggest_float('lr', 1e-5, 1e-3, log=True),
        'hidden1': trial.suggest_int('hidden1', 128, 512, step=64),
        'hidden2': trial.suggest_int('hidden2', 64, 256, step=64),
        'dropout': trial.suggest_float('dropout', 0.2, 0.5),
        'batch_size': trial.suggest_categorical('batch_size', [32, 64, 128]),
        'weight_decay': trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
    }

    # Temporary loaders with trial-specific batch size
    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=params['batch_size'])
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SentimentNet(300, params['hidden1'], params['hidden2'], params['dropout']).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    criterion = nn.BCEWithLogitsLoss()
    
    best_f1 = 0
    patience = 5
    for epoch in range(100):
        # Training
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device).unsqueeze(1)
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
        
        # Validation
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                outputs = model(x)
                preds = (torch.sigmoid(outputs) > 0.5).long().flatten()
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(y.long().cpu().numpy())
        
        val_f1 = f1_score(all_labels, all_preds, zero_division=0)
        trial.report(val_f1, epoch)
        
        if val_f1 > best_f1:
            best_f1 = val_f1
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break
    
    return best_f1

# Run Optuna study
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=30, timeout=10800)  # 3 hours

[I 2025-05-05 09:53:18,942] A new study created in memory with name: no-name-1e90f0c0-6a63-400f-bc27-1816267f8d94
[I 2025-05-05 09:56:27,839] Trial 0 finished with value: 0.7659787253386672 and parameters: {'lr': 6.717992572586051e-05, 'hidden1': 512, 'hidden2': 128, 'dropout': 0.291077865807791, 'batch_size': 32, 'weight_decay': 5.215535689749298e-05}. Best is trial 0 with value: 0.7659787253386672.
[I 2025-05-05 09:58:14,274] Trial 1 finished with value: 0.7493994826311899 and parameters: {'lr': 3.616638715545857e-05, 'hidden1': 128, 'hidden2': 128, 'dropout': 0.49112290944488207, 'batch_size': 128, 'weight_decay': 1.985763403794658e-05}. Best is trial 0 with value: 0.7659787253386672.
[I 2025-05-05 10:02:00,229] Trial 2 finished with value: 0.7662561067914402 and parameters: {'lr': 0.00017433981239563378, 'hidden1': 256, 'hidden2': 256, 'dropout': 0.481332733360663, 'batch_size': 32, 'weight_decay': 6.158409598055872e-06}. Best is trial 2 with value: 0.7662561067914402.
[I 2025-05-0

In [10]:
# ========== FINAL TRAINING ==========
best_params = study.best_params
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reinitialize with best parameters
final_model = SentimentNet(
    input_dim=300,
    hidden1=best_params['hidden1'],
    hidden2=best_params['hidden2'],
    dropout=best_params['dropout']
).to(device)

optimizer = torch.optim.AdamW(
    final_model.parameters(),
    lr=best_params['lr'],
    weight_decay=best_params['weight_decay']
)
criterion = nn.BCEWithLogitsLoss()

# FINAL DATA LOADERS (using best batch size for train/val)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=best_params['batch_size'])
# test_loader remains with DEFAULT_BATCH_SIZE (defined earlier)

# Training loop
best_f1 = 0
patience = 10
for epoch in range(200):
    final_model.train()
    for x, y in train_loader:
        x, y = x.to(device), y.to(device).unsqueeze(1)
        optimizer.zero_grad()
        outputs = final_model(x)
        loss = criterion(outputs, y)
        loss.backward()
        nn.utils.clip_grad_norm_(final_model.parameters(), 1.0)
        optimizer.step()
    
    # Validation
    final_model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            outputs = final_model(x)
            preds = (torch.sigmoid(outputs) > 0.5).long().flatten()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.long().cpu().numpy())
    
    val_f1 = f1_score(all_labels, all_preds, zero_division=0)
    print(f"Epoch {epoch+1}: Val F1 = {val_f1:.4f}")
    
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(final_model.state_dict(), 'best_model.pth')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

Epoch 1: Val F1 = 0.7246
Epoch 2: Val F1 = 0.7304
Epoch 3: Val F1 = 0.7619
Epoch 4: Val F1 = 0.7656
Epoch 5: Val F1 = 0.7550
Epoch 6: Val F1 = 0.7525
Epoch 7: Val F1 = 0.7606
Epoch 8: Val F1 = 0.7609
Epoch 9: Val F1 = 0.7559
Epoch 10: Val F1 = 0.7591
Epoch 11: Val F1 = 0.7493
Epoch 12: Val F1 = 0.7405
Epoch 13: Val F1 = 0.7596
Epoch 14: Val F1 = 0.7667
Epoch 15: Val F1 = 0.7446
Epoch 16: Val F1 = 0.7658
Epoch 17: Val F1 = 0.7650
Epoch 18: Val F1 = 0.7630
Epoch 19: Val F1 = 0.7696
Epoch 20: Val F1 = 0.7458
Epoch 21: Val F1 = 0.7633
Epoch 22: Val F1 = 0.7418
Epoch 23: Val F1 = 0.7612
Epoch 24: Val F1 = 0.7603
Epoch 25: Val F1 = 0.7618
Epoch 26: Val F1 = 0.7501
Epoch 27: Val F1 = 0.7619
Epoch 28: Val F1 = 0.7501
Epoch 29: Val F1 = 0.7446
Early stopping at epoch 29


In [11]:
# ========== FINAL EVALUATION & SUBMISSION ==========
final_model.load_state_dict(torch.load('best_model.pth'))
final_model.eval()

# 1. Validation Evaluation
val_preds, val_labels = [], []
with torch.no_grad():
    for x, y in val_loader:
        x, y = x.to(device), y.to(device)
        outputs = final_model(x)
        preds = (torch.sigmoid(outputs) > 0.5).long().flatten()
        val_preds.extend(preds.cpu().numpy())
        val_labels.extend(y.cpu().numpy())

print("\n=== Validation Metrics ===")
print(f"Accuracy: {accuracy_score(val_labels, val_preds):.4f}")
print(f"Precision: {precision_score(val_labels, val_preds):.4f}") 
print(f"Recall: {recall_score(val_labels, val_preds):.4f}")
print(f"F1 Score: {f1_score(val_labels, val_preds):.4f}")

# 2. Test Predictions (using pre-defined test_loader)
test_preds = []
with torch.no_grad():
    for x in test_loader:
        x = x.to(device)
        outputs = final_model(x)
        preds = (torch.sigmoid(outputs) > 0.5).long().flatten()
        test_preds.extend(preds.cpu().numpy())

# Create submission
submission_df = pd.DataFrame({
    "ID": test_df["ID"].values,  # Ensures proper alignment
    "Label": test_preds
})
submission_df.to_csv("submission.csv", index=False)
print("\n=== Submission Created ===")
print(f"Samples: {len(test_preds)}")
print(f"Class Distribution:\n{submission_df['Label'].value_counts()}")

# Save final model
torch.save(final_model.state_dict(), 'sentiment_model.pth')

  final_model.load_state_dict(torch.load('best_model.pth'))



=== Validation Metrics ===
Accuracy: 0.7584
Precision: 0.7356
Recall: 0.8069
F1 Score: 0.7696

=== Submission Created ===
Samples: 21199
Class Distribution:
Label
1    11763
0     9436
Name: count, dtype: int64
