<a href="https://colab.research.google.com/github/GGoliathan/MLGroupProject/blob/main/FinalMLProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [128]:
import os

# NEW Kaggle API authentication (ONLY token needed)
os.environ["KAGGLE_API_TOKEN"] = "KGAT_b0b3bce7e79f0e1028621d7fcb2d7546"

# Remove old credentials to avoid conflicts
!rm -f ~/.kaggle/kaggle.json
!rm -f ~/.config/kaggle/kaggle.json

print("Kaggle API Token set. Ready.")


Kaggle API Token set. Ready.


In [129]:
!kaggle competitions list


ref                                                                                 deadline             category                reward  teamCount  userHasEntered  
----------------------------------------------------------------------------------  -------------------  ---------------  -------------  ---------  --------------  
https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-3       2026-04-15 23:59:00  Featured         2,207,152 Usd        435           False  
https://www.kaggle.com/competitions/hull-tactical-market-prediction                 2025-12-15 23:59:00  Featured           100,000 Usd       2911           False  
https://www.kaggle.com/competitions/vesuvius-challenge-surface-detection            2026-02-13 23:59:00  Research           100,000 Usd        220           False  
https://www.kaggle.com/competitions/google-tunix-hackathon                          2026-01-12 23:59:00  Featured           100,000 Usd         65           False  
https://ww

In [130]:
!kaggle competitions download -c nlp-getting-started -p /content

import zipfile

with zipfile.ZipFile('/content/nlp-getting-started.zip', 'r') as z:
    z.extractall('/content')

print("Dataset downloaded & extracted.")



nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)
Dataset downloaded & extracted.


In [131]:
import pandas as pd

train_df = pd.read_csv('/content/train.csv')
test_df  = pd.read_csv('/content/test.csv')

train_df.head()


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [132]:
from sklearn.model_selection import train_test_split

X = train_df["text"]
y = train_df["target"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

tfidf = TfidfVectorizer(max_features=5000, stop_words="english")

X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf   = tfidf.transform(X_val)

logreg = LogisticRegression(max_iter=1000, solver="liblinear")
logreg.fit(X_train_tfidf, y_train)

lr_pred = logreg.predict(X_val_tfidf)

accuracy_lr  = accuracy_score(y_val, lr_pred)
precision_lr = precision_score(y_val, lr_pred)
recall_lr    = recall_score(y_val, lr_pred)
f1_lr        = f1_score(y_val, lr_pred)

print("=== Logistic Regression (TF-IDF) ===")
print("Accuracy:", accuracy_lr)
print("Precision:", precision_lr)
print("Recall:", recall_lr)
print("F1 Score:", f1_lr)


=== Logistic Regression (TF-IDF) ===
Accuracy: 0.814182534471438
Precision: 0.8378870673952641
Recall: 0.7033639143730887
F1 Score: 0.7647547797173733


In [134]:
import torch
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def encode_batch(texts):
    return tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )

train_enc = encode_batch(X_train)
val_enc   = encode_batch(X_val)

train_labels = torch.tensor(y_train.values)
val_labels   = torch.tensor(y_val.values)


In [135]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(
    train_enc["input_ids"], train_enc["attention_mask"], train_labels
)

val_dataset = TensorDataset(
    val_enc["input_ids"], val_enc["attention_mask"], val_labels
)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32)


In [136]:
config = {
    "epochs": 50,
    "train_batch_size": 16,
    "eval_batch_size": 32,
    "learning_rate": 1e-5,
    "weight_decay": 0.01,
    "warmup_ratio": 0.20,          # increased warmup
    "gradient_accumulation_steps": 4,
    "max_grad_norm": 1.0,
    "use_amp": True,
    "early_stopping_patience": 14, # a little longer
    "max_length": 160,
    "label_smoothing": 0.05        # reduced from 0.10
}


In [137]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import torch
from torch.nn.functional import cross_entropy
from sklearn.metrics import f1_score, accuracy_score

# Load training config
epochs = config["epochs"]
accum_steps = config["gradient_accumulation_steps"]
lr = config["learning_rate"]
weight_decay = config["weight_decay"]
warmup_ratio = config["warmup_ratio"]
max_grad_norm = config["max_grad_norm"]
use_amp = config["use_amp"]
patience = config["early_stopping_patience"]
label_smoothing = config.get("label_smoothing", 0.0)  # NEW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Custom loss function with label smoothing
def compute_loss(logits, labels):
    return cross_entropy(
        logits,
        labels,
        label_smoothing=label_smoothing
    )

# Optimizer + scheduler
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

total_steps = int(len(train_loader) // accum_steps * epochs)
warmup_steps = int(total_steps * warmup_ratio)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# AMP scaler
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# Early stopping
best_val_f1 = 0.0
best_epoch = 0
no_improve = 0

print(" Starting DistilBERT fine-tuning...")

# ---------------------- TRAINING LOOP -----------------------------

for epoch in range(1, epochs + 1):
    model.train()
    optimizer.zero_grad()

    train_loop = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs} Training")

    for step, batch in enumerate(train_loop, start=1):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(
                input_ids,
                attention_mask=attention_mask
            )
            logits = outputs.logits

            # custom loss with label smoothing
            loss = compute_loss(logits, labels) / accum_steps

        scaler.scale(loss).backward()

        if step % accum_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

        train_loop.set_postfix({"loss": loss.item() * accum_steps})

    # ---------------------- VALIDATION -----------------------------

    model.eval()
    preds, trues = [], []

    val_loop = tqdm(val_loader, desc="Evaluating")

    with torch.no_grad():
        for batch in val_loop:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            logits = model(input_ids, attention_mask=attention_mask).logits

            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            trues.extend(labels.cpu().numpy())

    val_f1 = f1_score(trues, preds, average="binary", zero_division=0)
    val_acc = accuracy_score(trues, preds)

    print(f"\nEpoch {epoch} — Val F1: {val_f1:.4f}, Val Acc: {val_acc:.4f}\n")

    # ---------------------- EARLY STOPPING -----------------------------

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_epoch = epoch
        no_improve = 0
        torch.save(model.state_dict(), "best_model.pt")
        print("Saved new best model.")
    else:
        no_improve += 1
        print(f"No improvement ({no_improve}/{patience})")
        if no_improve >= patience:
            print("Early stopping triggered.")
            break

print(f"Training complete! Best F1 = {best_val_f1:.4f} at epoch {best_epoch}")



 Starting DistilBERT fine-tuning...


  scaler = torch.cuda.amp.GradScaler(enabled=use_amp)


Epoch 1/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 1 — Val F1: 0.7786, Val Acc: 0.8043

Saved new best model.


Epoch 2/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 2 — Val F1: 0.7920, Val Acc: 0.8221

Saved new best model.


Epoch 3/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 3 — Val F1: 0.7916, Val Acc: 0.8247

No improvement (1/16)


Epoch 4/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 4 — Val F1: 0.7881, Val Acc: 0.8175

No improvement (2/16)


Epoch 5/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 5 — Val F1: 0.7875, Val Acc: 0.8175

No improvement (3/16)


Epoch 6/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 6 — Val F1: 0.7890, Val Acc: 0.8188

No improvement (4/16)


Epoch 7/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 7 — Val F1: 0.7932, Val Acc: 0.8234

Saved new best model.


Epoch 8/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 8 — Val F1: 0.7848, Val Acc: 0.8181

No improvement (1/16)


Epoch 9/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 9 — Val F1: 0.7861, Val Acc: 0.8181

No improvement (2/16)


Epoch 10/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 10 — Val F1: 0.7789, Val Acc: 0.8043

No improvement (3/16)


Epoch 11/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 11 — Val F1: 0.7794, Val Acc: 0.8063

No improvement (4/16)


Epoch 12/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 12 — Val F1: 0.7813, Val Acc: 0.8096

No improvement (5/16)


Epoch 13/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 13 — Val F1: 0.7947, Val Acc: 0.8260

Saved new best model.


Epoch 14/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 14 — Val F1: 0.7863, Val Acc: 0.8116

No improvement (1/16)


Epoch 15/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 15 — Val F1: 0.7894, Val Acc: 0.8181

No improvement (2/16)


Epoch 16/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 16 — Val F1: 0.7864, Val Acc: 0.8181

No improvement (3/16)


Epoch 17/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 17 — Val F1: 0.7918, Val Acc: 0.8207

No improvement (4/16)


Epoch 18/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 18 — Val F1: 0.7864, Val Acc: 0.8148

No improvement (5/16)


Epoch 19/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 19 — Val F1: 0.7812, Val Acc: 0.8201

No improvement (6/16)


Epoch 20/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 20 — Val F1: 0.7886, Val Acc: 0.8240

No improvement (7/16)


Epoch 21/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 21 — Val F1: 0.7894, Val Acc: 0.8221

No improvement (8/16)


Epoch 22/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 22 — Val F1: 0.7903, Val Acc: 0.8240

No improvement (9/16)


Epoch 23/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 23 — Val F1: 0.7908, Val Acc: 0.8207

No improvement (10/16)


Epoch 24/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 24 — Val F1: 0.7800, Val Acc: 0.8004

No improvement (11/16)


Epoch 25/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 25 — Val F1: 0.7894, Val Acc: 0.8175

No improvement (12/16)


Epoch 26/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 26 — Val F1: 0.7871, Val Acc: 0.8142

No improvement (13/16)


Epoch 27/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 27 — Val F1: 0.7890, Val Acc: 0.8188

No improvement (14/16)


Epoch 28/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 28 — Val F1: 0.7894, Val Acc: 0.8181

No improvement (15/16)


Epoch 29/60 Training:   0%|          | 0/191 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]


Epoch 29 — Val F1: 0.7893, Val Acc: 0.8188

No improvement (16/16)
Early stopping triggered.
Training complete! Best F1 = 0.7947 at epoch 13


In [138]:
model.eval()
bert_preds = []

with torch.no_grad():
    for batch in val_loader:
        ids, mask, _ = [b.to(device) for b in batch]
        logits = model(ids, attention_mask=mask).logits
        bert_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())

accuracy_bert  = accuracy_score(y_val, bert_preds)
precision_bert = precision_score(y_val, bert_preds)
recall_bert    = recall_score(y_val, bert_preds)
f1_bert        = f1_score(y_val, bert_preds)

print("=== DistilBERT Results ===")
print("Accuracy:", accuracy_bert)
print("Precision:", precision_bert)
print("Recall:", recall_bert)
print("F1 Score:", f1_bert)


=== DistilBERT Results ===
Accuracy: 0.8187787261982928
Precision: 0.788109756097561
Recall: 0.790519877675841
F1 Score: 0.7893129770992366


In [139]:
results = pd.DataFrame({
    "Model": ["Logistic Regression", "DistilBERT"],
    "Accuracy": [accuracy_lr, accuracy_bert],
    "Precision": [precision_lr, precision_bert],
    "Recall": [recall_lr, recall_bert],
    "F1 Score": [f1_lr, f1_bert]
})

results


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.814183,0.837887,0.703364,0.764755
1,DistilBERT,0.818779,0.78811,0.79052,0.789313
