<a href="https://colab.research.google.com/github/22f3000982/DL_genai_project/blob/main/notebooks/model1_scratch_tfidf_logreg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install wandb --quiet

import re
import string

import numpy as np
import pandas as pd
import wandb
from scipy.special import expit  # sigmoid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

# ---- W&B login (for Colab) ----
# Authenticates this session with Weights & Biases; when no credentials are
# stored yet it prompts for an API key interactively.
wandb.login()


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m22f3000982[0m ([33m22f3000982-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [14]:
# Identifiers under which this experiment is tracked in Weights & Biases.
WANDB_PROJECT = "2025-sep-dl-genai-project"
RUN_NAME = "model1_scratch_tfidf_logreg"

# Hyperparameters recorded together with the run.
run_hparams = dict(
    model="tfidf_logreg_scratch",
    test_size=0.10,
    max_features=30000,
    ngram_range=(1, 2),
    C=4.0,
    random_state=42,
)

# Start the tracked run.
wandb.init(project=WANDB_PROJECT, name=RUN_NAME, config=run_hparams)

# Short alias used by later cells to read the hyperparameters back.
config = wandb.config


In [13]:
# Names of the emotion columns that serve as the prediction targets.
labels = ["anger", "fear", "joy", "sadness", "surprise"]

# Read the training CSV from the Colab filesystem.
train_csv_path = "/content/train (3).csv"
train_df = pd.read_csv(train_csv_path)

# Quick sanity check: first rows and overall shape.
preview = train_df.head()
print(preview)
print("Train shape:", train_df.shape)


   id                                               text  anger  fear  joy  \
0   0  the dentist that did the work apparently did a...      1     0    0   
1   1  i'm gonna absolutely ~~suck~~ be terrible duri...      0     1    0   
2   2  bridge: so leave me drowning calling houston, ...      0     1    0   
3   3  after that mess i went to see my now ex-girlfr...      1     1    0   
4   4  as he stumbled i ran off, afraid it might some...      0     1    0   

   sadness  surprise                    emotions  
0        1         0         ['anger' 'sadness']  
1        1         0          ['fear' 'sadness']  
2        1         0          ['fear' 'sadness']  
3        1         0  ['anger' 'fear' 'sadness']  
4        0         0                    ['fear']  
Train shape: (6827, 8)


In [15]:
def clean_text(text):
    """Normalize one raw text sample before vectorization.

    Steps, in order: lowercase, replace URLs with a space, delete ASCII
    punctuation, collapse runs of whitespace and trim both ends.

    Args:
        text: The raw sample. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN) are treated as empty
            text.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    # Missing cells would otherwise be stringified into the literal tokens
    # "none" / "nan". NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Remove URLs: anything starting with "http", plus scheme-less "www."
    # links. The word boundary keeps words such as "awww." untouched.
    text = re.sub(r"(?:http\S+|\bwww\.\S+)", " ", text)
    # Remove punctuation (characters are deleted, not replaced by spaces).
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Collapse extra whitespace.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Store a normalized copy of every raw sample next to the original column.
train_df["clean_text"] = train_df["text"].map(clean_text)

# Show raw and cleaned text side by side for a quick visual check.
train_df.loc[:, ["text", "clean_text"]].head()


Unnamed: 0,text,clean_text
0,the dentist that did the work apparently did a...,the dentist that did the work apparently did a...
1,i'm gonna absolutely ~~suck~~ be terrible duri...,im gonna absolutely suck be terrible during my...
2,"bridge: so leave me drowning calling houston, ...",bridge so leave me drowning calling houston an...
3,after that mess i went to see my now ex-girlfr...,after that mess i went to see my now exgirlfri...
4,"as he stumbled i ran off, afraid it might some...",as he stumbled i ran off afraid it might someh...


In [16]:
# Inputs are the cleaned texts; targets are the label columns as a 2-D array.
features = train_df["clean_text"]
targets = train_df[labels].values

# Hold out a validation slice; its size and the seed come from the run config.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    targets,
    test_size=config.test_size,
    random_state=config.random_state,
)

# Report how many samples landed in each split.
for split_name, split in (("Train", X_train), ("Val", X_val)):
    print(f"{split_name} size:", len(split))


Train size: 6144
Val size: 683


In [17]:
# TF-IDF features.
# ngram_range is read from the run config so the value logged to W&B is the
# one actually used. The tuple() cast is defensive: the config may hand the
# value back as a list, while scikit-learn expects a tuple.
vectorizer = TfidfVectorizer(
    max_features=config.max_features,
    ngram_range=tuple(config.ngram_range),
)

# Fit vocabulary and IDF weights on the training split only, then reuse them
# for the validation split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# One-vs-rest logistic regression (multi-label): one binary classifier per
# label column.
# n_jobs is set on the wrapper so the per-label fits can run in parallel; on
# the inner estimator it has nothing to parallelize because every
# sub-problem is binary.
clf = OneVsRestClassifier(
    LogisticRegression(
        C=config.C,
        max_iter=250,
        solver="lbfgs",
    ),
    n_jobs=-1,
)

clf.fit(X_train_tfidf, y_train)
print("Model trained!")


Model trained!


In [18]:
# decision_function -> raw per-label scores, sigmoid -> probabilities
# (expit is imported in the notebook's top import cell).
val_scores = clf.decision_function(X_val_tfidf)
val_probs = expit(val_scores)

# A label is predicted when its probability reaches the threshold.
THRESH = 0.5
val_pred = (val_probs >= THRESH).astype(int)

f1_macro = f1_score(y_val, val_pred, average="macro")
f1_micro = f1_score(y_val, val_pred, average="micro")
# On multi-label targets accuracy_score is subset accuracy: a sample counts
# as correct only when every label matches.
accuracy = accuracy_score(y_val, val_pred)
# One F1 value per label column, in the same order as `labels`.
per_label_f1 = f1_score(y_val, val_pred, average=None)

print(f"Threshold: {THRESH}")
print("Macro F1:", f1_macro)
print("Micro F1:", f1_micro)
print("Accuracy:", accuracy)

print("\nPer-label F1:")
for lab, f1_lab in zip(labels, per_label_f1):
    print(f"{lab}: {f1_lab:.4f}")

# ---- W&B logging ----
metrics = {
    "val_macro_f1": f1_macro,
    "val_micro_f1": f1_micro,
    "val_accuracy": accuracy,
    "threshold": THRESH,
}
# Also record the per-label breakdown so weak labels stay visible in the
# run history instead of only in the cell output.
metrics.update(
    {f"val_f1_{lab}": float(f1_lab) for lab, f1_lab in zip(labels, per_label_f1)}
)

# The step is set explicitly: this run logs a single evaluation point.
wandb.log(metrics, step=1)

wandb.finish()


Threshold: 0.5
Macro F1: 0.6740861805795403
Micro F1: 0.7586206896551724
Accuracy: 0.5344070278184481

Per-label F1:
anger: 0.3800
fear: 0.8585
joy: 0.6846
sadness: 0.7154
surprise: 0.7318


0,1
threshold,▁
val_accuracy,▁
val_macro_f1,▁
val_micro_f1,▁

0,1
threshold,0.5
val_accuracy,0.53441
val_macro_f1,0.67409
val_micro_f1,0.75862
