In [None]:
import numpy as np
import pandas as pd
import re
import string
import zipfile
import os
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, f1_score, hamming_loss, jaccard_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7b715f733ef0>

In [None]:
df = pd.read_csv("movies.csv")

allowed_cols = [
    "overview",
    "tagline",
    "keywords",
    "genres",
    "vote_average"
]

df = df[allowed_cols].dropna()
df.rename(columns={"vote_average": "voting_average"}, inplace=True)

df.head()

Unnamed: 0,overview,tagline,keywords,genres,voting_average
0,"In the 22nd century, a paraplegic Marine is di...",Enter the World of Pandora.,culture clash future space war space colony so...,Action Adventure Fantasy Science Fiction,7.2
1,"Captain Barbossa, long believed to be dead, ha...","At the end of the world, the adventure begins.",ocean drug abuse exotic island east india trad...,Adventure Fantasy Action,6.9
2,A cryptic message from Bond’s past sends him o...,A Plan No One Escapes,spy based on novel secret agent sequel mi6,Action Adventure Crime,6.3
3,Following the death of District Attorney Harve...,The Legend Ends,dc comics crime fighter terrorist secret ident...,Action Crime Drama Thriller,7.6
4,"John Carter is a war-weary, former military ca...","Lost in our world, found in another.",based on novel mars medallion space travel pri...,Action Adventure Science Fiction,6.1


## Preprocessing

In [None]:
import string
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.strip()

    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)

    lemmatized_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged
    ]

    return " ".join(lemmatized_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
for col in ["overview", "tagline", "keywords"]:
    df[col] = df[col].astype(str).apply(clean_text)

In [None]:
df["genres"] = df["genres"].astype(str).apply(lambda x: x.split())

mlb = MultiLabelBinarizer()
Y_genre = mlb.fit_transform(df["genres"])

print("Number of genres:", len(mlb.classes_))
print("Genres:", mlb.classes_)

Number of genres: 22
Genres: ['Action' 'Adventure' 'Animation' 'Comedy' 'Crime' 'Documentary' 'Drama'
 'Family' 'Fantasy' 'Fiction' 'Foreign' 'History' 'Horror' 'Movie' 'Music'
 'Mystery' 'Romance' 'Science' 'TV' 'Thriller' 'War' 'Western']


In [None]:
print(df["genres"].iloc[0])
print(Y_genre[0])

['Action', 'Adventure', 'Fantasy', 'Science', 'Fiction']
[1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0]


In [None]:
df.head()

Unnamed: 0,overview,tagline,keywords,genres,voting_average
0,in the nd century a paraplegic marine be dispa...,enter the world of pandora,culture clash future space war space colony so...,"[Action, Adventure, Fantasy, Science, Fiction]",7.2
1,captain barbossa long believe to be dead have ...,at the end of the world the adventure begin,ocean drug abuse exotic island east india trad...,"[Adventure, Fantasy, Action]",6.9
2,a cryptic message from bond ’ s past sends him...,a plan no one escape,spy base on novel secret agent sequel mi,"[Action, Adventure, Crime]",6.3
3,follow the death of district attorney harvey d...,the legend end,dc comic crime fighter terrorist secret identi...,"[Action, Crime, Drama, Thriller]",7.6
4,john carter be a warweary former military capt...,lose in our world find in another,base on novel mar medallion space travel princess,"[Action, Adventure, Science, Fiction]",6.1


In [None]:
X_train, X_temp, y_train, y_temp, g_train, g_temp = train_test_split(
    df, df["voting_average"], Y_genre,
    test_size=0.30,
    random_state=SEED
)

X_val, X_test, y_val, y_test, g_val, g_test = train_test_split(
    X_temp, y_temp, g_temp,
    test_size=0.50,
    random_state=SEED
)

## Loading GlovVe

In [None]:
EMBED_DIM = 100

zip_path = f"glove.2024.wikigiga.{EMBED_DIM}d.zip"

with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall("glove")

glove_file = [f for f in os.listdir("glove") if f.endswith(".txt")][0]

embeddings = {}
with open(os.path.join("glove", glove_file), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            continue

        word = values[0]
        try:
            vector = np.asarray(values[1:], dtype="float32")
            if len(vector) == EMBED_DIM:
                embeddings[word] = vector
        except ValueError:
            continue

print("Loaded GloVe vectors:", len(embeddings))

Loaded GloVe vectors: 1287614


## TF-IDF weighting


In [None]:
def build_doc_embeddings(texts, embeddings, embed_dim):
    tfidf = TfidfVectorizer(min_df=3)
    tfidf_matrix = tfidf.fit_transform(texts)
    vocab = tfidf.vocabulary_

    covered = sum(1 for w in vocab if w in embeddings)
    coverage = covered / len(vocab)
    print(f"Embedding coverage: {coverage:.2%}")

    doc_embeddings = []

    for i in range(tfidf_matrix.shape[0]):
        row = tfidf_matrix[i]
        weighted_vec = np.zeros(embed_dim)
        weight_sum = 0

        for word, idx in vocab.items():
            if word in embeddings:
                weight = row[0, idx]
                weighted_vec += weight * embeddings[word]
                weight_sum += weight

        if weight_sum != 0:
            weighted_vec /= weight_sum

        doc_embeddings.append(weighted_vec)

    return np.array(doc_embeddings)

In [None]:
print("\n--- Generating TF-IDF Weighted GloVe Embeddings ---\n")

X_train_tfidf_overview = build_doc_embeddings(X_train["overview"], embeddings, EMBED_DIM)
X_val_tfidf_overview = build_doc_embeddings(X_val["overview"], embeddings, EMBED_DIM)
X_test_tfidf_overview = build_doc_embeddings(X_test["overview"], embeddings, EMBED_DIM)

X_train_tfidf_tagline = build_doc_embeddings(X_train["tagline"], embeddings, EMBED_DIM)
X_val_tfidf_tagline = build_doc_embeddings(X_val["tagline"], embeddings, EMBED_DIM)
X_test_tfidf_tagline = build_doc_embeddings(X_test["tagline"], embeddings, EMBED_DIM)

X_train_tfidf_keywords = build_doc_embeddings(X_train["keywords"], embeddings, EMBED_DIM)
X_val_tfidf_keywords = build_doc_embeddings(X_val["keywords"], embeddings, EMBED_DIM)
X_test_tfidf_keywords = build_doc_embeddings(X_test["keywords"], embeddings, EMBED_DIM)

X_train_tfidf = np.concatenate([X_train_tfidf_overview, X_train_tfidf_tagline, X_train_tfidf_keywords], axis=1)
X_val_tfidf = np.concatenate([X_val_tfidf_overview, X_val_tfidf_tagline, X_val_tfidf_keywords], axis=1)
X_test_tfidf = np.concatenate([X_test_tfidf_overview, X_test_tfidf_tagline, X_test_tfidf_keywords], axis=1)

print("TF-IDF weighted GloVe embeddings generated successfully.")
print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
print(f"X_val_tfidf shape: {X_val_tfidf.shape}")
print(f"X_test_tfidf shape: {X_test_tfidf.shape}")


--- Generating TF-IDF Weighted GloVe Embeddings ---

Embedding coverage: 99.05%
Embedding coverage: 99.65%
Embedding coverage: 99.64%
Embedding coverage: 99.39%
Embedding coverage: 98.96%
Embedding coverage: 98.63%
Embedding coverage: 98.83%
Embedding coverage: 99.10%
Embedding coverage: 99.07%
TF-IDF weighted GloVe embeddings generated successfully.
X_train_tfidf shape: (2631, 300)
X_val_tfidf shape: (564, 300)
X_test_tfidf shape: (564, 300)


In [None]:
def build_doc_embeddings_glove_only(texts, embeddings, embed_dim):
    doc_embeddings = []

    for text in texts:
        words = text.split()
        vectors = [embeddings[w] for w in words if w in embeddings]

        if len(vectors) == 0:
            doc_embeddings.append(np.zeros(embed_dim))
        else:
            doc_embeddings.append(np.mean(vectors, axis=0))

    return np.array(doc_embeddings)

In [None]:
print("\n--- Generating GloVe Only Embeddings ---\n")

X_train_glove_overview = build_doc_embeddings_glove_only(X_train["overview"], embeddings, EMBED_DIM)
X_val_glove_overview = build_doc_embeddings_glove_only(X_val["overview"], embeddings, EMBED_DIM)
X_test_glove_overview = build_doc_embeddings_glove_only(X_test["overview"], embeddings, EMBED_DIM)

X_train_glove_tagline = build_doc_embeddings_glove_only(X_train["tagline"], embeddings, EMBED_DIM)
X_val_glove_tagline = build_doc_embeddings_glove_only(X_val["tagline"], embeddings, EMBED_DIM)
X_test_glove_tagline = build_doc_embeddings_glove_only(X_test["tagline"], embeddings, EMBED_DIM)

X_train_glove_keywords = build_doc_embeddings_glove_only(X_train["keywords"], embeddings, EMBED_DIM)
X_val_glove_keywords = build_doc_embeddings_glove_only(X_val["keywords"], embeddings, EMBED_DIM)
X_test_glove_keywords = build_doc_embeddings_glove_only(X_test["keywords"], embeddings, EMBED_DIM)

X_train_glove = np.concatenate([X_train_glove_overview, X_train_glove_tagline, X_train_glove_keywords], axis=1)
X_val_glove = np.concatenate([X_val_glove_overview, X_val_glove_tagline, X_val_glove_keywords], axis=1)
X_test_glove = np.concatenate([X_test_glove_overview, X_test_glove_tagline, X_test_glove_keywords], axis=1)

print("GloVe-only embeddings generated successfully.")
print(f"X_train_glove shape: {X_train_glove.shape}")
print(f"X_val_glove shape: {X_val_glove.shape}")
print(f"X_test_glove shape: {X_test_glove.shape}")


--- Generating GloVe Only Embeddings ---

GloVe-only embeddings generated successfully.
X_train_glove shape: (2631, 300)
X_val_glove shape: (564, 300)
X_test_glove shape: (564, 300)


## Regressor

In [83]:
class RatingRegressor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze()

In [84]:
def train_regressor(X_train, y_train, X_test, y_test):
    model = RatingRegressor(X_train.shape[1])
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train.values, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(y_test.values, dtype=torch.float32)

    for epoch in range(30):
        optimizer.zero_grad()
        preds = model(X_train)
        loss = loss_fn(preds, y_train)
        loss.backward()
        optimizer.step()

    with torch.no_grad():
        preds = model(X_test)
        mse = mean_squared_error(y_test.numpy(), preds.numpy())
        rmse = np.sqrt(mse)

    return mse, rmse

In [85]:
print("\n--- Calculating Regression Baseline (Global Mean Rating) ---\n")

global_mean_rating = y_train.mean()

baseline_preds_val = np.full_like(y_val, global_mean_rating)

baseline_mse_val = mean_squared_error(y_val, baseline_preds_val)
baseline_rmse_val = np.sqrt(baseline_mse_val)

print("Regression Baseline Results (Global Mean Rating on Validation Set):")
print(f"  Global Mean Rating (from training data): {global_mean_rating:.4f}")
print(f"  Baseline MSE (Validation): {baseline_mse_val:.4f}")
print(f"  Baseline RMSE (Validation): {baseline_rmse_val:.4f}")



--- Calculating Regression Baseline (Global Mean Rating) ---

Regression Baseline Results (Global Mean Rating on Validation Set):
  Global Mean Rating (from training data): 6.2339
  Baseline MSE (Validation): 0.7780
  Baseline RMSE (Validation): 0.8820


### Glove only


In [89]:

mse_ov, rmse_ov = train_regressor(
    X_train_glove_overview, y_train, X_test_glove_overview, y_test
)

print("Overview only:")
print(f"MSE: {mse_ov:.4f}, RMSE: {rmse_ov:.4f}")

mse_tg, rmse_tg = train_regressor(
    X_train_glove_tagline, y_train, X_test_glove_tagline, y_test
)

print("Tagline only:")
print(f"MSE: {mse_tg:.4f}, RMSE: {rmse_tg:.4f}")


mse_kw, rmse_kw = train_regressor(
    X_train_glove_keywords, y_train, X_test_glove_keywords, y_test
)

print("Keywords only:")
print(f"MSE: {mse_kw:.4f}, RMSE: {rmse_kw:.4f}")

Overview only:
MSE: 5.6593, RMSE: 2.3789
Tagline only:
MSE: 3.2568, RMSE: 1.8047
Keywords only:
MSE: 7.5196, RMSE: 2.7422


### With TFIDG weighting


In [90]:
print("\n--- Regression: Overview (TF-IDF + GloVe) ---\n")

mse_ov, rmse_ov = train_regressor(
    X_train_tfidf_overview, y_train,
    X_test_tfidf_overview, y_test
)

print(f"Overview -> MSE: {mse_ov:.4f}, RMSE: {rmse_ov:.4f}")

print("\n--- Regression: Tagline (TF-IDF + GloVe) ---\n")

mse_tg, rmse_tg = train_regressor(
    X_train_tfidf_tagline, y_train,
    X_test_tfidf_tagline, y_test
)

print(f"Tagline -> MSE: {mse_tg:.4f}, RMSE: {rmse_tg:.4f}")

print("\n--- Regression: Keywords (TF-IDF + GloVe) ---\n")

mse_kw, rmse_kw = train_regressor(
    X_train_tfidf_keywords, y_train,
    X_test_tfidf_keywords, y_test
)

print(f"Keywords -> MSE: {mse_kw:.4f}, RMSE: {rmse_kw:.4f}")


--- Regression: Overview (TF-IDF + GloVe) ---

Overview -> MSE: 5.9614, RMSE: 2.4416

--- Regression: Tagline (TF-IDF + GloVe) ---

Tagline -> MSE: 3.0571, RMSE: 1.7484

--- Regression: Keywords (TF-IDF + GloVe) ---

Keywords -> MSE: 6.7466, RMSE: 2.5974


## Classifier

In [None]:
class GenreClassifier(nn.Module):
    def __init__(self, dim, num_labels):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, 256),
            nn.ReLU(),
            nn.Linear(256, num_labels)
        )

    def forward(self, x):
        return self.net(x)

In [None]:
def train_genre_classifier(X_tr, y_tr, X_te, y_te):
    model = GenreClassifier(X_tr.shape[1], y_tr.shape[1])
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.BCEWithLogitsLoss()

    X_tr = torch.tensor(X_tr, dtype=torch.float32)
    y_tr = torch.tensor(y_tr, dtype=torch.float32)
    X_te = torch.tensor(X_te, dtype=torch.float32)

    print("\nStarting Classifier Training...")
    for epoch in range(30):
        optimizer.zero_grad()
        logits = model(X_tr)
        loss = loss_fn(logits, y_tr)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"  Epoch {epoch+1:2d}/30, Loss: {loss.item():.4f}")

    preds = torch.sigmoid(model(X_te)).detach().numpy() > 0.5

    micro = f1_score(y_te, preds, average="micro")
    macro = f1_score(y_te, preds, average="macro")
    hamming = hamming_loss(y_te, preds)
    jaccard = jaccard_score(y_te, preds, average="samples")

    return micro, macro, hamming, jaccard

In [92]:
print("\n--- Genre Classification: Overview (GloVe Only) ---\n")

micro_ov_g, macro_ov_g, hamming_ov_g, jaccard_ov_g = train_genre_classifier(
    X_train_glove_overview, g_train,
    X_test_glove_overview, g_test
)

print("Overview Results (GloVe Only):")
print(f"  F1 Micro    : {micro_ov_g:.4f}")
print(f"  F1 Macro    : {macro_ov_g:.4f}")
print(f"  Hamming Loss: {hamming_ov_g:.4f}")
print(f"  Jaccard     : {jaccard_ov_g:.4f}")

print("\n--- Genre Classification: Tagline (GloVe Only) ---\n")

micro_tg_g, macro_tg_g, hamming_tg_g, jaccard_tg_g = train_genre_classifier(
    X_train_glove_tagline, g_train,
    X_test_glove_tagline, g_test
)

print("Tagline Results (GloVe Only):")
print(f"  F1 Micro    : {micro_tg_g:.4f}")
print(f"  F1 Macro    : {macro_tg_g:.4f}")
print(f"  Hamming Loss: {hamming_tg_g:.4f}")
print(f"  Jaccard     : {jaccard_tg_g:.4f}")

print("\n--- Genre Classification: Keywords (GloVe Only) ---\n")

micro_kw_g, macro_kw_g, hamming_kw_g, jaccard_kw_g = train_genre_classifier(
    X_train_glove_keywords, g_train,
    X_test_glove_keywords, g_test
)

print("Keywords Results (GloVe Only):")
print(f"  F1 Micro    : {micro_kw_g:.4f}")
print(f"  F1 Macro    : {macro_kw_g:.4f}")
print(f"  Hamming Loss: {hamming_kw_g:.4f}")
print(f"  Jaccard     : {jaccard_kw_g:.4f}")


--- Genre Classification: Overview (GloVe Only) ---


Starting Classifier Training...
  Epoch  1/30, Loss: 0.6978
  Epoch  5/30, Loss: 0.6387
  Epoch 10/30, Loss: 0.5642
  Epoch 15/30, Loss: 0.4840
  Epoch 20/30, Loss: 0.4091
  Epoch 25/30, Loss: 0.3568
  Epoch 30/30, Loss: 0.3314
Overview Results (GloVe Only):
  F1 Micro    : 0.0026
  F1 Macro    : 0.0007
  Hamming Loss: 0.1256
  Jaccard     : 0.0027

--- Genre Classification: Tagline (GloVe Only) ---


Starting Classifier Training...
  Epoch  1/30, Loss: 0.7025
  Epoch  5/30, Loss: 0.6334
  Epoch 10/30, Loss: 0.5428


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  Epoch 15/30, Loss: 0.4493
  Epoch 20/30, Loss: 0.3751
  Epoch 25/30, Loss: 0.3382
  Epoch 30/30, Loss: 0.3273
Tagline Results (GloVe Only):
  F1 Micro    : 0.0728
  F1 Macro    : 0.0162
  Hamming Loss: 0.1231
  Jaccard     : 0.0552

--- Genre Classification: Keywords (GloVe Only) ---


Starting Classifier Training...
  Epoch  1/30, Loss: 0.7028
  Epoch  5/30, Loss: 0.6387
  Epoch 10/30, Loss: 0.5566


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  Epoch 15/30, Loss: 0.4703
  Epoch 20/30, Loss: 0.3944
  Epoch 25/30, Loss: 0.3467
  Epoch 30/30, Loss: 0.3262
Keywords Results (GloVe Only):
  F1 Micro    : 0.1511
  F1 Macro    : 0.0290
  Hamming Loss: 0.1205
  Jaccard     : 0.1156


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [91]:
print("\n--- Genre Classification: Overview (TF-IDF + GloVe) ---\n")

micro_ov, macro_ov, hamming_ov, jaccard_ov = train_genre_classifier(
    X_train_tfidf_overview, g_train,
    X_test_tfidf_overview, g_test
)

print("Overview Results:")
print(f"  F1 Micro    : {micro_ov:.4f}")
print(f"  F1 Macro    : {macro_ov:.4f}")
print(f"  Hamming Loss: {hamming_ov:.4f}")
print(f"  Jaccard     : {jaccard_ov:.4f}")

print("\n--- Genre Classification: Tagline (TF-IDF + GloVe) ---\n")

micro_tg, macro_tg, hamming_tg, jaccard_tg = train_genre_classifier(
    X_train_tfidf_tagline, g_train,
    X_test_tfidf_tagline, g_test
)

print("Tagline Results:")
print(f"  F1 Micro    : {micro_tg:.4f}")
print(f"  F1 Macro    : {macro_tg:.4f}")
print(f"  Hamming Loss: {hamming_tg:.4f}")
print(f"  Jaccard     : {jaccard_tg:.4f}")

print("\n--- Genre Classification: Keywords (TF-IDF + GloVe) ---\n")

micro_kw, macro_kw, hamming_kw, jaccard_kw = train_genre_classifier(
    X_train_tfidf_keywords, g_train,
    X_test_tfidf_keywords, g_test
)

print("Keywords Results:")
print(f"  F1 Micro    : {micro_kw:.4f}")
print(f"  F1 Macro    : {macro_kw:.4f}")
print(f"  Hamming Loss: {hamming_kw:.4f}")
print(f"  Jaccard     : {jaccard_kw:.4f}")


--- Genre Classification: Overview (TF-IDF + GloVe) ---


Starting Classifier Training...
  Epoch  1/30, Loss: 0.6876
  Epoch  5/30, Loss: 0.6354
  Epoch 10/30, Loss: 0.5636
  Epoch 15/30, Loss: 0.4819
  Epoch 20/30, Loss: 0.4044
  Epoch 25/30, Loss: 0.3514
  Epoch 30/30, Loss: 0.3285
Overview Results:
  F1 Micro    : 0.0214
  F1 Macro    : 0.0055
  Hamming Loss: 0.1251
  Jaccard     : 0.0158

--- Genre Classification: Tagline (TF-IDF + GloVe) ---


Starting Classifier Training...
  Epoch  1/30, Loss: 0.7038
  Epoch  5/30, Loss: 0.6360
  Epoch 10/30, Loss: 0.5510


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  Epoch 15/30, Loss: 0.4624
  Epoch 20/30, Loss: 0.3885
  Epoch 25/30, Loss: 0.3472
  Epoch 30/30, Loss: 0.3313
Tagline Results:
  F1 Micro    : 0.0375
  F1 Macro    : 0.0094
  Hamming Loss: 0.1240
  Jaccard     : 0.0297

--- Genre Classification: Keywords (TF-IDF + GloVe) ---


Starting Classifier Training...
  Epoch  1/30, Loss: 0.6941
  Epoch  5/30, Loss: 0.6312
  Epoch 10/30, Loss: 0.5489


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  Epoch 15/30, Loss: 0.4619
  Epoch 20/30, Loss: 0.3880
  Epoch 25/30, Loss: 0.3451
  Epoch 30/30, Loss: 0.3275
Keywords Results:
  F1 Micro    : 0.1676
  F1 Macro    : 0.0260
  Hamming Loss: 0.1241
  Jaccard     : 0.1332


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## genre word analysis

In [None]:
from collections import Counter
import pandas as pd

def frequent_words_per_genre(df, text_col, min_freq=3, top_k=10):
    results = {}

    for genre in mlb.classes_:
        texts = df[df["genres"].apply(lambda x: genre in x)][text_col]

        counter = Counter()
        for text in texts:
            counter.update(text.split())

        filtered = {w: c for w, c in counter.items() if c >= min_freq}

        sorted_words = sorted(filtered.items(), key=lambda x: x[1], reverse=True)

        top_words = sorted_words[:top_k]
        bottom_words = sorted_words[-top_k:]

        results[genre] = {
            "top_words": top_words,
            "bottom_words": bottom_words
        }

    return results

In [None]:
print("\n--- Genre Word Analysis for Overview ---\n")
overview_genre_analysis = genre_word_analysis(df, "overview")
for genre, data in overview_genre_analysis.items():
    print(f"Genre: {genre}")
    print(f"  Top words: {data['top_words']}")
    print(f"  Bottom words: {data['bottom_words']}\n")


--- Genre Word Analysis for Overview ---

Genre: Action
  Top words: ['the' 'to' 'of' 'and' 'be' 'his' 'in' 'on' 'with' 'he']
  Bottom words: ['claus' 'cliff' 'clifford' 'clint' 'clique' 'owl' 'participant'
 'wellbeing' 'claire' 'claudia']

Genre: Adventure
  Top words: ['the' 'to' 'and' 'of' 'be' 'his' 'in' 'with' 'on' 'an']
  Bottom words: ['crave' 'mama' 'filmed' 'finch' 'creed' 'creepy' 'critic' 'critical'
 'crooked' 'crucifixion']

Genre: Animation
  Top words: ['the' 'and' 'to' 'be' 'of' 'his' 'in' 'he' 'an' 'when']
  Bottom words: ['ire' 'slow' 'slip' 'slim' 'slightly' 'slide' 'investor' 'investment'
 'investigator' 'irresponsible']

Genre: Comedy
  Top words: ['the' 'to' 'and' 'be' 'of' 'his' 'in' 'with' 'her' 'he']
  Bottom words: ['voorhees' 'maryland' 'powell' 'powerless' 'coerce' 'enforcement'
 'enforcer' 'engulf' 'enigmatic' 'vader']

Genre: Crime
  Top words: ['the' 'to' 'of' 'and' 'his' 'be' 'in' 'he' 'with' 'for']
  Bottom words: ['patty' 'fabled' 'facility' 'fade' 'fa

In [None]:
print("\n--- Genre Word Analysis for Tagline ---\n")
tagline_genre_analysis = genre_word_analysis(df, "tagline")
for genre, data in tagline_genre_analysis.items():
    print(f"Genre: {genre}")
    print(f"  Top words: {data['top_words']}")
    print(f"  Bottom words: {data['bottom_words']}\n")


--- Genre Word Analysis for Tagline ---

Genre: Action
  Top words: ['the' 'be' 'of' 'to' 'no' 'have' 'for' 'in' 'it' 'one']
  Bottom words: ['page' 'crowd' 'president' 'present' 'possible' 'possibility'
 'popularity' 'pop' 'player' 'protection']

Genre: Adventure
  Top words: ['the' 'be' 'of' 'adventure' 'to' 'it' 'for' 'one' 'have' 'world']
  Bottom words: ['promise' 'relationship' 'everyones' 'everywhere' 'except' 'experiment'
 'explain' 'explode' 'explosive' 'pretty']

Genre: Animation
  Top words: ['the' 'be' 'adventure' 'to' 'get' 'it' 'of' 'for' 'little' 'your']
  Bottom words: ['suddenly' 'teacher' 'stop' 'storm' 'straight' 'stranger' 'street'
 'strength' 'strong' 'student']

Genre: Comedy
  Top words: ['the' 'be' 'to' 'it' 'get' 'of' 'you' 'in' 'he' 'and']
  Bottom words: ['anywhere' 'army' 'unite' 'unspeakable' 'train' 'untold' 'travel'
 'temptation' 'era' 'eternity']

Genre: Crime
  Top words: ['be' 'the' 'you' 'to' 'no' 'in' 'of' 'it' 'get' 'for']
  Bottom words: ['princes

In [None]:
print("\n--- Genre Word Analysis for Keywords ---\n")
keywords_genre_analysis = genre_word_analysis(df, "keywords")
for genre, data in keywords_genre_analysis.items():
    print(f"Genre: {genre}")
    print(f"  Top words: {data['top_words']}")
    print(f"  Bottom words: {data['bottom_words']}\n")


--- Genre Word Analysis for Keywords ---

Genre: Action
  Top words: ['dystopia' 'comic' 'on' 'war' 'base' 'secret' 'of' 'art' 'martial'
 'assassin']
  Bottom words: ['kennedy' 'orleans' 'onenight' 'old' 'ohio' 'occult' 'immigrant'
 'imaginary' 'daydream' 'day']

Genre: Adventure
  Top words: ['on' 'base' 'comic' 'of' 'england' 'secret' 'magic' 'the' 'alien' 'novel']
  Bottom words: ['last' 'dress' 'dominatrix' 'domestic' 'kennedy' 'leave' 'leader' 'lead'
 'laxative' 'law']

Genre: Animation
  Top words: ['animation' 'magic' 'alien' 'musical' 'relationship' 'animal' 'father'
 'duringcreditsstinger' 'aftercreditsstinger' 'on']
  Bottom words: ['newspaper' 'nun' 'number' 'nudity' 'nuclear' 'northern' 'north' 'noise'
 'nixon' 'ninja']

Genre: Comedy
  Top words: ['film' 'independent' 'relationship' 'duringcreditsstinger' 'love' 'woman'
 'comedy' 'new' 'sex' 'director']
  Bottom words: ['cinema' 'gladiator' 'survivor' 'survival' 'superhuman' 'rio' 'gift'
 'greek' 'graphic' 'civilization']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def genre_indicative_words(df, text_col, top_k=10):
    tfidf = TfidfVectorizer(min_df=3)
    X = tfidf.fit_transform(df[text_col])
    vocab = np.array(tfidf.get_feature_names_out())

    results = {}

    for genre in mlb.classes_:
        # Binary labels for one-vs-rest
        y = df["genres"].apply(lambda x: genre in x).astype(int)

        clf = LogisticRegression(max_iter=1000)
        clf.fit(X, y)

        # Coefficients
        coef = clf.coef_[0]

        top_indices = np.argsort(coef)[-top_k:]
        top_words = vocab[top_indices]

        results[genre] = top_words

    return results

In [None]:
print("\n--- Genre-Indicative Words Using TF-IDF ---\n")
indicative_results = genre_indicative_words(df, "overview")

for genre, words in indicative_results.items():
    print(f"Genre: {genre}")
    print("  Indicative words:", words)
    print()


--- Genre-Indicative Words Using TF-IDF ---

Genre: Action
  Indicative words: ['warrior' 'hero' 'cia' 'criminal' 'the' 'target' 'battle' 'assassin'
 'cop' 'agent']

Genre: Adventure
  Indicative words: ['dinosaur' 'destroy' 'find' 'warrior' 'mission' 'power' 'world' 'the'
 'bond' 'adventure']

Genre: Animation
  Indicative words: ['accidentally' 'dream' 'dinosaur' 'shrek' 'when' 'and' 'human' 'penguin'
 'world' 'adventure']

Genre: Comedy
  Indicative words: ['doesnt' 'wedding' 'christmas' 'date' 'big' 'all' 'show' 'guy' 'up'
 'comedy']

Genre: Crime
  Indicative words: ['mob' 'killer' 'mafia' 'fbi' 'criminal' 'cop' 'drug' 'detective' 'police'
 'murder']

Genre: Documentary
  Indicative words: ['of' 'me' 'michael' 'filmmaker' 'interview' 'and' 'footage' 'film' 'the'
 'documentary']

Genre: Drama
  Indicative words: ['family' 'her' 'change' 'rise' 'his' 'drama' 'love' 'wife' 'life' 'story']

Genre: Family
  Indicative words: ['animal' 'christmas' 'name' 'up' 'world' 'save' 'land' 'boy

In [None]:
print("\n--- Genre-Indicative Words Using TF-IDF ---\n")
indicative_results = genre_indicative_words(df, "tagline")

for genre, words in indicative_results.items():
    print(f"Genre: {genre}")
    print("  Indicative words:", words)
    print()


--- Genre-Indicative Words Using TF-IDF ---

Genre: Action
  Indicative words: ['mission' 'enemy' 'bond' 'beginning' 'protect' 'kill' 'action'
 'vengeance' 'hero' 'cop']

Genre: Adventure
  Indicative words: ['earth' 'james' 'danger' 'bear' 'wild' 'legend' 'bond' 'beginning' 'hero'
 'adventure']

Genre: Animation
  Indicative words: ['save' 'season' 'great' 'grave' 'tale' 'toy' 'fairy' 'wish' 'little'
 'adventure']

Genre: Comedy
  Indicative words: ['big' 'movie' 'hit' 'put' 'funny' 'little' 'shes' 'party' 'get' 'comedy']

Genre: Crime
  Indicative words: ['clean' 'money' 'cop' 'murder' 'sin' 'killer' 'deadly' 'criminal' 'law'
 'crime']

Genre: Documentary
  Indicative words: ['day' 'of' 'on' 'foot' 'up' 'home' 'in' 'at' 'what' 'film']

Genre: Drama
  Indicative words: ['shot' 'hope' 'sometimes' 'dream' 'how' 'honor' 'love' 'she' 'story'
 'life']

Genre: Family
  Indicative words: ['summer' 'wish' 'world' 'toy' 'neighborhood' 'friend' 'proportion'
 'jungle' 'little' 'adventure']

Gen

In [None]:
print("\n--- Genre-Indicative Words Using TF-IDF ---\n")
indicative_results = genre_indicative_words(df, "keywords")

for genre, words in indicative_results.items():
    print(f"Genre: {genre}")
    print("  Indicative words:", words)
    print()


--- Genre-Indicative Words Using TF-IDF ---

Genre: Action
  Indicative words: ['cop' 'marvel' 'disaster' 'agent' 'helicopter' 'hero' 'dystopia'
 'assassin' 'martial' 'comic']

Genre: Adventure
  Indicative words: ['england' 'egypt' 'hero' 'ocean' 'adventure' 'liberation' 'treasure'
 'comic' 'magic' 'marvel']

Genre: Animation
  Indicative words: ['fish' 'stop' 'alien' 'animal' 'sea' 'bunny' 'penguin' 'magic' 'bird'
 'animation']

Genre: Comedy
  Indicative words: ['birth' 'aftercreditsstinger' 'dog' 'date' 'high' 'christmas' 'holiday'
 'friendship' 'spoof' 'comedy']

Genre: Crime
  Indicative words: ['agent' 'hitman' 'undercover' 'prison' 'detective' 'crime' 'murder'
 'drug' 'fbi' 'police']

Genre: Documentary
  Indicative words: ['pain' 'stuntman' 'concert' 'usa' 'stunt' 'penguin' 'corruption' 'food'
 'director' 'music']

Genre: Drama
  Indicative words: ['novel' 'obsession' 'individual' 'depression' 'love' 'suicide' 'adultery'
 'rape' 'war' 'biography']

Genre: Family
  Indicative 