In [1]:

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip




--2026-02-20 11:28:41--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2026-02-20 11:28:42--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2026-02-20 11:28:42--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
 #  TASK:-1

In [10]:
import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split


df = pd.read_csv('/content/movies - movies.csv')


allowed_cols = ['overview', 'tagline', 'keywords', 'genres', 'vote_average']
df = df[allowed_cols]

print(f"Initial dataset shape: {df.shape}")

Initial dataset shape: (4803, 5)


In [11]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    tokens = text.split()
    return " ".join(tokens)

text_columns = ['overview', 'tagline', 'keywords']
for col in text_columns:
    df[col] = df[col].apply(preprocess_text)

print("Preprocessing complete. Example overview:")
print(df['overview'].iloc[0][:100] + "...")

Preprocessing complete. Example overview:
in the nd century a paraplegic marine is dispatched to the moon pandora on a unique mission but beco...


In [12]:

train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42
)

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Train set size: 3362
Validation set size: 720
Test set size: 721


In [None]:
# TASK:-2

In [13]:
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict


glove_path = '/content/glove.6B.100d.txt'
glove_index = load_glove_embeddings(glove_path)
print(f"Loaded {len(glove_index)} word vectors.")

Loaded 400000 word vectors.


In [14]:
def check_coverage(df, text_col, embeddings):
    all_tokens = " ".join(df[text_col]).split()
    unique_tokens = set(all_tokens)
    covered = [w for w in unique_tokens if w in embeddings]

    coverage_pct = (len(covered) / len(unique_tokens)) * 100
    print(f"Coverage for '{text_col}': {len(covered)}/{len(unique_tokens)} ({coverage_pct:.2f}%)")
    return coverage_pct


coverage = check_coverage(train_df, 'overview', glove_index)

Coverage for 'overview': 17304/19226 (90.00%)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
tfidf.fit(train_df['overview'])
feature_names = tfidf.get_feature_names_out()
word_to_tfidf_idx = {word: i for i, word in enumerate(feature_names)}

def get_weighted_doc_embedding(text, embeddings, tfidf_model, word_map):
    tokens = text.split()
    if not tokens:
        return np.zeros(100)


    tfidf_vector = tfidf_model.transform([text]).toarray()[0]

    vectors = []
    weights = []

    for word in tokens:
        if word in embeddings and word in word_map:
            vectors.append(embeddings[word])

            weights.append(tfidf_vector[word_map[word]])

    if not vectors or sum(weights) == 0:

        available_vectors = [embeddings[w] for w in tokens if w in embeddings]
        return np.mean(available_vectors, axis=0) if available_vectors else np.zeros(100)


    return np.average(vectors, axis=0, weights=weights)

X_train = np.array([get_weighted_doc_embedding(t, glove_index, tfidf, word_to_tfidf_idx) for t in train_df['overview']])
X_val = np.array([get_weighted_doc_embedding(t, glove_index, tfidf, word_to_tfidf_idx) for t in val_df['overview']])
X_test = np.array([get_weighted_doc_embedding(t, glove_index, tfidf, word_to_tfidf_idx) for t in test_df['overview']])

print(f"Final feature shape: {X_train.shape}")

Final feature shape: (3362, 100)


In [None]:
# TASK:-3

In [17]:
import torch.nn as nn
import torch.optim as optim

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

y_train_reg = torch.tensor(train_df['vote_average'].values, dtype=torch.float32).view(-1, 1)
y_test_reg = torch.tensor(test_df['vote_average'].values, dtype=torch.float32).view(-1, 1)

In [18]:
class RatingRegressor(nn.Module):
    def __init__(self):
        super(RatingRegressor, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(100, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.network(x)

model_a = RatingRegressor()
criterion = nn.MSELoss()
optimizer = optim.Adam(model_a.parameters(), lr=0.001)

In [19]:

global_mean = train_df['vote_average'].mean()
baseline_preds = np.full(y_test_reg.shape, global_mean)
baseline_mse = mean_squared_error(y_test_reg, baseline_preds)
print(f"Baseline MSE: {baseline_mse:.4f}")
print(f"Baseline RMSE: {np.sqrt(baseline_mse):.4f}")

epochs = 50
for epoch in range(epochs):
    model_a.train()
    optimizer.zero_grad()
    outputs = model_a(X_train_tensor)
    loss = criterion(outputs, y_train_reg)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Baseline MSE: 1.2825
Baseline RMSE: 1.1325
Epoch [10/50], Loss: 33.8949
Epoch [20/50], Loss: 28.1795
Epoch [30/50], Loss: 18.6104
Epoch [40/50], Loss: 7.1498
Epoch [50/50], Loss: 2.2868


In [20]:
model_a.eval()
with torch.no_grad():
    test_outputs = model_a(X_test_tensor)
    mse = criterion(test_outputs, y_test_reg).item()
    rmse = np.sqrt(mse)

print(f"\nModel A (Overview) Results:")
print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {rmse:.4f}")


Model A (Overview) Results:
Test MSE: 2.2983
Test RMSE: 1.5160


In [None]:
# TASK:-4

In [35]:
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, jaccard_score

def parse_genres(genre_str):
    if not isinstance(genre_str, str) or genre_str.strip() == "":
        return []
    try:
        genres_list = ast.literal_eval(genre_str)
        return [genre['name'] for genre in genres_list if 'name' in genre]
    except (ValueError, SyntaxError, TypeError):
        return []

train_df['parsed_genres'] = train_df['genres'].apply(parse_genres)
test_df['parsed_genres'] = test_df['genres'].apply(parse_genres)

non_empty_parsed_train = train_df[train_df['parsed_genres'].apply(len) > 0]['parsed_genres']
if not non_empty_parsed_train.empty:
    print(f"Example of parsed genres (train): {non_empty_parsed_train.iloc[0]}")
else:
    print("No genres were successfully parsed in the training set.")

mlb = MultiLabelBinarizer()

y_train_gen = mlb.fit_transform(train_df['parsed_genres'])


y_test_gen = mlb.transform(test_df['parsed_genres'])


print(f"Total unique genres: {len(mlb.classes_)}")
print(f"Shape of y_train_gen: {y_train_gen.shape}")
print(f"Shape of y_test_gen: {y_test_gen.shape}")

No genres were successfully parsed in the training set.
Total unique genres: 0
Shape of y_train_gen: (3362, 0)
Shape of y_test_gen: (721, 0)


In [36]:
import pandas as pd

print("Sample of raw 'genres' column from df:")
print(df['genres'].head(10))


print("\nValue counts for 'genres' column types:")
print(df['genres'].apply(type).value_counts())

print("\nNumber of NaN values in 'genres' column:")
print(df['genres'].isnull().sum())

Sample of raw 'genres' column from df:
0    Action Adventure Fantasy Science Fiction
1                    Adventure Fantasy Action
2                      Action Adventure Crime
3                 Action Crime Drama Thriller
4            Action Adventure Science Fiction
5                    Fantasy Action Adventure
6                            Animation Family
7            Action Adventure Science Fiction
8                    Adventure Fantasy Family
9                    Action Adventure Fantasy
Name: genres, dtype: object

Value counts for 'genres' column types:
genres
<class 'str'>      4775
<class 'float'>      28
Name: count, dtype: int64

Number of NaN values in 'genres' column:
28


In [37]:
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, jaccard_score


def parse_genres(genre_str):
    if not isinstance(genre_str, str) or genre_str.strip() == "":
        return []
    return genre_str.split()


train_df['parsed_genres'] = train_df['genres'].apply(parse_genres)
test_df['parsed_genres'] = test_df['genres'].apply(parse_genres)


non_empty_parsed_train = train_df[train_df['parsed_genres'].apply(len) > 0]['parsed_genres']
if not non_empty_parsed_train.empty:
    print(f"Example of parsed genres (train): {non_empty_parsed_train.iloc[0]}")
else:
    print("No genres were successfully parsed in the training set.")


mlb = MultiLabelBinarizer()

y_train_gen = mlb.fit_transform(train_df['parsed_genres'])

y_test_gen = mlb.transform(test_df['parsed_genres'])


print(f"Total unique genres: {len(mlb.classes_)}")
print(f"Shape of y_train_gen: {y_train_gen.shape}")
print(f"Shape of y_test_gen: {y_test_gen.shape}")

Example of parsed genres (train): ['Mystery', 'Thriller']
Total unique genres: 22
Shape of y_train_gen: (3362, 22)
Shape of y_test_gen: (721, 22)


In [38]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

X_train_tagline = np.array([get_weighted_doc_embedding(t, glove_index, tfidf, word_to_tfidf_idx) for t in train_df['tagline']])
X_val_tagline = np.array([get_weighted_doc_embedding(t, glove_index, tfidf, word_to_tfidf_idx) for t in val_df['tagline']])
X_test_tagline = np.array([get_weighted_doc_embedding(t, glove_index, tfidf, word_to_tfidf_idx) for t in test_df['tagline']])

print(f"Tagline embeddings for training set shape: {X_train_tagline.shape}")
print(f"Tagline embeddings for validation set shape: {X_val_tagline.shape}")
print(f"Tagline embeddings for test set shape: {X_test_tagline.shape}")

Tagline embeddings for training set shape: (3362, 100)
Tagline embeddings for validation set shape: (720, 100)
Tagline embeddings for test set shape: (721, 100)


In [39]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss
import numpy as np

def train_and_evaluate_multilabel_model(X_train_data, y_train_data, X_test_data, y_test_data, model_name):
    print(f"\n--- Training and Evaluating Model: {model_name} ---")


    model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42))


    model.fit(X_train_data, y_train_data)

    y_pred = model.predict(X_test_data)


    micro_f1 = f1_score(y_test_data, y_pred, average='micro')
    macro_f1 = f1_score(y_test_data, y_pred, average='macro')
    h_loss = hamming_loss(y_test_data, y_pred)

    print(f"Micro-F1 Score: {micro_f1:.4f}")
    print(f"Macro-F1 Score: {macro_f1:.4f}")
    print(f"Hamming Loss: {h_loss:.4f}")

    return {
        'model_name': model_name,
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'hamming_loss': h_loss
    }

results_overview = train_and_evaluate_multilabel_model(
    X_train, y_train_gen, X_test, y_test_gen, 'Overview Embeddings'
)


results_tagline = train_and_evaluate_multilabel_model(
    X_train_tagline, y_train_gen, X_test_tagline, y_test_gen, 'Tagline Embeddings'
)

print("\n--- Comparison of Models ---")
print(f"Overview Embeddings - Micro-F1: {results_overview['micro_f1']:.4f}, Macro-F1: {results_overview['macro_f1']:.4f}, Hamming Loss: {results_overview['hamming_loss']:.4f}")
print(f"Tagline Embeddings - Micro-F1: {results_tagline['micro_f1']:.4f}, Macro-F1: {results_tagline['macro_f1']:.4f}, Hamming Loss: {results_tagline['hamming_loss']:.4f}")

if results_overview['micro_f1'] > results_tagline['micro_f1']:
    print("\nModel with 'Overview Embeddings' performs better based on Micro-F1 score.")
elif results_tagline['micro_f1'] > results_overview['micro_f1']:
    print("\nModel with 'Tagline Embeddings' performs better based on Micro-F1 score.")
else:
    print("\nBoth models perform similarly based on Micro-F1 score.")


--- Training and Evaluating Model: Overview Embeddings ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Micro-F1 Score: 0.4773
Macro-F1 Score: 0.2999
Hamming Loss: 0.1018

--- Training and Evaluating Model: Tagline Embeddings ---
Micro-F1 Score: 0.3011
Macro-F1 Score: 0.1153
Hamming Loss: 0.1168

--- Comparison of Models ---
Overview Embeddings - Micro-F1: 0.4773, Macro-F1: 0.2999, Hamming Loss: 0.1018
Tagline Embeddings - Micro-F1: 0.3011, Macro-F1: 0.1153, Hamming Loss: 0.1168

Model with 'Overview Embeddings' performs better based on Micro-F1 score.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss
import numpy as np

def train_and_evaluate_multilabel_model(X_train_data, y_train_data, X_test_data, y_test_data, model_name):
    print(f"\n--- Training and Evaluating Model: {model_name} ---")


    model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42))


    model.fit(X_train_data, y_train_data)


    y_pred = model.predict(X_test_data)


    micro_f1 = f1_score(y_test_data, y_pred, average='micro', zero_division=0)
    macro_f1 = f1_score(y_test_data, y_pred, average='macro', zero_division=0)
    h_loss = hamming_loss(y_test_data, y_pred)

    print(f"Micro-F1 Score: {micro_f1:.4f}")
    print(f"Macro-F1 Score: {macro_f1:.4f}")
    print(f"Hamming Loss: {h_loss:.4f}")

    return {
        'model_name': model_name,
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'hamming_loss': h_loss
    }

results_overview = train_and_evaluate_multilabel_model(
    X_train, y_train_gen, X_test, y_test_gen, 'Overview Embeddings'
)


results_tagline = train_and_evaluate_multilabel_model(
    X_train_tagline, y_train_gen, X_test_tagline, y_test_gen, 'Tagline Embeddings'
)

print("\n--- Comparison of Models ---")
print(f"Overview Embeddings - Micro-F1: {results_overview['micro_f1']:.4f}, Macro-F1: {results_overview['macro_f1']:.4f}, Hamming Loss: {results_overview['hamming_loss']:.4f}")
print(f"Tagline Embeddings - Micro-F1: {results_tagline['micro_f1']:.4f}, Macro-F1: {results_tagline['macro_f1']:.4f}, Hamming Loss: {results_tagline['hamming_loss']:.4f}")

if results_overview['micro_f1'] > results_tagline['micro_f1']:
    print("\nModel with 'Overview Embeddings' performs better based on Micro-F1 score.")
elif results_tagline['micro_f1'] > results_overview['micro_f1']:
    print("\nModel with 'Tagline Embeddings' performs better based on Micro-F1 score.")
else:
    print("\nBoth models perform similarly based on Micro-F1 score.")


--- Training and Evaluating Model: Overview Embeddings ---
Micro-F1 Score: 0.4773
Macro-F1 Score: 0.2999
Hamming Loss: 0.1018

--- Training and Evaluating Model: Tagline Embeddings ---
Micro-F1 Score: 0.3011
Macro-F1 Score: 0.1153
Hamming Loss: 0.1168

--- Comparison of Models ---
Overview Embeddings - Micro-F1: 0.4773, Macro-F1: 0.2999, Hamming Loss: 0.1018
Tagline Embeddings - Micro-F1: 0.3011, Macro-F1: 0.1153, Hamming Loss: 0.1168

Model with 'Overview Embeddings' performs better based on Micro-F1 score.


In [None]:
# TASK:-5

In [41]:
import numpy as np


genre_names = mlb.classes_

genre_words = {genre: [] for genre in genre_names}


for idx, row in train_df.iterrows():

    overview_text = row['overview']


    words = overview_text.split()


    movie_genre_labels = y_train_gen[train_df.index.get_loc(idx)]


    for i, is_genre in enumerate(movie_genre_labels):
        if is_genre == 1:
            genre_name = genre_names[i]
            genre_words[genre_name].extend(words)

print(f"Initialized genre_words dictionary with {len(genre_words)} genres.")

if 'Action' in genre_words:
    print(f"Sample words for 'Action' genre: {genre_words['Action'][:20]}...")
else:
    print("No 'Action' genre found or processed.")

Initialized genre_words dictionary with 22 genres.
Sample words for 'Action' genre: ['a', 'store', 'clerk', 'and', 'an', 'ice', 'cream', 'truck', 'driver', 'are', 'thrown', 'together', 'when', 'a', 'dying', 'scientist', 'entrusts', 'them', 'with', 'a']...


In [42]:
from collections import Counter

genre_word_counts = {}
for genre, words in genre_words.items():
    genre_word_counts[genre] = Counter(words)

print("Top 10 most frequent words per genre:")
print("------------------------------------")
for genre, counts in genre_word_counts.items():
    print(f"\nGenre: {genre}")

    top_words = counts.most_common(10)
    print(f"  Top 10: {top_words}")


    filtered_counts = {word: count for word, count in counts.items() if count >= 3}
    if filtered_counts:

        filtered_counter = Counter(filtered_counts)
        bottom_words = filtered_counter.most_common()[:-11:-1]
        print(f"  Bottom 10 (freq >= 3): {bottom_words}")
    else:
        print("  No words with frequency >= 3 to display for bottom 10.")


Top 10 most frequent words per genre:
------------------------------------

Genre: Action
  Top 10: [('the', 2698), ('a', 1743), ('to', 1443), ('and', 1238), ('of', 1214), ('in', 753), ('his', 715), ('is', 604), ('with', 393), ('an', 361)]
  Bottom 10 (freq >= 3): [('joey', 3), ('doom', 3), ('mystery', 3), ('tongan', 3), ('shuttle', 3), ('serving', 3), ('mérida', 3), ('speed', 3), ('jill', 3), ('kitai', 3)]

Genre: Adventure
  Top 10: [('the', 2028), ('a', 1174), ('to', 1058), ('and', 970), ('of', 860), ('in', 532), ('his', 505), ('is', 377), ('with', 294), ('on', 258)]
  Bottom 10 (freq >= 3): [('melvin', 3), ('arthur', 3), ('doom', 3), ('champion', 3), ('goons', 3), ('tongan', 3), ('chaney', 3), ('fast', 3), ('shuttle', 3), ('potter', 3)]

Genre: Animation
  Top 10: [('the', 678), ('a', 400), ('and', 368), ('to', 349), ('of', 294), ('in', 159), ('his', 150), ('is', 149), ('with', 102), ('he', 89)]
  Bottom 10 (freq >= 3): [('garfield', 3), ('alex', 3), ('grandfather', 3), ('among', 3

In [None]:
# TASK:-6

In [45]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression


lr_model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
ovr_classifier = OneVsRestClassifier(lr_model)

ovr_classifier.fit(X_train, y_train_gen)

print("OneVsRestClassifier (Logistic Regression) trained successfully on overview embeddings.")

OneVsRestClassifier (Logistic Regression) trained successfully on overview embeddings.


In [46]:
import numpy as np


feature_names = tfidf.get_feature_names_out()

genre_top_words = {}

for i, genre_name in enumerate(mlb.classes_):
    lr_genre_model = ovr_classifier.estimators_[i]



    word_coef_pairs = list(zip(feature_names, coefficients))

    word_coef_pairs.sort(key=lambda x: x[1], reverse=True)

    top_positive_words = [(word, coef) for word, coef in word_coef_pairs if coef > 0][:10]

    genre_top_words[genre_name] = top_positive_words


print("Top 10 genre-indicative words (positive coefficients) for each genre:")
print("-------------------------------------------------------------------")
for genre, words_with_coefs in genre_top_words.items():
    print(f"\nGenre: {genre}")
    if words_with_coefs:
        for word, coef in words_with_coefs:
            print(f"  - {word}: {coef:.4f}")
    else:
        print("  No strong positive indicative words found.")



Top 10 genre-indicative words (positive coefficients) for each genre:
-------------------------------------------------------------------

Genre: Action
  - ad: 1.9503
  - abused: 1.6766
  - actors: 1.5788
  - abraham: 1.1028
  - abigail: 1.0618
  - accomplish: 0.9446
  - abandoned: 0.9388
  - accuracy: 0.9000
  - access: 0.8631
  - abduct: 0.7877

Genre: Adventure
  - actor: 1.2253
  - acclimated: 1.2189
  - abolitionist: 1.2011
  - accomplish: 1.0759
  - accidentally: 1.0378
  - acclaim: 1.0321
  - acquaintances: 0.9548
  - ada: 0.9020
  - achieved: 0.8317
  - abby: 0.7888

Genre: Animation
  - abolitionist: 1.8438
  - accountant: 1.3388
  - accused: 1.3347
  - accepted: 1.1910
  - ability: 1.1285
  - abraham: 1.1208
  - actor: 1.1186
  - accident: 1.1130
  - accidentally: 1.0810
  - achieves: 0.9977

Genre: Comedy
  - absence: 1.2843
  - accepted: 1.1636
  - abba: 1.1288
  - abaddon: 1.0083
  - addictions: 0.9975
  - ada: 0.9690
  - abusive: 0.9265
  - actor: 0.8723
  - accuracy: 0.