In [2]:
import pandas as pd
df=pd.read_csv('../data/raw/IMDB_four_genre_larger_plot_description.csv')
df.drop('movie_id',axis=1,inplace=True)
df.head()

Unnamed: 0,description,genre
0,Elle Evans (Joey King) has finally completed h...,romance
1,A young girl tries to understand how she myste...,horror
2,"In 1800s England, a well meaning but selfish y...",comedy
3,Abby Holland (Kristen Stewart) and Harper Cald...,romance
4,Olga and Maks are 15 years apart. She is a suc...,romance


In [3]:
Description=df['description']
Description

0      Elle Evans (Joey King) has finally completed h...
1      A young girl tries to understand how she myste...
2      In 1800s England, a well meaning but selfish y...
3      Abby Holland (Kristen Stewart) and Harper Cald...
4      Olga and Maks are 15 years apart. She is a suc...
                             ...                        
995    In front of their little boy, Camille and Geor...
996    After losing his wife and his memory in a car ...
997    Based on the true-life experiences of Dave Fis...
998    A troupe of hilariously self-obsessed theater ...
999    A young mermaid makes a deal with a sea witch ...
Name: description, Length: 1000, dtype: object

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

In [5]:
nltk.download(['punkt', 'wordnet', 'stopwords', 'punkt_tab'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\testr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\testr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\testr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\testr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
stop_words=stopwords.words('english')

In [7]:
WNL=WordNetLemmatizer()
def preprocess_text(text: str,return_lst=True) -> list:
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text=text.lower()
    tokens=word_tokenize(text)
    lst=[]
    for token in tokens:
        if token not in stop_words:
            token=WNL.lemmatize(token)
            lst.append(token)
    if return_lst:
        return lst
    else:
        return ' '.join(lst)
processed=Description.apply(preprocess_text,return_lst=False)

In [8]:
from sklearn.preprocessing import LabelEncoder
LE=LabelEncoder()
target=LE.fit_transform(df['genre'])

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(processed,target,test_size=0.2,random_state=42)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
Tfidf=TfidfVectorizer()
X_train_vector_sparse=Tfidf.fit_transform(X_train)
X_test_vector_sparse=Tfidf.transform(X_test)

In [11]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

classifiers_tfidf = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Multinomial NB": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42)
}

# Train and evaluate each classifier
for name, clf in classifiers_tfidf.items():
    print(f"\n{name} Evaluation:")
    
    # Train model
    clf.fit(X_train_vector_sparse, y_train)
    
    # Predictions
    y_pred = clf.predict(X_test_vector_sparse)
    
    # Calculate metrics
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    
    # Cross-validation
    cv_scores = cross_val_score(clf, X_train_vector_sparse, y_train, cv=skf, scoring='accuracy')
    print(f"Cross-val Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")


Logistic Regression Evaluation:
Accuracy: 0.7100
Precision: 0.7059
Recall: 0.7100
F1-Score: 0.7070
Cross-val Accuracy: 0.7125 (±0.0319)

Multinomial NB Evaluation:
Accuracy: 0.6950
Precision: 0.7275
Recall: 0.6950
F1-Score: 0.7060
Cross-val Accuracy: 0.6825 (±0.0214)

Random Forest Evaluation:
Accuracy: 0.7200
Precision: 0.7191
Recall: 0.7200
F1-Score: 0.7176
Cross-val Accuracy: 0.6750 (±0.0256)

Decision Tree Evaluation:
Accuracy: 0.4900
Precision: 0.4930
Recall: 0.4900
F1-Score: 0.4724
Cross-val Accuracy: 0.4650 (±0.0357)


# Word2vec

In [13]:
X_train_tokens = [text.split() for text in X_train]
X_test_tokens = [text.split() for text in X_test]

In [14]:
from gensim.models import Word2Vec
model_w2v = Word2Vec(
    sentences=X_train_tokens,
    window=5,
    sg=1
)

In [15]:
import numpy as np
def document_vector(doc_tokens: list, model: 'model') ->'embedding':
    vectors = [model.wv[word] for word in doc_tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

X_train_vectors_w2v = np.array([document_vector(doc, model_w2v) for doc in X_train_tokens])
X_test_vectors_w2v = np.array([document_vector(doc, model_w2v) for doc in X_test_tokens])

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Gaussian NB": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42)
}
for name, clf in classifiers.items():
    print(f"\n{name} Evaluation:")
    
    # Train model
    clf.fit(X_train_vectors_w2v, y_train)
    
    # Predictions
    y_pred = clf.predict(X_test_vectors_w2v)
    
    # Calculate metrics
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    
    # Cross-validation
    cv_scores = cross_val_score(clf, X_train_vectors_w2v, y_train, cv=skf, scoring='accuracy')
    print(f"Cross-val Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")


Logistic Regression Evaluation:
Accuracy: 0.5150
Precision: 0.5214
Recall: 0.5150
F1-Score: 0.5053
Cross-val Accuracy: 0.5150 (±0.0327)

Gaussian NB Evaluation:
Accuracy: 0.4900
Precision: 0.5008
Recall: 0.4900
F1-Score: 0.4875
Cross-val Accuracy: 0.5088 (±0.0400)

Random Forest Evaluation:
Accuracy: 0.5700
Precision: 0.5848
Recall: 0.5700
F1-Score: 0.5755
Cross-val Accuracy: 0.5712 (±0.0447)

Decision Tree Evaluation:
Accuracy: 0.5200
Precision: 0.5261
Recall: 0.5200
F1-Score: 0.5208
Cross-val Accuracy: 0.4900 (±0.0429)


isnt capturing long term dependencies

# testing on prompt

<h3> Tf-Idf

In [17]:
john_wick='With the price on his head ever increasing, legendary hit man John Wick takes his fight against the High Table global as he seeks out the most powerful players in the underworld, from New York to Paris to Japan to Berlin.'
text=preprocess_text(john_wick,return_lst=False)

In [18]:
text_tfidf=Tfidf.transform([text])

In [19]:
classifiers_tfidf['Logistic Regression'].predict(text_tfidf)

array([0])

In [20]:
LE.inverse_transform([0])

array(['action'], dtype=object)

<h3> W2V

In [21]:
embedding_w2v=document_vector(john_wick,model_w2v)
classifiers['Logistic Regression'].predict([embedding_w2v])

array([0])

In [22]:
LE.inverse_transform([0])

array(['action'], dtype=object)

# Pretrained Embeddings for better Representation

<h3> MiniLm

In [23]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

X_train_vectors_transformer = model.encode(X_train.tolist()) 
X_test_vectors_transformer = model.encode(X_test.tolist())

  from .autonotebook import tqdm as notebook_tqdm





In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize classifiers
classifiers_transformer_MiniLM = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Gaussian NB": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42)
}
for name, clf in classifiers_transformer_MiniLM.items():
    print(f"\n{name} Evaluation:")
    
    # Train model
    clf.fit(X_train_vectors_transformer, y_train)
    
    # Predictions
    y_pred = clf.predict(X_test_vectors_transformer)
    
    # Calculate metrics
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    
    # Cross-validation
    cv_scores = cross_val_score(clf, X_train_vectors_transformer, y_train, cv=skf, scoring='accuracy')
    print(f"Cross-val Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")


Logistic Regression Evaluation:
Accuracy: 0.6950
Precision: 0.6950
Recall: 0.6950
F1-Score: 0.6950
Cross-val Accuracy: 0.7125 (±0.0401)

Gaussian NB Evaluation:
Accuracy: 0.6950
Precision: 0.6932
Recall: 0.6950
F1-Score: 0.6925
Cross-val Accuracy: 0.6887 (±0.0294)

Random Forest Evaluation:
Accuracy: 0.6550
Precision: 0.6631
Recall: 0.6550
F1-Score: 0.6575
Cross-val Accuracy: 0.6713 (±0.0184)

Decision Tree Evaluation:
Accuracy: 0.3900
Precision: 0.3813
Recall: 0.3900
F1-Score: 0.3804
Cross-val Accuracy: 0.4600 (±0.0264)


<h3> MPNET Base V2

It will require raw data for better understanding of the text

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Description, target, test_size=0.2, random_state=42)

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

X_train_vectors = model.encode(
    X_train.tolist(),
    convert_to_numpy=True,
    normalize_embeddings=True,
    truncate_dim=384
)
X_test_vectors = model.encode(
    X_test.tolist(),
    convert_to_numpy=True,
    normalize_embeddings=True,
    truncate_dim=384
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
model.save('../data/processed/text/mpnet_enoder')

In [None]:
# Initialize classifiers
classifiers_transformer_mpnet = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Gaussian NB": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42)
}
for name, clf in classifiers_transformer_mpnet.items():
    print(f"\n{name} Evaluation:")

    # Train model
    clf.fit(X_train_vectors, y_train)

    # Predictions
    y_pred = clf.predict(X_test_vectors)

    # Calculate metrics
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")

    # Cross-validation
    cv_scores = cross_val_score(clf, X_train_vectors, y_train, cv=skf, scoring='accuracy')
    print(f"Cross-val Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")


 Logistic Regression Evaluation:
Accuracy: 0.7350
Precision: 0.7365
Recall: 0.7350
F1-Score: 0.7347
Cross-val Accuracy: 0.7438 (±0.0119)

Gaussian NB Evaluation:
Accuracy: 0.7300
Precision: 0.7274
Recall: 0.7300
F1-Score: 0.7283
Cross-val Accuracy: 0.7125 (±0.0112)

Random Forest Evaluation:
Accuracy: 0.6900
Precision: 0.6952
Recall: 0.6900
F1-Score: 0.6923
Cross-val Accuracy: 0.6963 (±0.0135)

Decision Tree Evaluation:
Accuracy: 0.5600
Precision: 0.6054
Recall: 0.5600
F1-Score: 0.5670
Cross-val Accuracy: 0.4813 (±0.0331)



In [None]:
from sklearn.pipeline import make_pipeline

X_train_vectors = X_train_vectors.astype(np.float32)
X_test_vectors = X_test_vectors.astype(np.float32)

pipeline = make_pipeline(
    StandardScaler(),  # MPNet embeddings benefit from scaling
    LogisticRegression(
        max_iter=1000,
        C=0.1,
        class_weight='balanced',
        solver='saga',
        penalty='l2',
        random_state=42
    )
)

pipeline.fit(X_train_vectors, y_train)
y_pred = pipeline.predict(X_test_vectors)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1: {f1_score(y_test, y_pred, average='weighted'):.4f}")

Accuracy: 0.7500
F1: 0.7567 


Trying some neural network

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model

X_train = X_train_vectors.astype(np.float32)
y_train = y_train.astype(np.int32)
X_test = X_test_vectors.astype(np.float32)
y_test = y_test.astype(np.int32)

# Model Architecture
def create_model():
    inputs = layers.Input(shape=(768,))

    x = layers.Dense(128, activation='relu')(inputs)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='relu')(x)

    x = layers.Dense(4)(x)
    outputs = layers.Softmax(axis=-1)(x)

    return Model(inputs=inputs, outputs=outputs)

model = create_model()

# Model Configuration 
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# Training
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=16,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=5)
    ]
)

# Evaluation
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_acc:.4f}")

# Predictions
y_pred = model.predict(X_test).argmax(axis=1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Epoch 1/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 45ms/step - accuracy: 0.4174 - loss: 1.3310 - val_accuracy: 0.7188 - val_loss: 1.0730
Epoch 2/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 45ms/step - accuracy: 0.6829 - loss: 0.9638 - val_accuracy: 0.7563 - val_loss: 0.7532
Epoch 3/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 46ms/step - accuracy: 0.7745 - loss: 0.6451 - val_accuracy: 0.7375 - val_loss: 0.6939
Epoch 4/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 45ms/step - accuracy: 0.7723 - loss: 0.5562 - val_accuracy: 0.7312 - val_loss: 0.6899
Epoch 5/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 45ms/step - accuracy: 0.7940 - loss: 0.4975 - val_accuracy: 0.7437 - val_loss: 0.6823
Epoch 6/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 44ms/step - accuracy: 0.8492 - loss: 0.4005 - val_accuracy: 0.7437 - val_loss: 0.7125
Epoch 7/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 43ms/step - accuracy: 0.9133 - loss: 0.2758 - val_accuracy: 0.7375 - val_loss: 0.7566
Epoch 8/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 44ms/step - accuracy: 0.9000 - loss: 0.2678 - val_accuracy: 0.6938 - 

Using Bidirectional LSTM to better maintain the context

In [None]:
# Enhanced Model Architecture
def create_model():
    inputs = layers.Input(shape=(768,))
    # Reshape for sequential processing (24 timesteps x 32 features)
    x = layers.Reshape((24, 32))(inputs)

    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    x = layers.Dropout(0.3)(x)

    # Multi-head Attention
    attn_output = layers.MultiHeadAttention(num_heads=4, key_dim=16)(x, x)
    x = layers.Add()([x, attn_output])
    x = layers.LayerNormalization()(x)

    # Global pooling
    x = layers.GlobalAveragePooling1D()(x)

    # Final layers
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    outputs = layers.Dense(4, activation='softmax')(x)

    return Model(inputs=inputs, outputs=outputs)

model = create_model()

# Model Configuration
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# Training
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=16,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=5)
    ]
)

# Evaluation
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_acc:.4f}")

y_pred = model.predict(X_test).argmax(axis=1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Epoch 1/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 17s 412ms/step - accuracy: 0.2689 - loss: 1.3786 - val_accuracy: 0.4375 - val_loss: 1.3386
Epoch 2/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 17s 413ms/step - accuracy: 0.3883 - loss: 1.3423 - val_accuracy: 0.4812 - val_loss: 1.3079
Epoch 3/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 17s 413ms/step - accuracy: 0.4043 - loss: 1.3025 - val_accuracy: 0.4875 - val_loss: 1.2707
Epoch 4/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 17s 419ms/step - accuracy: 0.5353 - loss: 1.2625 - val_accuracy: 0.5063 - val_loss: 1.2328
Epoch 5/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 17s 415ms/step - accuracy: 0.5098 - loss: 1.2142 - val_accuracy: 0.5000 - val_loss: 1.1801
Epoch 6/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 17s 414ms/step - accuracy: 0.5146 - loss: 1.1762 - val_accuracy: 0.5188 - val_loss: 1.1278
Epoch 7/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 17s 419ms/step - accuracy: 0.5542 - loss: 1.1016 - val_accuracy: 0.5500 - val_loss: 1.0931
Epoch 8/50
40/40 ━━━━━━━━━━━━━━━━━━━━ 17s 418ms/step - accuracy: 0.5950 - loss: 1.0660 - val_acc

Combing both LogisticRegression with Neural Network

In [None]:
from sklearn.ensemble import StackingClassifier

# 1. Feature Engineering Pipeline

# Convert sparse matrices to dense arrays
X_train_tfidf_dense = X_train_vector_sparse.toarray().astype(np.float32)
X_test_tfidf_dense = X_test_vector_sparse.toarray().astype(np.float32)

# Concatenate with MPNet embeddings
X_train_combined = np.concatenate([X_train_vectors, X_train_tfidf_dense], axis=1)
X_test_combined = np.concatenate([X_test_vectors, X_test_tfidf_dense], axis=1)

# 2. Hybrid Model Architecture (No Flaky Layers)
def create_robust_model(input_dim):
    inputs = layers.Input(shape=(input_dim,))

    # Feature attention gate
    attention = layers.Dense(input_dim, activation='sigmoid')(inputs)
    x = layers.Multiply()([inputs, attention])

    # Simple processing
    x = layers.Dense(256, activation='relu', kernel_regularizer='l2')(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)

    outputs = layers.Dense(4, activation='softmax')(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

# Initialize with combined feature dimension
model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
model = create_robust_model(X_train_combined.shape[1])
model.summary()

# 3. Stacked Training Approach


# First train neural model
print("Training neural model...")
model.fit(
    X_train_combined,
    y_train,
    epochs=30,
    batch_size=16,
    validation_split=0.2,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5)]
)

# Extract penultimate layer features
feature_extractor = tf.keras.Model(
    inputs=model.input,
    outputs=model.layers[-2].output
)
train_features = feature_extractor.predict(X_train_combined)
test_features = feature_extractor.predict(X_test_combined)

# Final stacking with Logistic Regression
print("\nTraining stacked classifier...")
stacked_clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=0.1, max_iter=1000, class_weight='balanced')
)
stacked_clf.fit(train_features, y_train)

# 4. Evaluation
final_acc = stacked_clf.score(test_features, y_test)
print(f"\nFinal Stacked Accuracy: {final_acc:.4f}")

# Compare with raw neural model
nn_pred = model.predict(X_test_combined).argmax(1)
nn_acc = np.mean(nn_pred == y_test)
print(f"Neural Model Accuracy: {nn_acc:.4f}")


Training neural model...
Epoch 1/30
40/40 ━━━━━━━━━━━━━━━━━━━━ 11s 106ms/step - accuracy: 0.3084 - loss: 5.6603 - val_accuracy: 0.4625 - val_loss: 3.7622
Epoch 2/30
40/40 ━━━━━━━━━━━━━━━━━━━━ 5s 86ms/step - accuracy: 0.4914 - loss: 3.3580 - val_accuracy: 0.6750 - val_loss: 2.4273
Epoch 3/30
40/40 ━━━━━━━━━━━━━━━━━━━━ 5s 87ms/step - accuracy: 0.6771 - loss: 2.2349 - val_accuracy: 0.6938 - val_loss: 1.8065
Epoch 4/30
40/40 ━━━━━━━━━━━━━━━━━━━━ 3s 87ms/step - accuracy: 0.7036 - loss: 1.7105 - val_accuracy: 0.7063 - val_loss: 1.5212
Epoch 5/30
40/40 ━━━━━━━━━━━━━━━━━━━━ 5s 86ms/step - accuracy: 0.7810 - loss: 1.4568 - val_accuracy: 0.7063 - val_loss: 1.3799
Epoch 6/30
40/40 ━━━━━━━━━━━━━━━━━━━━ 3s 87ms/step - accuracy: 0.7896 - loss: 1.3323 - val_accuracy: 0.7250 - val_loss: 1.2939
Epoch 7/30
40/40 ━━━━━━━━━━━━━━━━━━━━ 5s 87ms/step - accuracy: 0.7570 - loss: 1.2281 - val_accuracy: 0.7125 - val_loss: 1.2265
Epoch 8/30
40/40 ━━━━━━━━━━━━━━━━━━━━ 5s 86ms/step - accuracy: 0.7879 - loss: 1.142

In [32]:
X_train_combined.shape[1]

41866

In [None]:
y_pred = stacked_clf.predict(test_features)

print("accuracy_score:",accuracy_score(y_test, y_pred))
print("precision_score:",precision_score(y_test, y_pred,average='weighted'))
print("recall_score:",recall_score(y_test, y_pred,average='weighted'))
print("f1_score:",f1_score(y_test, y_pred,average='weighted'))

accuracy_score: 0.745
precision_score: 0.7484458370170056
recall_score: 0.745
f1_score: 0.7452182461866595 


Augmented data for better training

In [None]:
import random
from nltk.corpus import wordnet

def simple_augment(text, num_augments=2):
    """Custom text augmentation using synonym replacement + random deletion"""
    augmented = []
    words = nltk.word_tokenize(text)

    for _ in range(num_augments):
        # Synonym replacement (40% of words)
        mod_words = words.copy()
        for i in range(len(mod_words)):
            if random.random() < 0.4:
                synonyms = wordnet.synsets(mod_words[i])
                if synonyms:
                    new_word = synonyms[0].lemmas()[0].name()
                    mod_words[i] = new_word

        # Random deletion (20% of words)
        mod_words = [w for w in mod_words if random.random() > 0.2]

        augmented.append(' '.join(mod_words))

    return augmented

original_text = "A sci-fi movie about space exploration with alien encounters"
print("Original:", original_text)
print("Augmented:", simple_augment(original_text))

all_augmented = []
for desc in df['description']:
    all_augmented.extend(simple_augment(desc))

from sentence_transformers import SentenceTransformer
model_transformer = SentenceTransformer('all-mpnet-base-v2')

original_embeddings = model_transformer.encode(df['description'].tolist())
augmented_embeddings = model_transformer.encode(all_augmented)


Original: A sci-fi movie about space exploration with alien encounters
Augmented: ['A movie about space with foreigner encounters', 'angstrom sci-fi movie space with']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
 



In [None]:
X_train = np.vstack([original_embeddings, augmented_embeddings])
y_train = np.concatenate([df['genre_encoded'],
                        np.repeat(df['genre_encoded'], 2)])  # 2 augmentations per sample

print(f"Training size increased from {len(df)} to {len(X_train)}")

# Now train your model
def create_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation='relu', input_shape=(768,)),
        tf.keras.layers.Dropout(0.5),
        #tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(4, activation='softmax')
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

model = create_model()
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=8,
    validation_split=0.25,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)]
)

# Evaluate
test_loss, test_acc = model.evaluate(X_test_vectors, y_test)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Training size increased from 1000 to 3000
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Final Test Accuracy: 0.8500 


In [None]:
y_pred=model.predict(X_test_vectors)



In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred.argmax(axis=1)))

              precision    recall  f1-score   support

           0       1.00      0.84      0.91        55
           1       0.68      0.80      0.74        40
           2       0.86      0.90      0.88        60
           3       0.84      0.82      0.83        45

    accuracy                           0.84       200
   macro avg       0.84      0.84      0.84       200
weighted avg       0.86      0.84      0.85       200 


Accuracy improved but

Precision of first class is low

In [None]:
# Class Mapping
CLASSES = {
    0: "action", 
    1: "comedy",
    2: "drama",
    3: "horror"
}

# Reverse mapping for name->index
CLASS_INDICES = {v:k for k,v in CLASSES.items()}


problem_class_name = "comedy" 
class1_index = CLASS_INDICES[problem_class_name]

# Verify
print(f"Augmenting data for class: {problem_class_name} (index {class1_index})")

# 2. Class-Specific Augmentation 
# Get original class counts
original_counts = np.bincount(y_train)
print(f"\nOriginal class distribution: {original_counts}")

# Filter DataFrame for problem class
class1_mask = df['genre'] == problem_class_name
class1_descs = df[class1_mask]['description'].tolist()

# Generate 3x augmented versions
augmented_class1 = []
for text in class1_descs:  # 750+250*3
    augmented = simple_augment(text, num_augments=3)  # Using our previous augmentation function
    augmented_class1.extend(augmented)

# Encode augmented texts
X_train_class1_aug = model_transformer.encode(augmented_class1) 
y_train_class1_aug = np.full(len(augmented_class1), class1_index)

# Combine datasets
X_train_enhanced = np.vstack([X_train, X_train_class1_aug])
y_train_enhanced = np.concatenate([y_train, y_train_class1_aug])

# Verify new distribution
new_counts = np.bincount(y_train_enhanced)
print(f"New class distribution: {new_counts}")
print(f"Added {len(augmented_class1)} samples for {problem_class_name}")

Augmenting data for class: comedy (index 1)

Original class distribution: [750 750 750 750]
New class distribution: [ 750 1500  750  750]
Added 750 samples for comedy 


In [None]:
# Training with Class Weights 

# Calculate class weights
class_counts = np.bincount(y_train_enhanced)
total_samples = len(y_train_enhanced)
class_weights = total_samples / (len(class_counts) * class_counts)

# Convert to dictionary format
weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# Build model
model = create_model() 
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

# Train with weights
history = model.fit(
    X_train_enhanced,
    y_train_enhanced,
    class_weight=weight_dict,
    epochs=50,
    batch_size=8,
    validation_split=0.2,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
y_pred=model.predict(X_test_vectors)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred.argmax(axis=1)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        55
           1       1.00      1.00      1.00        40
           2       1.00      1.00      1.00        60
           3       1.00      1.00      1.00        45

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200 


To avoid overfitting, reducing the epochs

In [None]:
# Train lower epochs to avoid overfitting
history = model.fit(
    X_train_enhanced,
    y_train_enhanced,
    class_weight=weight_dict,
    epochs=5,
    batch_size=8,
    validation_split=0.2,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
y_pred=model.predict(X_test_vectors)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90        55
           1       0.74      0.42      0.54        40
           2       0.84      0.97      0.90        60
           3       0.72      0.87      0.79        45

    accuracy                           0.81       200
   macro avg       0.80      0.79      0.78       200
weighted avg       0.81      0.81      0.80       200
 


recall for class 1 is less

In [None]:
class_weights = {0:1, 1:2.5, 2:1, 3:1} #Updated class weights
history = model.fit(
    X_train_enhanced,
    y_train_enhanced,
    epochs=5,
    batch_size=8,
    validation_split=0.2,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)],
    class_weight=class_weights
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
y_pred=model.predict(X_test_vectors)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred.argmax(axis=1)))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        55
           1       0.70      0.95      0.81        40
           2       0.92      0.92      0.92        60
           3       0.97      0.76      0.85        45

    accuracy                           0.89       200
   macro avg       0.90      0.89      0.88       200
weighted avg       0.91      0.89      0.89       200 


Well balanced model

In [None]:
import joblib
joblib.dump(LE, '../data/processed/text/label_encoder.pkl')

In [None]:
model.save('../data/processed/text/final_model')