In [1]:
import pandas as pd
df=pd.read_csv('../data/raw/IMDB_four_genre_larger_plot_description.csv')
df.drop('movie_id',axis=1,inplace=True)
df.head()

Unnamed: 0,description,genre
0,Elle Evans (Joey King) has finally completed h...,romance
1,A young girl tries to understand how she myste...,horror
2,"In 1800s England, a well meaning but selfish y...",comedy
3,Abby Holland (Kristen Stewart) and Harper Cald...,romance
4,Olga and Maks are 15 years apart. She is a suc...,romance


In [2]:
Description=df['description']
Description

0      Elle Evans (Joey King) has finally completed h...
1      A young girl tries to understand how she myste...
2      In 1800s England, a well meaning but selfish y...
3      Abby Holland (Kristen Stewart) and Harper Cald...
4      Olga and Maks are 15 years apart. She is a suc...
                             ...                        
995    In front of their little boy, Camille and Geor...
996    After losing his wife and his memory in a car ...
997    Based on the true-life experiences of Dave Fis...
998    A troupe of hilariously self-obsessed theater ...
999    A young mermaid makes a deal with a sea witch ...
Name: description, Length: 1000, dtype: object

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

In [None]:
nltk.download(['punkt', 'wordnet', 'stopwords'])

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\testr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
stop_words=stopwords.words('english')

In [6]:
WNL=WordNetLemmatizer()
def preprocess_text(text: str,return_lst=True) -> list:
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text=text.lower()
    tokens=word_tokenize(text)
    lst=[]
    for token in tokens:
        if token not in stop_words:
            token=WNL.lemmatize(token)
            lst.append(token)
    if return_lst:
        return lst
    else:
        return ' '.join(lst)
processed=Description.apply(preprocess_text,return_lst=False)

In [7]:
from sklearn.preprocessing import LabelEncoder
LE=LabelEncoder()
target=LE.fit_transform(df['genre'])

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(processed,target,test_size=0.2,random_state=42)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
Tfidf=TfidfVectorizer()
X_train_vector_sparse=Tfidf.fit_transform(X_train)
X_test_vector_sparse=Tfidf.transform(X_test)

In [10]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

classifiers_tfidf = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Multinomial NB": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42)
}

# Train and evaluate each classifier
for name, clf in classifiers_tfidf.items():
    print(f"\n{name} Evaluation:")
    
    # Train model
    clf.fit(X_train_vector_sparse, y_train)
    
    # Predictions
    y_pred = clf.predict(X_test_vector_sparse)
    
    # Calculate metrics
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    
    # Cross-validation
    cv_scores = cross_val_score(clf, X_train_vector_sparse, y_train, cv=skf, scoring='accuracy')
    print(f"Cross-val Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")


Logistic Regression Evaluation:
Accuracy: 0.7100
Precision: 0.7059
Recall: 0.7100
F1-Score: 0.7070
Cross-val Accuracy: 0.7125 (±0.0319)

Multinomial NB Evaluation:
Accuracy: 0.6950
Precision: 0.7275
Recall: 0.6950
F1-Score: 0.7060
Cross-val Accuracy: 0.6825 (±0.0214)

Random Forest Evaluation:
Accuracy: 0.7200
Precision: 0.7191
Recall: 0.7200
F1-Score: 0.7176
Cross-val Accuracy: 0.6750 (±0.0256)

Decision Tree Evaluation:
Accuracy: 0.4900
Precision: 0.4930
Recall: 0.4900
F1-Score: 0.4724
Cross-val Accuracy: 0.4650 (±0.0357)


# Word2vec

In [12]:
X_train_tokens = [text.split() for text in X_train]
X_test_tokens = [text.split() for text in X_test]

In [45]:
from gensim.models import Word2Vec
model_w2v = Word2Vec(
    sentences=X_train_tokens,
    window=5,
    sg=1
)

In [46]:
import numpy as np
def document_vector(doc_tokens: list, model: 'model') ->'embedding':
    vectors = [model.wv[word] for word in doc_tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

X_train_vectors = np.array([document_vector(doc, model_w2v) for doc in X_train_tokens])
X_test_vectors = np.array([document_vector(doc, model_w2v) for doc in X_test_tokens])

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Gaussian NB": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42)
}
for name, clf in classifiers.items():
    print(f"\n{name} Evaluation:")
    
    # Train model
    clf.fit(X_train_vectors, y_train)
    
    # Predictions
    y_pred = clf.predict(X_test_vectors)
    
    # Calculate metrics
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    
    # Cross-validation
    cv_scores = cross_val_score(clf, X_train_vectors, y_train, cv=skf, scoring='accuracy')
    print(f"Cross-val Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")


Logistic Regression Evaluation:
Accuracy: 0.5150
Precision: 0.5199
Recall: 0.5150
F1-Score: 0.5018
Cross-val Accuracy: 0.5150 (±0.0346)

Gaussian NB Evaluation:
Accuracy: 0.5000
Precision: 0.5097
Recall: 0.5000
F1-Score: 0.4965
Cross-val Accuracy: 0.5100 (±0.0414)

Random Forest Evaluation:
Accuracy: 0.5500
Precision: 0.5642
Recall: 0.5500
F1-Score: 0.5546
Cross-val Accuracy: 0.5800 (±0.0378)

Decision Tree Evaluation:
Accuracy: 0.5100
Precision: 0.5247
Recall: 0.5100
F1-Score: 0.5151
Cross-val Accuracy: 0.5075 (±0.0419)


isnt capturing long term dependencies

# testing on query

<h3> Tf-Idf

In [36]:
john_wick='With the price on his head ever increasing, legendary hit man John Wick takes his fight against the High Table global as he seeks out the most powerful players in the underworld, from New York to Paris to Japan to Berlin.'
text=preprocess_text(john_wick,return_lst=False)

In [37]:
text_tfidf=Tfidf.transform([text])

In [38]:
classifiers_tfidf['Logistic Regression'].predict(text_tfidf)

array([0])

In [39]:
LE.inverse_transform([0])

array(['action'], dtype=object)

<h3> W2V

In [43]:
embedding_w2v=document_vector(john_wick,model_w2v)
classifiers['Logistic Regression'].predict([embedding_w2v])

array([0])

In [44]:
LE.inverse_transform([0])

array(['action'], dtype=object)