In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

print("Libraries imported ✅")

Libraries imported ✅


In [2]:
df = pd.read_csv("train_data.txt", sep=":::", engine="python", names=["id", "title", "genre", "plot"])
df.head()

Unnamed: 0,id,title,genre,plot
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [3]:
df.shape
df['genre'].value_counts().head(10)

genre
 drama           13613
 documentary     13096
 comedy           7447
 short            5073
 horror           2204
 thriller         1591
 action           1315
 western          1032
 reality-tv        884
 family            784
Name: count, dtype: int64

In [4]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

df['clean_plot'] = df['plot'].apply(clean_text)
df[['plot', 'clean_plot']].head()

Unnamed: 0,plot,clean_plot
0,Listening in to a conversation between his do...,listening in to a conversation between his do...
1,A brother and sister with a past incestuous r...,a brother and sister with a past incestuous r...
2,As the bus empties the students for their fie...,as the bus empties the students for their fie...
3,To help their unemployed father make ends mee...,to help their unemployed father make ends mee...
4,The film's title refers not only to the un-re...,the films title refers not only to the unreco...


In [5]:
X = df['clean_plot']
y = df['genre']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

len(X_train), len(X_test)

(43371, 10843)

In [6]:
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english'
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape

((43371, 5000), (10843, 5000))

In [7]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5782532509453103


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

      action        0.51      0.25      0.34       263
       adult        0.81      0.29      0.43       118
   adventure        0.71      0.14      0.24       155
   animation        0.55      0.11      0.18       100
   biography        0.00      0.00      0.00        53
      comedy        0.52      0.60      0.56      1490
       crime        0.17      0.01      0.02       101
 documentary        0.66      0.85      0.74      2619
       drama        0.54      0.76      0.63      2723
      family        0.57      0.10      0.17       157
     fantasy        0.00      0.00      0.00        65
   game-show        1.00      0.41      0.58        39
     history        0.00      0.00      0.00        49
      horror        0.65      0.55      0.60       441
       music        0.69      0.40      0.50       146
     musical        0.50      0.04      0.07        55
     mystery        0.50      0.02      0.03        64
        n

  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
import pickle

with open("models/genre_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("models/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

print("Model and vectorizer saved ✅")

Model and vectorizer saved ✅


In [9]:
sample_plot = ["A young man struggles to survive in a world full of crime and corruption."]
sample_plot_clean = [clean_text(sample_plot[0])]
sample_vec = tfidf.transform(sample_plot_clean)

model.predict(sample_vec)

array([' drama '], dtype=object)