In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
movie_df = pd.read_csv("/content/movies.csv")
movie_df.head(10)

Unnamed: 0,title,overview,genre
0,Ariel,After the coal mine he works at closes and his...,"['Drama', 'Comedy', 'Romance', 'Crime']"
1,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","['Drama', 'Comedy', 'Romance']"
2,Four Rooms,It's Ted the Bellhop's first night on the job....,['Comedy']
3,Judgment Night,"Four young friends, while taking a shortcut en...","['Action', 'Crime', 'Thriller']"
4,Star Wars,Princess Leia is captured and held hostage by ...,"['Adventure', 'Action', 'Science Fiction']"
5,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...","['Animation', 'Family']"
6,Forrest Gump,A man with a low IQ has accomplished great thi...,"['Comedy', 'Drama', 'Romance']"
7,American Beauty,"Lester Burnham, a depressed suburban father in...",['Drama']
8,Citizen Kane,Newspaper magnate Charles Foster Kane is taken...,"['Mystery', 'Drama']"
9,Dancer in the Dark,"Selma, a Czech immigrant on the verge of blind...","['Drama', 'Crime']"


In [3]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9466 entries, 0 to 9465
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     9466 non-null   object
 1   overview  9465 non-null   object
 2   genre     9466 non-null   object
dtypes: object(3)
memory usage: 222.0+ KB


In [4]:
movie_df.isnull().sum()

Unnamed: 0,0
title,0
overview,1
genre,0


In [5]:
movie_df.dropna(inplace = True)

In [6]:
movie_df.isnull().sum()

Unnamed: 0,0
title,0
overview,0
genre,0


In [7]:
movie_df.rename(columns={
    'overview': 'plot',
}, inplace=True)

In [8]:
movie_df.head(20)

Unnamed: 0,title,plot,genre
0,Ariel,After the coal mine he works at closes and his...,"['Drama', 'Comedy', 'Romance', 'Crime']"
1,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","['Drama', 'Comedy', 'Romance']"
2,Four Rooms,It's Ted the Bellhop's first night on the job....,['Comedy']
3,Judgment Night,"Four young friends, while taking a shortcut en...","['Action', 'Crime', 'Thriller']"
4,Star Wars,Princess Leia is captured and held hostage by ...,"['Adventure', 'Action', 'Science Fiction']"
5,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...","['Animation', 'Family']"
6,Forrest Gump,A man with a low IQ has accomplished great thi...,"['Comedy', 'Drama', 'Romance']"
7,American Beauty,"Lester Burnham, a depressed suburban father in...",['Drama']
8,Citizen Kane,Newspaper magnate Charles Foster Kane is taken...,"['Mystery', 'Drama']"
9,Dancer in the Dark,"Selma, a Czech immigrant on the verge of blind...","['Drama', 'Crime']"


In [9]:
movie_df.dropna(subset=['genre', 'plot'], inplace=True)


In [10]:
movie_df.shape

(9465, 3)

In [11]:
def clean_text(text):
    text = text.lower()                                      # lowercase
    text = re.sub(r'\[.*?\]', '', text)                      # remove brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)        # remove URLs
    text = re.sub(r'<.*?>+', '', text)                       # remove HTML
    text = text.translate(str.maketrans('','', string.punctuation)) # remove punctuation
    text = re.sub(r'\n', '', text)                           # remove newline
    text = re.sub(r'\w*\d\w*', '', text)                     # remove digits
    text = ' '.join(word for word in text.split() if word not in stop_words)  # remove stopwords
    return text

movie_df['clean_plot'] = movie_df['plot'].apply(clean_text)


In [12]:
movie_df

Unnamed: 0,title,plot,genre,clean_plot
0,Ariel,After the coal mine he works at closes and his...,"['Drama', 'Comedy', 'Romance', 'Crime']",coal mine works closes father commits suicide ...
1,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","['Drama', 'Comedy', 'Romance']",nikander rubbish collector wouldbe entrepreneu...
2,Four Rooms,It's Ted the Bellhop's first night on the job....,['Comedy'],ted bellhops first night joband hotels unusual...
3,Judgment Night,"Four young friends, while taking a shortcut en...","['Action', 'Crime', 'Thriller']",four young friends taking shortcut en route lo...
4,Star Wars,Princess Leia is captured and held hostage by ...,"['Adventure', 'Action', 'Science Fiction']",princess leia captured held hostage evil imper...
...,...,...,...,...
9461,TAYLOR SWIFT | THE ERAS TOUR,The cultural phenomenon continues on the big s...,['Music'],cultural phenomenon continues big screen immer...
9462,The Rat Catcher,"In an English village, a reporter and a mechan...",['Comedy'],english village reporter mechanic listen ratca...
9463,The Swan,"Two large, ignorant bullies ruthlessly pursue ...",['Drama'],two large ignorant bullies ruthlessly pursue s...
9464,Poison,When a poisonous snake slithers onto an Englis...,['Comedy'],poisonous snake slithers onto englishmans stom...


In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
movie_df['genre_label'] = le.fit_transform(movie_df['genre'])

label_names = dict(zip(le.transform(le.classes_), le.classes_))


In [14]:
X = movie_df['clean_plot']
y = movie_df['genre_label']

In [15]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, random_state = 1)

In [16]:
Vectorizer = TfidfVectorizer(max_features = 5000, ngram_range=(1, 3))
X_train_vec = Vectorizer.fit_transform(X_train)
X_test_vec = Vectorizer.transform(X_test)

In [17]:
from sklearn.svm import LinearSVC
svc = LinearSVC()
svc.fit(X_train_vec, y_train)

In [18]:
y_pred = svc.predict(X_test_vec)

In [19]:
print("Accuracy of the model :", accuracy_score(y_test, y_pred))

Accuracy of the model : 0.0919175911251981


In [20]:
movie_df.head(1)

Unnamed: 0,title,plot,genre,clean_plot,genre_label
0,Ariel,After the coal mine he works at closes and his...,"['Drama', 'Comedy', 'Romance', 'Crime']",coal mine works closes father commits suicide ...,1065


In [21]:
movie_df['plot'][0]

'After the coal mine he works at closes and his father commits suicide, a Finnish man leaves for the city to make a living but there, he is framed and imprisoned for various crimes.'

In [22]:
plot = 'After the coal mine he works at closes and his father commits suicide, a Finnish man leaves for the city to make a living but there, he is framed and imprisoned for various crimes.'
vec = Vectorizer.transform([plot])
predicted_genre_code = svc.predict(vec)
predicted_genre = le.inverse_transform([predicted_genre_code])
print("The predicted genre is : ", predicted_genre[0])

The predicted genre is :  ['Drama', 'Comedy', 'Romance', 'Crime']


  y = column_or_1d(y, warn=True)


In [23]:
def predict_genre(plot):
    # Step 1: Clean the input plot
    cleaned_plot = clean_text(plot)

    # Step 2: Vectorize the cleaned plot
    vectorized_plot = Vectorizer.transform([cleaned_plot])

    # Step 3: Predict encoded genre using the model
    predicted_label = svc.predict(vectorized_plot)[0]

    # Step 4: Convert encoded label back to genre name
    genre_name = le.inverse_transform([predicted_label])[0]

    return genre_name


In [24]:
new_plot = "A computer hacker named Neo discovers that the world he lives in is a simulated reality called the Matrix. With the help of mysterious rebels, he learns the truth and joins the fight to free humanity from an oppressive AI system that controls the simulated world."

predicted = predict_genre(new_plot)
print("Predicted Genre:", predicted)


Predicted Genre: ['Action', 'Adventure', 'Science Fiction']
