In [7]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Load dataset
movies_df = pd.read_csv('imdb_movies.csv')
print(movies_df.head())
print(movies_df.info())

# Check missing values
print(movies_df.isnull().sum())


                         names       date_x  score  \
0                    Creed III  03/02/2023    73.0   
1     Avatar: The Way of Water  12/15/2022    78.0   
2  The Super Mario Bros. Movie  04/05/2023    76.0   
3                      Mummies  01/05/2023    70.0   
4                    Supercell  03/17/2023    61.0   

                                           genre  \
0                                  Drama, Action   
1             Science Fiction, Adventure, Action   
2  Animation, Adventure, Family, Fantasy, Comedy   
3  Animation, Comedy, Family, Adventure, Fantasy   
4                                         Action   

                                            overview  \
0  After dominating the boxing world, Adonis Cree...   
1  Set more than a decade after the events of the...   
2  While working underground to fix a water main,...   
3  Through a series of unfortunate events, three ...   
4  Good-hearted teenager William always lived in ...   

                         

In [9]:
# Data preprocessing
 
# Droping rows with missing overview or genre
movies_df = movies_df.dropna(subset=['overview', 'genre'])

# Reset index
movies_df.reset_index(drop=True, inplace=True)

# Combining multiple genres into single 
movies_df['genre'] = movies_df['genre'].apply(lambda x: x.replace(', ', ' '))

# Text cleaning function
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower().strip()  # Convert to lower case and strip whitespaces
    return text

# Apply text cleaning
movies_df['overview'] = movies_df['overview'].apply(clean_text)

# Stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords
movies_df['overview'] = movies_df['overview'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


In [10]:
# Splitting dataset

# Features and labels

X = movies_df['overview']
y = movies_df['genre']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Training model

# Initialize and train the classifier
classifier = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(solver='lbfgs', max_iter=1000))
])

classifier.fit(X_train, y_train)


In [13]:
# Model building and evaluation

# Make predictions
y_pred = classifier.predict(X_test)

# Evaluate the model
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))


Accuracy: 0.08816245666171373
Classification Report:
                                                                                        precision    recall  f1-score   support

                                                                                Action       0.00      0.00      0.00        15
                                                                     Action, Adventure       0.00      0.00      0.00         6
                                                          Action, Adventure, Animation       0.00      0.00      0.00         1
                                          Action, Adventure, Animation, Comedy, Family       0.00      0.00      0.00         1
                                                   Action, Adventure, Animation, Drama       0.00      0.00      0.00         1
                                  Action, Adventure, Animation, Drama, Family, Fantasy       0.00      0.00      0.00         1
                                Action, Adventure,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
