## Project DetailsAim -
#### Predict the genre of a movie based on its plot summary and other features.
#### Description - Use natural language processing (NLP) techniques for text classification on a movie dataset.
#### Technologies - Python, NLTK or SpaCy, Scikit-learn.


In [9]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO

import warnings
warnings.filterwarnings("ignore")
# Download required resources
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Step 2: Create Sample Dataset (Movie Plot & Genre)
data = StringIO("""
plot,genre
"A young boy discovers he is a wizard and attends a magical school.",Fantasy
"A team of astronauts travel through a wormhole in space.",Sci-Fi
"A detective investigates a series of murders in a gloomy city.",Thriller
"A couple falls in love during a war and struggle to stay together.",Romance
"A hilarious story of a man who gets stuck in an elevator with his boss.",Comedy
"A haunted house causes strange and terrifying events.",Horror
"Robots take over the earth and a group of survivors fights back.",Action
"A girl with cancer finds love and meaning in life.",Drama
""")

df = pd.read_csv(data)
df.head()


Unnamed: 0,plot,genre
0,A young boy discovers he is a wizard and atten...,Fantasy
1,A team of astronauts travel through a wormhole...,Sci-Fi
2,A detective investigates a series of murders i...,Thriller
3,A couple falls in love during a war and strugg...,Romance
4,A hilarious story of a man who gets stuck in a...,Comedy


In [11]:
# Step 3: Preprocess Text
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess(text):
    words = text.lower().split()
    words = [stemmer.stem(w) for w in words if w.isalpha() and w not in stop_words]
    return " ".join(words)

df['cleaned_plot'] = df['plot'].apply(preprocess)
df['cleaned_plot']

0     young boy discov wizard attend magic
1            team astronaut travel wormhol
2       detect investig seri murder gloomi
3         coupl fall love war struggl stay
4          hilari stori man get stuck elev
5           haunt hous caus strang terrifi
6    robot take earth group survivor fight
7               girl cancer find love mean
Name: cleaned_plot, dtype: object

In [12]:
# Step 4: Vectorization and Label Encoding
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_plot'])

# Convert genres to numeric labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['genre'])
y

array([3, 6, 7, 5, 1, 4, 0, 2])

In [13]:
# Step 5: Train-Test Split & Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

array([1, 1, 1])

In [14]:
import numpy as np

# Get only labels present in y_test
present_labels = np.unique(y_test)
present_names = le.inverse_transform(present_labels)

print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=present_labels, target_names=present_names))



Classification Report:
               precision    recall  f1-score   support

     Fantasy       0.00      0.00      0.00       1.0
      Horror       0.00      0.00      0.00       1.0
      Sci-Fi       0.00      0.00      0.00       1.0

   micro avg       0.00      0.00      0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0

