## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

##  Load Dataset

In [4]:
# Load dataset (replace 'movie_data.csv' with your dataset file)
data = pd.read_csv("C:/Users/abhil/Downloads/movie_data.csv")
print(data.head())

# Assuming dataset has columns: 'Plot' and 'Genre'
X = data['Plot']
y = data['Genre']


                                                Plot      Genre
0  A young boy discovers he has magical powers an...    Fantasy
1  A group of friends go on an adventure to destr...  Adventure
2  A detective solves crimes in a futuristic city...     Sci-Fi
3  A couple falls in love despite their families'...    Romance
4  A hero fights to save his city from a gang of ...     Action


## Text Preprocessing

In [5]:
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Apply preprocessing
X = X.apply(preprocess_text)


##  Text Vectorization

In [7]:
# Convert text into numerical format using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X).toarray()


## Train-Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


##  Model Training

In [9]:
# Train a Multinomial Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)


##  Prediction and Evaluation

In [11]:
# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.0

Classification Report:
               precision    recall  f1-score   support

   Adventure       0.00      0.00      0.00       1.0
       Drama       0.00      0.00      0.00       1.0
      Sci-Fi       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## User Input Prediction

In [12]:
def predict_genre(plot_summary):
    processed_summary = preprocess_text(plot_summary)
    summary_tfidf = tfidf.transform([processed_summary])
    prediction = model.predict(summary_tfidf)
    return prediction[0]

# Test the function
plot = "A young boy discovers he has magical powers and attends a school of witchcraft."
print("Predicted Genre:", predict_genre(plot))


Predicted Genre: Fantasy


## Learning outcomes

*Text Preprocessing:* Clean and prepare textual data for machine learning models.
                                                    
*Feature Engineering:* Transform text into numerical features using TF-IDF.

*Classification Models:* Train and evaluate text classification models.

*Practical Application:* Build a predictive system for real-world use cases.
