# Task 1 - MOVIE GENRE CLASSIFICATION

In [1]:
"""
Create a machine learning model that can predict the genre of a movie 
based on its plot summary or other textual information. 
You can use techniques like TF-IDF or word embeddings with classifiers such as 
Naive Bayes, Logistic Regression, or Support Vector Machines.
"""

'\nCreate a machine learning model that can predict the genre of a movie \nbased on its plot summary or other textual information. \nYou can use techniques like TF-IDF or word embeddings with classifiers such as \nNaive Bayes, Logistic Regression, or Support Vector Machines.\n'

In [26]:
import os
# List the current working directory to confirm the dataset files are present.
os.listdir('.')

['.gitkeep',
 'description.txt',
 'Task_1.ipynb',
 'test_data.txt',
 'test_data_solution.txt',
 'train_data.txt']

# 1. Read or Extract data

In [1]:
import pandas as pd                       # Importing pandas package for handling Dataframe 
import warnings                           # Importing warnings package for ignoring warning messages
# Silence every warning for the rest of the session (no category or module
# filter is given, so this applies to all of them).
warnings.filterwarnings(action="ignore")

# The dataset description is plain text, but it is parsed as CSV here: its
# first line becomes the column header and each later line becomes a row.
description_path = "description.txt"
data = pd.read_csv(description_path)
data  # last expression of the cell, so the frame is rendered as its output

Unnamed: 0,Train data:
0,ID ::: TITLE ::: GENRE ::: DESCRIPTION
1,ID ::: TITLE ::: GENRE ::: DESCRIPTION
2,ID ::: TITLE ::: GENRE ::: DESCRIPTION
3,ID ::: TITLE ::: GENRE ::: DESCRIPTION
4,Test data:
5,ID ::: TITLE ::: DESCRIPTION
6,ID ::: TITLE ::: DESCRIPTION
7,ID ::: TITLE ::: DESCRIPTION
8,ID ::: TITLE ::: DESCRIPTION
9,Source:


# 2. Create and apply Function to read data by splitting by ":::"

In [2]:
def load_data(file_path):
    """Parse a ' ::: '-delimited text file into a list of field lists.

    Args:
        file_path: Path to a UTF-8 encoded text file holding one record per
            line, e.g. "ID ::: TITLE ::: GENRE ::: DESCRIPTION".

    Returns:
        A list with one entry per non-blank line, in file order. Each entry is
        the list of strings obtained by stripping the line's surrounding
        whitespace and splitting it on ' ::: '.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream the file line by line instead of materialising it twice.
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # ''.split(' ::: ') == [''], so a blank or whitespace-only line
                # (e.g. a trailing newline at the end of the file) would
                # otherwise become a bogus one-field record.
                continue
            records.append(line.split(' ::: '))
    return records

In [3]:

# Parse the three raw text files into lists of fields first ...
train_data = load_data("train_data.txt")
test_data = load_data("test_data.txt")
test_solution = load_data('test_data_solution.txt')

# ... then wrap each one in a DataFrame with named columns.
# The training file and the solution file share the same 4-field layout;
# the plain test file has no Genre field, so it gets 3 columns.
labelled_columns = ['ID', 'Title', 'Genre', 'Description']
train_df = pd.DataFrame(train_data, columns=labelled_columns)
test_df = pd.DataFrame(test_data, columns=['ID', 'Title', 'Description'])
test_solution_df = pd.DataFrame(test_solution, columns=labelled_columns)


In [5]:
# Preview the parsed training set. The bare expression on the last line is
# what the notebook renders as the cell output.
print("Train Data:")
train_df  # Should have 4 columns: ID, Title, Genre, Description

Train Data:


Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...
...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on B...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The siste...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about gr..."
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and have...


In [6]:
# Preview the parsed (unlabelled) test set. The bare expression on the last
# line is what the notebook renders as the cell output.
print("\nTest Data:")
test_df   # Should have 3 columns: ID, Title, Description


Test Data:


Unnamed: 0,ID,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...
...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Dar..."
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their ...
54197,54198,Oliver Twink (2007),"A movie 169 years in the making. Oliver Twist,..."
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard ..."


In [7]:
# Preview the labelled test set used later as ground truth for evaluation.
print("\nTest Solution:")
test_solution_df  # Should have 4 columns: ID, Title, Genre, Description


Test Solution:


Unnamed: 0,ID,Title,Genre,Description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Dar..."
54196,54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their ...
54197,54198,Oliver Twink (2007),adult,"A movie 169 years in the making. Oliver Twist,..."
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard ..."


# 3. Feature extraction : TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer         # Importing TfidfVectorizer from sklearn
# TF-IDF vectorizer limited to 10,000 features.
vectorizer = TfidfVectorizer(max_features=10000)

# The vectorizer is fitted on the training descriptions only; the test
# descriptions are merely transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(train_df["Description"])
X_test_tfidf = vectorizer.transform(test_df["Description"])

# Report the (rows, features) shape of both matrices.
for split_name, matrix in (("Training", X_train_tfidf), ("Test", X_test_tfidf)):
    print(f"{split_name} data shape: {matrix.shape}")

Training data shape: (54214, 10000)
Test data shape: (54200, 10000)


# 4. Encoding the Target labels

In [9]:
from sklearn.preprocessing import LabelEncoder                          # Importing label encoder from sklearn package
# Fit the encoder on the training genres, then turn those genre strings into
# integer class ids. The fitted encoder is reused in later cells to map
# predicted ids back to genre names.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['Genre'])
y_train = label_encoder.transform(train_df['Genre'])

# classes_ lists the distinct genres the encoder learned.
print(f"Unique genres in the training data: {label_encoder.classes_}")


Unique genres in the training data: ['action' 'adult' 'adventure' 'animation' 'biography' 'comedy' 'crime'
 'documentary' 'drama' 'family' 'fantasy' 'game-show' 'history' 'horror'
 'music' 'musical' 'mystery' 'news' 'reality-tv' 'romance' 'sci-fi'
 'short' 'sport' 'talk-show' 'thriller' 'war' 'western']


# 5. Model Building - Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression         # Importing Logistic regression from sklearn
# Train a logistic-regression classifier (at most 200 solver iterations) on
# the TF-IDF training matrix; fit() returns the fitted estimator itself.
lr_model = LogisticRegression(max_iter=200).fit(X_train_tfidf, y_train)

# Predict integer class ids for the test set, then map them back to genre names.
y_pred = lr_model.predict(X_test_tfidf)
predicted_genres = label_encoder.inverse_transform(y_pred)

# Store the predictions next to the test rows and show title vs. prediction.
test_df['Predicted_Genre'] = predicted_genres
test_df[['Title', 'Predicted_Genre']]


Unnamed: 0,Title,Predicted_Genre
0,Edgar's Lunch (1998),drama
1,La guerra de papá (1977),drama
2,Off the Beaten Track (2010),documentary
3,Meu Amigo Hindu (2015),drama
4,Er nu zhai (1955),drama
...,...,...
54195,"""Tales of Light & Dark"" (2013)",drama
54196,Der letzte Mohikaner (1965),drama
54197,Oliver Twink (2007),comedy
54198,Slipstream (1973),drama


In [11]:
# (Re)attach the logistic-regression predictions to the test frame.
test_df['Predicted_Genre'] = predicted_genres

# Join ground truth and predictions on the shared 'ID' column so every row
# carries both the actual and the predicted genre.
actual_genres = test_solution_df[['ID', 'Genre']]
lr_predictions = test_df[['ID', 'Predicted_Genre']]
merged_df = actual_genres.merge(lr_predictions, on='ID')
merged_df  # rendered as the cell output: ID, Genre, Predicted_Genre

Unnamed: 0,ID,Genre,Predicted_Genre
0,1,thriller,drama
1,2,comedy,drama
2,3,documentary,documentary
3,4,drama,drama
4,5,drama,drama
...,...,...,...
54195,54196,horror,drama
54196,54197,western,drama
54197,54198,adult,comedy
54198,54199,drama,drama


# 6. Model Evaluation - Logistic Regression

In [12]:
from sklearn.metrics import accuracy_score, classification_report               # Importing modules for metrics calculation

# Actual vs. predicted genre names for the logistic-regression model.
lr_true, lr_pred = merged_df['Genre'], merged_df['Predicted_Genre']

# Overall accuracy, printed with 4 decimal places.
accuracy = accuracy_score(lr_true, lr_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-genre precision / recall / F1 / support.
print("\nClassification Report:")
print(classification_report(lr_true, lr_pred))


Accuracy: 0.5948

Classification Report:
              precision    recall  f1-score   support

      action       0.51      0.30      0.37      1314
       adult       0.65      0.24      0.36       590
   adventure       0.67      0.16      0.26       775
   animation       0.61      0.04      0.08       498
   biography       0.00      0.00      0.00       264
      comedy       0.54      0.60      0.57      7446
       crime       0.41      0.03      0.06       505
 documentary       0.68      0.87      0.76     13096
       drama       0.55      0.79      0.65     13612
      family       0.49      0.08      0.14       783
     fantasy       0.61      0.03      0.06       322
   game-show       0.90      0.49      0.64       193
     history       0.00      0.00      0.00       243
      horror       0.66      0.57      0.61      2204
       music       0.68      0.46      0.55       731
     musical       0.44      0.01      0.03       276
     mystery       0.25      0.00      0

# 7. Model Building - Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB           # importing MultinomialNB from sklearn
# Build a multinomial Naive Bayes classifier with default settings and train
# it on the TF-IDF training matrix; fit() returns the fitted estimator itself.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_model  # last expression of the cell: displays the fitted estimator

In [14]:
# Predict integer class ids for the test set with Naive Bayes and map them
# back to genre names.
y_pred_nb = nb_model.predict(X_test_tfidf)
predicted_genres_nb = label_encoder.inverse_transform(y_pred_nb)

# Keep the predictions alongside the test rows, then join them to the ground
# truth on 'ID' for evaluation.
test_df['Predicted_Genre_NB'] = predicted_genres_nb
nb_predictions = test_df[['ID', 'Predicted_Genre_NB']]
merged_df_nb = test_solution_df.merge(nb_predictions, on='ID')

# 8. Model Evaluation - Naive Bayes

In [15]:
from sklearn.metrics import accuracy_score, classification_report                           # Importing modules for evaluation metrics

# Actual vs. predicted genre names for the Naive Bayes model.
nb_true = merged_df_nb['Genre']
nb_pred = merged_df_nb['Predicted_Genre_NB']

# Overall accuracy.
accuracy_nb = accuracy_score(nb_true, nb_pred)
print(f"Naive Bayes Accuracy: {accuracy_nb}")

# Per-genre precision / recall / F1 / support.
# `labels` is passed together with `target_names`: without it, the names are
# applied positionally to whatever labels happen to occur in the data, so the
# call breaks if any genre is missing from both columns. Pinning the labels to
# the encoder's classes keeps each row name tied to its own genre.
print("Naive Bayes Classification Report:")
print(classification_report(
    nb_true,
    nb_pred,
    labels=label_encoder.classes_,
    target_names=label_encoder.classes_,
))


Naive Bayes Accuracy: 0.5092066420664206
Naive Bayes Classification Report:
              precision    recall  f1-score   support

      action       0.57      0.03      0.06      1314
       adult       0.46      0.02      0.04       590
   adventure       0.77      0.04      0.08       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.40      0.46      7446
       crime       0.00      0.00      0.00       505
 documentary       0.56      0.89      0.69     13096
       drama       0.44      0.84      0.58     13612
      family       0.00      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       1.00      0.02      0.04       193
     history       0.00      0.00      0.00       243
      horror       0.77      0.23      0.35      2204
       music       0.89      0.02      0.04       731
     musical       0.00      0.00      0.00       276
     

# 9. Model Building : SVM(Support Vector Machine)

In [16]:
from sklearn.svm import SVC             # Importing SVC module from sklearn
# Build a linear-kernel support vector classifier and train it on the TF-IDF
# training matrix; fit() returns the fitted estimator itself.
# NOTE(review): fitting SVC on the full training matrix may be slow - consider
# timing this cell.
svm_model = SVC(kernel='linear').fit(X_train_tfidf, y_train)
svm_model  # last expression of the cell: displays the fitted estimator

In [17]:
# Predict integer class ids for the test set with the SVM and map them back
# to genre names.
y_pred_svm = svm_model.predict(X_test_tfidf)
predicted_genres_svm = label_encoder.inverse_transform(y_pred_svm)

# Keep the predictions alongside the test rows, then join them to the ground
# truth on 'ID' for evaluation.
test_df['Predicted_Genre_SVM'] = predicted_genres_svm
svm_predictions = test_df[['ID', 'Predicted_Genre_SVM']]
merged_df_svm = test_solution_df.merge(svm_predictions, on='ID')

# 10. Model Evaluation - SVM(Support Vector Machine)

In [18]:
from sklearn.metrics import accuracy_score, classification_report                                   # Importing modules for evaluation metrics

# Actual vs. predicted genre names for the SVM model.
svm_true = merged_df_svm['Genre']
svm_pred = merged_df_svm['Predicted_Genre_SVM']

# Overall accuracy.
accuracy_svm = accuracy_score(svm_true, svm_pred)
print(f"SVM Accuracy: {accuracy_svm}")

# Per-genre precision / recall / F1 / support.
# `labels` is passed together with `target_names`: without it, the names are
# applied positionally to whatever labels happen to occur in the data, so the
# call breaks if any genre is missing from both columns. Pinning the labels to
# the encoder's classes keeps each row name tied to its own genre.
print("SVM Classification Report:")
print(classification_report(
    svm_true,
    svm_pred,
    labels=label_encoder.classes_,
    target_names=label_encoder.classes_,
))


SVM Accuracy: 0.600110701107011
SVM Classification Report:
              precision    recall  f1-score   support

      action       0.44      0.36      0.40      1314
       adult       0.62      0.39      0.48       590
   adventure       0.55      0.22      0.31       775
   animation       0.47      0.13      0.21       498
   biography       0.00      0.00      0.00       264
      comedy       0.55      0.60      0.57      7446
       crime       0.31      0.04      0.06       505
 documentary       0.69      0.86      0.77     13096
       drama       0.56      0.77      0.65     13612
      family       0.52      0.10      0.17       783
     fantasy       0.37      0.06      0.10       322
   game-show       0.84      0.61      0.71       193
     history       0.00      0.00      0.00       243
      horror       0.67      0.60      0.63      2204
       music       0.67      0.51      0.58       731
     musical       0.47      0.03      0.06       276
     mystery       0.3

# Test case

In [25]:
# Ad-hoc sanity check: run the three trained models on a few hand-written plot
# blurbs. Requires vectorizer, label_encoder, lr_model, nb_model and svm_model
# from the cells above to be fitted already.

zoner_Description = [
    'Explosive fight scenes in the city streets',  # Action
    'A haunted mansion that traps its visitors',  # Horror
    'A brave adventurer in search of lost treasure',  # Adventure
    'A forbidden romance in the 1920s',  # Romance
    'A daring rescue mission with a love interest'  # Action
]

# Step 1: Vectorize the blurbs with the already-fitted vectorizer (transform
# only, no refit).
sample_tfidf = vectorizer.transform(zoner_Description)

# Step 2: Predict genre names with every model.
# The results are kept in a dict under fresh names so that the full test-set
# predictions held in y_pred_nb / predicted_genres_nb / y_pred_svm /
# predicted_genres_svm are not overwritten by these 5-element arrays.
sample_models = {'lr': lr_model, 'nb': nb_model, 'svm': svm_model}
sample_genres = {
    key: label_encoder.inverse_transform(model.predict(sample_tfidf))
    for key, model in sample_models.items()
}

# Step 3: Output the predicted genres, first per model, then per story.
print("Predicted Genres using Logistic Regression : ", sample_genres['lr'])
print("Predicted Genres using Naive Bayes         : ", sample_genres['nb'])
print("Predicted Genres using SVM                 : ", sample_genres['svm'])
print()
for message, nb_genre, lr_genre, svm_genre in zip(
    zoner_Description,
    sample_genres['nb'],
    sample_genres['lr'],
    sample_genres['svm'],
):
    print(f"Story : {message}")
    print(f"Status :\tNaive Bayes Prediction         : {nb_genre}")
    print(f"\t\tLogistic Regression Prediction : {lr_genre}")
    print(f"\t\tSVM Prediction                 : {svm_genre}")
    print("="*100)  # Separates each message



Predicted Genres using Logistic Regression :  ['documentary' 'horror' 'adventure' 'drama' 'comedy']
Predicted Genres using Naive Bayes         :  ['documentary' 'horror' 'documentary' 'drama' 'drama']
Predicted Genres using SVM                 :  ['documentary' 'horror' 'adventure' 'drama' 'comedy']

Story : Explosive fight scenes in the city streets
Status :	Naive Bayes Prediction         : documentary
		Logistic Regression Prediction : documentary
		SVM Prediction                 : documentary
Story : A haunted mansion that traps its visitors
Status :	Naive Bayes Prediction         : horror
		Logistic Regression Prediction : horror
		SVM Prediction                 : horror
Story : A brave adventurer in search of lost treasure
Status :	Naive Bayes Prediction         : documentary
		Logistic Regression Prediction : adventure
		SVM Prediction                 : adventure
Story : A forbidden romance in the 1920s
Status :	Naive Bayes Prediction         : drama
		Logistic Regression Predict