<a href="https://colab.research.google.com/github/Arky001/Movie-Genre-Classification/blob/main/Movie_Genre_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mount Google Drive**

In [1]:
# Mount Google Drive to access dataset
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


**Load Dataset**

In [2]:
# Dataset files from Google Drive
train_path = '/content/drive/MyDrive/Genre-Classification-Dataset/train_data.txt'
test_path = '/content/drive/MyDrive/Genre-Classification-Dataset/test_data.txt'
solution_path = '/content/drive/MyDrive/Genre-Classification-Dataset/test_data_solution.txt'


**Import Required Libraries**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

**Load and Preprocess Data**

In [6]:
# Load training data (with ID, TITLE, GENRE, DESCRIPTION)
train_df = pd.read_csv(train_path, sep=':::', engine='python', names=['ID', 'Title', 'Genre', 'Description'])

# Load test data (with ID, TITLE, DESCRIPTION)
test_df = pd.read_csv(test_path, sep=':::', engine='python', names=['ID', 'Title', 'Description'])

# Load test solution data (with ID, GENRE)
solution_df = pd.read_csv(solution_path, sep=':::', engine='python', names=['ID', 'Genre'])

# Display first few rows of each
print("Train Data:")
print(train_df.head())

print("\nTest Data:")
print(test_df.head())

print("\nTest Data Solution:")
print(solution_df.head())


Train Data:
   ID                               Title       Genre  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         Description  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  

Test Data:
   ID                          Title  \
0   1          Edgar's Lunch (1998)    
1   2      La guerra de papá (1977)    
2   3   Off the Beaten Track (2010)    
3   4        Meu Amigo Hindu (2015)    
4   5             Er nu zhai (1955)    

                                         Description  
0   L.R

**Text Cleaning + Genre Splitting**

In [7]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Basic text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation and numbers
    text = " ".join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

# Apply cleaning to descriptions
train_df['Cleaned_Description'] = train_df['Description'].apply(clean_text)
test_df['Cleaned_Description'] = test_df['Description'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Genre Preprocessing (for train data)**

In [8]:
# Handle multiple genres — split by comma and strip
train_df['Genre_List'] = train_df['Genre'].apply(lambda x: [g.strip() for g in x.split(',')])

# Initialize MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(train_df['Genre_List'])

# Show all genre classes
print("Genre classes:", mlb.classes_)

Genre classes: ['action' 'adult' 'adventure' 'animation' 'biography' 'comedy' 'crime'
 'documentary' 'drama' 'family' 'fantasy' 'game-show' 'history' 'horror'
 'music' 'musical' 'mystery' 'news' 'reality-tv' 'romance' 'sci-fi'
 'short' 'sport' 'talk-show' 'thriller' 'war' 'western']


**Train/Validation Split**

In [9]:
from sklearn.model_selection import train_test_split

# Features and Labels
X = train_df['Cleaned_Description']
y = mlb.transform(train_df['Genre_List'])  # ensure correct format

# Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

**TF-IDF Vectorization**

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit vectorizer on training text only
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(test_df['Cleaned_Description'])

**Model Training and Evaluation**

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Train logistic regression
lr_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
lr_clf.fit(X_train_vec, y_train)

# Predict on validation data
y_val_pred_lr = lr_clf.predict(X_val_vec)

# Evaluation
print("Logistic Regression Classification Report:")
print(classification_report(y_val, y_val_pred_lr, target_names=mlb.classes_))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

      action       0.86      0.05      0.09       263
       adult       0.78      0.06      0.12       112
   adventure       0.43      0.04      0.08       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.74      0.29      0.42      1443
       crime       0.00      0.00      0.00       107
 documentary       0.81      0.67      0.73      2659
       drama       0.68      0.46      0.55      2697
      family       1.00      0.02      0.04       150
     fantasy       0.00      0.00      0.00        74
   game-show       1.00      0.23      0.37        40
     history       0.00      0.00      0.00        45
      horror       0.82      0.32      0.46       431
       music       0.67      0.23      0.34       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Multinomial Naive Bayes**

In [12]:
from sklearn.naive_bayes import MultinomialNB

nb_clf = OneVsRestClassifier(MultinomialNB())
nb_clf.fit(X_train_vec, y_train)

y_val_pred_nb = nb_clf.predict(X_val_vec)

print("Multinomial Naive Bayes Classification Report:")
print(classification_report(y_val, y_val_pred_nb, target_names=mlb.classes_))


Multinomial Naive Bayes Classification Report:
              precision    recall  f1-score   support

      action       0.00      0.00      0.00       263
       adult       0.00      0.00      0.00       112
   adventure       0.00      0.00      0.00       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.69      0.09      0.16      1443
       crime       0.00      0.00      0.00       107
 documentary       0.75      0.64      0.69      2659
       drama       0.67      0.37      0.47      2697
      family       0.00      0.00      0.00       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.00      0.00      0.00        40
     history       0.00      0.00      0.00        45
      horror       1.00      0.04      0.08       431
       music       1.00      0.01      0.03       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest (Slower but may be tried)

In [13]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
rf_clf.fit(X_train_vec, y_train)

y_val_pred_rf = rf_clf.predict(X_val_vec)

print("Random Forest Classification Report:")
print(classification_report(y_val, y_val_pred_rf, target_names=mlb.classes_))


Random Forest Classification Report:
              precision    recall  f1-score   support

      action       0.00      0.00      0.00       263
       adult       1.00      0.02      0.04       112
   adventure       0.33      0.01      0.01       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.76      0.14      0.23      1443
       crime       1.00      0.01      0.02       107
 documentary       0.84      0.51      0.63      2659
       drama       0.74      0.15      0.25      2697
      family       0.80      0.03      0.05       150
     fantasy       0.00      0.00      0.00        74
   game-show       1.00      0.12      0.22        40
     history       0.00      0.00      0.00        45
      horror       0.73      0.02      0.04       431
       music       1.00      0.03      0.07       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00      0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Final Step (Optional but Recommended for Completion)**

In [14]:
# Predict genres for test data using the best model (e.g., Logistic Regression)
test_pred = lr_clf.predict(X_test_vec)

# Convert predictions back to genre labels
predicted_genres = mlb.inverse_transform(test_pred)

# Add predictions to the test DataFrame
test_df['Predicted Genres'] = predicted_genres

# Save to CSV (for GitHub or analysis)
test_df[['ID', 'Predicted Genres']].to_csv('submission.csv', index=False)

# View first few rows
test_df[['ID', 'Predicted Genres']].head()


Unnamed: 0,ID,Predicted Genres
0,1,()
1,2,"(drama,)"
2,3,"(documentary,)"
3,4,"(drama,)"
4,5,()
