<a href="https://colab.research.google.com/github/Ayushi-bhutani/Movie_genre_classification/blob/main/movie_genre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Print a few lines from the text files to inspect the format
def print_sample_lines(file_path, num_lines=5):
    """Print up to ``num_lines`` leading lines of a text file.

    Each printed line has its surrounding whitespace (including the newline)
    stripped.

    Args:
        file_path: Path of the text file to preview.
        num_lines: Maximum number of lines to print (default 5).
    """
    # Decode explicitly as UTF-8 so non-ASCII titles read the same on every
    # platform instead of depending on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for _ in range(num_lines):
            line = file.readline()
            if not line:
                # End of file reached: stop instead of printing blank filler lines.
                break
            print(line.strip())

# Preview the first lines of each raw data file, one heading per file
for heading, sample_path in (
    ("Sample lines from train_data.txt:", 'train_data.txt'),
    ("\nSample lines from test_data.txt:", 'test_data.txt'),
    ("\nSample lines from test_data_solution.txt:", 'test_data_solution.txt'),
):
    print(heading)
    print_sample_lines(sample_path)


Sample lines from train_data.txt:
1 ::: Oscar et la dame rose (2009) ::: drama ::: Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.
2 ::: Cupid (1997) ::: thriller ::: A brother and sister with a past incestuous relationship have a current murderous relationship. He murders the women who reject him and she murders the women who get too close to him.
3 ::: Young, Wild and Wonderful (1980) ::: adult ::: As the bus empties the students for their field trip to the Museum of Natural History, little does

In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Load the three CSV files into DataFrames.
# These CSVs are written by the text-to-CSV conversion cell that appears later
# in this notebook, so on a fresh kernel that cell has to run first.
try:
    # on_bad_lines='skip' drops rows pandas cannot parse instead of raising.
    df_train = pd.read_csv('train_data.csv', on_bad_lines='skip')
    df_test = pd.read_csv('test_data.csv', on_bad_lines='skip')
    df_test_solution = pd.read_csv('test_data_solution.csv', on_bad_lines='skip')
except (FileNotFoundError, pd.errors.ParserError) as e:
    print(f"Error reading CSV: {e}")
    # Re-raise: continuing would only fail further down with an unrelated
    # NameError, because the frames above were never assigned.
    raise

# Show each loaded frame's column index to confirm the schema
for caption, loaded_frame in (
    ("Train Data Columns:", df_train),
    ("Test Data Columns:", df_test),
    ("Test Data Solution Columns:", df_test_solution),
):
    print(caption, loaded_frame.columns)



Train Data Columns: Index(['index', 'title', 'genre', 'plot_summary'], dtype='object')
Test Data Columns: Index(['index', 'title', 'genre', 'plot_summary'], dtype='object')
Test Data Solution Columns: Index(['index', 'title', 'genre', 'plot_summary'], dtype='object')


In [None]:
import pandas as pd

# Locations of the raw text inputs: training set, test set, and test-set solution
(
    train_data_txt_path,
    test_data_txt_path,
    test_data_solution_txt_path,
) = (
    'train_data.txt',
    'test_data.txt',
    'test_data_solution.txt',
)

# Process text files with robust splitting
def process_file(file_path):
    """Parse a ``' ::: '``-delimited text file into a four-column DataFrame.

    Every non-blank line becomes one row with the string columns
    ``index``, ``title``, ``genre`` and ``plot_summary``. The split is capped
    at four fields, so a ``' ::: '`` inside the summary text stays in the summary.

    Rows with exactly three fields are read as ``index ::: title ::: plot_summary``
    with an empty ``genre``; padding them on the right instead would park the
    summary text in ``genre`` and leave ``plot_summary`` empty.
    NOTE(review): this assumes three-field rows are unlabeled records (the
    layout test_data.txt appears to use) - confirm against the dataset description.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        pandas.DataFrame with columns ``['index', 'title', 'genre', 'plot_summary']``.
    """
    columns = ['index', 'title', 'genre', 'plot_summary']
    rows = []
    # Explicit UTF-8: titles contain non-ASCII characters, and the locale default
    # encoding differs between platforms.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no record; skipping them avoids all-empty rows.
                continue
            parts = stripped.split(' ::: ', 3)  # Limit split to 4 parts
            if len(parts) == 3:
                # No genre label present: keep the trailing text as the summary.
                parts.insert(2, '')
            elif len(parts) < 4:
                # Anything shorter is padded on the right with empty strings.
                parts.extend([''] * (4 - len(parts)))
            rows.append(parts)
    return pd.DataFrame(rows, columns=columns)

# Parse the three raw text files, in train / test / solution order
train_data, test_data, test_data_solution = (
    process_file(raw_path)
    for raw_path in (
        train_data_txt_path,
        test_data_txt_path,
        test_data_solution_txt_path,
    )
)

# Debug view: column index followed by the first rows of each parsed frame
for caption, parsed_frame in (
    ("Train Data Columns:", train_data),
    ("Test Data Columns:", test_data),
    ("Test Data Solution Columns:", test_data_solution),
):
    print(caption, parsed_frame.columns)
    print(parsed_frame.head())

# Destination CSV paths, one per parsed frame
train_data_csv_path, test_data_csv_path, test_data_solution_csv_path = (
    'train_data.csv',
    'test_data.csv',
    'test_data_solution.csv',
)

# Write each frame to its CSV without the positional row index
for parsed_frame, csv_path in zip(
    (train_data, test_data, test_data_solution),
    (train_data_csv_path, test_data_csv_path, test_data_solution_csv_path),
):
    parsed_frame.to_csv(csv_path, index=False)

print(f'Files saved to {train_data_csv_path}, {test_data_csv_path}, and {test_data_solution_csv_path}')


Train Data Columns: Index(['index', 'title', 'genre', 'plot_summary'], dtype='object')
  index                             title     genre  \
0     1      Oscar et la dame rose (2009)     drama   
1     2                      Cupid (1997)  thriller   
2     3  Young, Wild and Wonderful (1980)     adult   
3     4             The Secret Sin (1915)     drama   
4     5            The Unrecovered (2007)     drama   

                                        plot_summary  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
Test Data Columns: Index(['index', 'title', 'genre', 'plot_summary'], dtype='object')
  index                        title  \
0     1         Edgar's Lunch (1998)   
1     2     La guerra de papá (1977)   
2     3  Off the Beaten Track (2010)   
3     

In [None]:
# Reload the saved CSV files and preview each one
def _read_and_preview(csv_name, heading):
    """Print ``heading``, read ``csv_name`` with pandas, print its first rows, and return the frame."""
    print(heading)
    loaded = pd.read_csv(csv_name)
    print(loaded.head())
    return loaded


df_train = _read_and_preview('train_data.csv', "Inspecting saved train_data.csv:")
df_test = _read_and_preview('test_data.csv', "\nInspecting saved test_data.csv:")
df_test_solution = _read_and_preview('test_data_solution.csv', "\nInspecting saved test_data_solution.csv:")


Inspecting saved train_data.csv:
   index                             title     genre  \
0      1      Oscar et la dame rose (2009)     drama   
1      2                      Cupid (1997)  thriller   
2      3  Young, Wild and Wonderful (1980)     adult   
3      4             The Secret Sin (1915)     drama   
4      5            The Unrecovered (2007)     drama   

                                        plot_summary  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  

Inspecting saved test_data.csv:
   index                        title  \
0      1         Edgar's Lunch (1998)   
1      2     La guerra de papá (1977)   
2      3  Off the Beaten Track (2010)   
3      4       Meu Amigo Hindu (2015)   
4      5            Er nu zhai (1955)   

                    

In [None]:
# Download the NLTK stop-word corpus, then collect its English entries into a set
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Check that each frame carries the columns the later cells read.
# A failed check raises instead of calling exit(): exit() is an interactive-shell
# helper, whereas an exception stops the cell as an ordinary, visible error.
if not {'plot_summary', 'genre'}.issubset(df_train.columns):
    raise ValueError("Required columns are missing in the training data.")
if 'plot_summary' not in df_test.columns:
    raise ValueError("Required columns are missing in the test data.")
if 'genre' not in df_test_solution.columns:
    raise ValueError("Required columns are missing in the test solution data.")

In [None]:
# Column index and first rows of each frame parsed from the raw text files
for caption, parsed_frame in (
    ("Train Data Columns:", train_data),
    ("Test Data Columns:", test_data),
    ("Test Data Solution Columns:", test_data_solution),
):
    print(caption, parsed_frame.columns)
    print(parsed_frame.head())

Train Data Columns: Index(['index', 'title', 'genre', 'plot_summary'], dtype='object')
  index                             title     genre  \
0     1      Oscar et la dame rose (2009)     drama   
1     2                      Cupid (1997)  thriller   
2     3  Young, Wild and Wonderful (1980)     adult   
3     4             The Secret Sin (1915)     drama   
4     5            The Unrecovered (2007)     drama   

                                        plot_summary  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
Test Data Columns: Index(['index', 'title', 'genre', 'plot_summary'], dtype='object')
  index                        title  \
0     1         Edgar's Lunch (1998)   
1     2     La guerra de papá (1977)   
2     3  Off the Beaten Track (2010)   
3     

In [None]:
def preprocess_text(text):
    """Remove one- and two-character words from ``text`` and normalise its whitespace.

    Args:
        text: Raw text. Any non-string value (for example NaN or None) is
            treated as missing.

    Returns:
        The cleaned string with whitespace runs collapsed to single spaces and
        no leading/trailing whitespace; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; treat them as empty text.
        return ""
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove short words
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    # Dropping a short word at either end leaves a stray space behind.
    return text.strip()

In [None]:
# Names of the text and label columns.
# NOTE(review): no other visible cell reads this list - confirm whether it is still needed.
required_columns = ['plot_summary', 'genre']

In [None]:
# Replace missing summaries with an empty string, then coerce every value to str.
# The order matters: casting first would turn NaN into the literal text 'nan',
# leaving nothing for fillna('') to fill.
df_train['plot_summary'] = df_train['plot_summary'].fillna('').astype(str)
df_test['plot_summary'] = df_test['plot_summary'].fillna('').astype(str)


In [None]:
# Collect the rows whose plot_summary value is not a str instance (NaN is a float, so it counts)
def _is_non_string(value):
    """Return True when ``value`` is not a ``str`` instance."""
    return not isinstance(value, str)


non_string_train = df_train.loc[df_train['plot_summary'].apply(_is_non_string)]
non_string_test = df_test.loc[df_test['plot_summary'].apply(_is_non_string)]

print("Non-string entries in train data:\n", non_string_train)
print("Non-string entries in test data:\n", non_string_test)


Non-string entries in train data:
 Empty DataFrame
Columns: [index, title, genre, plot_summary]
Index: []
Non-string entries in test data:
 Empty DataFrame
Columns: [index, title, genre, plot_summary]
Index: []


In [None]:
def preprocess_text(text):
    """Drop one- and two-character words from ``text`` and tidy its whitespace.

    Non-string input yields ``""``. For strings, every whitespace run is
    collapsed to a single space and the result is stripped at both ends.
    """
    # Guard clause: anything that is not a str is treated as empty text.
    if not isinstance(text, str):
        return ""
    without_short_words = re.sub(r'\b\w{1,2}\b', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_short_words)
    return single_spaced.strip()

# Clean the plot_summary column of the training frame, then of the test frame
for frame in (df_train, df_test):
    frame['plot_summary'] = frame['plot_summary'].apply(preprocess_text)


In [None]:
# TF-IDF features over unigrams and bigrams, capped at the 5000 highest-ranked terms.
# TfidfVectorizer takes custom stop words as a list, so `stop_words_list` is built
# here from the `stop_words` set (sorted to keep the list order reproducible).
stop_words_list = sorted(stop_words)
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_list, max_features=5000, ngram_range=(1, 2))
# Fit the vocabulary on the training text only; the test text is merely transformed.
X_train_tfidf = tfidf_vectorizer.fit_transform(df_train['plot_summary'])
X_test_tfidf = tfidf_vectorizer.transform(df_test['plot_summary'])

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Single WordNetLemmatizer instance, used by preprocess_text below.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the WordNet POS constant selected by its first letter.

    Tags starting with ``J``, ``V``, ``N`` and ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively; every
    other tag falls back to ``wordnet.NOUN``. The visible caller passes tags
    produced by ``nltk.pos_tag``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def preprocess_text(text):
    """Lower-case, strip short words from, tokenize and lemmatize ``text``.

    Steps for string input: lower-case, remove one- and two-character words,
    collapse whitespace runs, tokenize with ``word_tokenize``, lemmatize each
    token using the POS tag from ``nltk.pos_tag``, and join the lemmas with
    single spaces.

    NOTE(review): tokenizing, tagging and lemmatizing need NLTK data packages
    beyond ``stopwords``, the only one downloaded in the visible cells -
    confirm they are available before applying this function.

    Args:
        text: Raw text. Any non-string value (for example NaN or None) is
            treated as missing.

    Returns:
        The space-joined lemmas, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # Return empty text rather than passing NaN/None through unchanged,
        # where it would break later string-only steps.
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove short words
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    tokens = word_tokenize(text)
    # Lemmatize each token under the WordNet POS that matches its tag.
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in nltk.pos_tag(tokens)]
    return ' '.join(tokens)


In [None]:
# Learn the genre-to-integer mapping from the training labels, then encode both label sets with it
label_encoder = LabelEncoder().fit(df_train['genre'])
y_train_encoded = label_encoder.transform(df_train['genre'])
y_test_encoded = label_encoder.transform(df_test_solution['genre'])  # Encode test labels


In [None]:
# Multinomial Naive Bayes with a 5-fold, accuracy-scored grid search over `alpha`.
# GridSearchCV is provided by the notebook's main import cell, so this cell also
# runs on a fresh kernel.
nb_model = MultinomialNB()
param_grid = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}
grid_search = GridSearchCV(nb_model, param_grid, cv=5, scoring='accuracy')

In [None]:
# Run the grid search on the TF-IDF training matrix and the encoded training labels.
grid_search.fit(X_train_tfidf, y_train_encoded)

In [None]:
from sklearn.model_selection import GridSearchCV

# Example for Naive Bayes: tune `alpha` with a 5-fold, accuracy-scored grid search
nb_model = MultinomialNB()
param_grid = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}
grid_search = GridSearchCV(nb_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train_encoded)

print(f'Best parameters for Naive Bayes: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_}')

# Score the best estimator found by the search on the held-out test features.
best_nb_model = grid_search.best_estimator_
nb_pred = best_nb_model.predict(X_test_tfidf)
print("Naive Bayes Accuracy with Grid Search:")
print(accuracy_score(y_test_encoded, nb_pred))
# zero_division=0 reports 0.0 for classes that are never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(y_test_encoded, nb_pred, target_names=label_encoder.classes_, zero_division=0))


Best parameters for Naive Bayes: {'alpha': 0.1}
Best cross-validation score: 0.5622348339023289
Naive Bayes Accuracy with Grid Search:
0.2511439114391144
              precision    recall  f1-score   support

      action       0.00      0.00      0.00      1314
       adult       0.00      0.00      0.00       590
   adventure       0.00      0.00      0.00       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.00      0.00      0.00      7446
       crime       0.00      0.00      0.00       505
 documentary       0.00      0.00      0.00     13096
       drama       0.25      1.00      0.40     13612
      family       0.00      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.00      0.00      0.00       193
     history       0.00      0.00      0.00       243
      horror       0.00      0.00      0.00      2204
       music       0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
