In [None]:
"""
Create a machine learning model that can predict the genre of a
movie based on its plot summary or other textual information. You
can use techniques like TF-IDF or word embeddings with classifiers
such as Naive Bayes, Logistic Regression, or Support Vector Machines.

"""

'"\nCreate a machine learning model that can predict the genre of a\nmovie based on its plot summary or other textual information. You\ncan use techniques like TF-IDF or word embeddings with classifiers\n\nsuch as Naive Bayes, Logistic Regression, or Support Vector Machines.\n\n'

In [None]:
import os

# Enumerate the current working directory so the expected data files can be confirmed present.
files = os.listdir()
print(files)


['.config', 'test_data.txt', 'test_data_solution.txt', 'description.txt', 'train_data.txt', 'sample_data']


In [None]:
import pandas as pd
import warnings

# NOTE(review): this filter is process-wide, so warnings raised by any later cell
# are hidden as well, not just the ones from this cell.
warnings.filterwarnings("ignore")   # Ignore all warnings from this point on

# Read 'description.txt' with pandas' default CSV settings purely to eyeball its contents.
# The resulting frame is only displayed here; no later visible cell reads `data`.
data = pd.read_csv("description.txt")
data

Unnamed: 0,Train data:
0,ID ::: TITLE ::: GENRE ::: DESCRIPTION
1,ID ::: TITLE ::: GENRE ::: DESCRIPTION
2,ID ::: TITLE ::: GENRE ::: DESCRIPTION
3,ID ::: TITLE ::: GENRE ::: DESCRIPTION
4,Test data:
5,ID ::: TITLE ::: DESCRIPTION
6,ID ::: TITLE ::: DESCRIPTION
7,ID ::: TITLE ::: DESCRIPTION
8,ID ::: TITLE ::: DESCRIPTION
9,Source:


In [None]:
# Function to load and parse data from text files (split on ':::' delimiter)

def load_data(filepath, delimiter=':::'):
    """Parse a delimiter-separated text file into a list of records.

    Each non-blank line is split on ``delimiter`` and every resulting field is
    stripped of surrounding whitespace, so a line such as
    ``"1 ::: Title ::: drama ::: Plot"`` becomes ``['1', 'Title', 'drama', 'Plot']``
    rather than carrying the padding spaces into IDs and genre labels.

    Args:
        filepath: Path of the UTF-8 encoded text file to read.
        delimiter: Field separator; defaults to ``':::'``.

    Returns:
        A list with one entry per non-blank line, each entry being the list of
        stripped field strings found on that line.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; skipping them avoids rows made of empty/None cells.
            if not line:
                continue
            records.append([field.strip() for field in line.split(delimiter)])

    return records


In [None]:
# Labelled training split: one row per movie, including its genre.
train_data = load_data("train_data.txt")
train_df = pd.DataFrame(data=train_data, columns=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])

# Unlabelled test split: the genre column is absent and is what the models predict.
test_data = load_data("test_data.txt")
test_df = pd.DataFrame(data=test_data, columns=['ID', 'TITLE', 'DESCRIPTION'])

# Ground-truth genres for the test split, kept for scoring the predictions.
test_solution = load_data("test_data_solution.txt")
test_solution_df = pd.DataFrame(data=test_solution, columns=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])

In [None]:

# Print a label, then let the notebook render the training frame as the cell's result
print("Train data:")
train_df

Train data:


Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [None]:

# Print a label, then let the notebook render the unlabelled test frame as the cell's result
print("Test_data:")
test_df

Test_data:


Unnamed: 0,ID,TITLE,DESCRIPTION
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [None]:

# Print a label, then let the notebook render the test-solution frame as the cell's result
print("Test_solution_data:")
test_solution_df

Test_solution_data:


Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a mart...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),adult,A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard..."


In [None]:
# Turn each plot description into a TF-IDF vector (at most 10000 features)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)

# Fit on the training descriptions only, then reuse the fitted
# vectorizer to transform the test descriptions.
X_train_tfidf = vectorizer.fit_transform(train_df["DESCRIPTION"])
X_test_tfidf = vectorizer.transform(test_df["DESCRIPTION"])

# Report the dimensions of both feature matrices
print(f"Training data shape:{X_train_tfidf.shape}")
print(f"Test data shape:{X_test_tfidf.shape}")



Training data shape:(54214, 10000)
Test data shape:(54200, 10000)


In [None]:
# Map every genre string to an integer class id for the classifiers
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['GENRE'])

# classes_ holds the distinct genre labels the encoder learned
print(f"Unique genres in the training data:{label_encoder.classes_}")



Unique genres in the training data:[' action ' ' adult ' ' adventure ' ' animation ' ' biography ' ' comedy '
 ' crime ' ' documentary ' ' drama ' ' family ' ' fantasy ' ' game-show '
 ' history ' ' horror ' ' music ' ' musical ' ' mystery ' ' news '
 ' reality-tv ' ' romance ' ' sci-fi ' ' short ' ' sport ' ' talk-show '
 ' thriller ' ' war ' ' western ']


In [None]:
# Train a Logistic Regression model
from sklearn.linear_model import LogisticRegression

# 'saga' is a stochastic solver that shuffles the data, so the seed is fixed to
# make the fitted model (and the metrics computed from it) reproducible.
lr_model = LogisticRegression(max_iter=200, solver='saga', random_state=42)
lr_model.fit(X_train_tfidf, y_train)

# Predict integer class ids for the test descriptions
y_pred = lr_model.predict(X_test_tfidf)

# Convert predicted integer labels back to genre names
predicted_genres = label_encoder.inverse_transform(y_pred)

# Attach the predictions to the test frame and show them next to the titles
test_df['Predicted_Genre'] = predicted_genres
test_df[['TITLE', 'Predicted_Genre']]


Unnamed: 0,TITLE,Predicted_Genre
0,Edgar's Lunch (1998),drama
1,La guerra de papá (1977),drama
2,Off the Beaten Track (2010),documentary
3,Meu Amigo Hindu (2015),drama
4,Er nu zhai (1955),drama
...,...,...
54195,"""Tales of Light & Dark"" (2013)",drama
54196,Der letzte Mohikaner (1965),drama
54197,Oliver Twink (2007),comedy
54198,Slipstream (1973),drama


In [None]:
# Line up each test movie's true genre with the Logistic Regression prediction.
# test_df already carries 'Predicted_Genre' from the training cell, so it is not
# re-assigned here.
# validate='one_to_one' makes the merge fail loudly if an ID is repeated on either
# side, instead of silently multiplying rows and skewing the metrics.
merged_df = pd.merge(
    test_solution_df[['ID', 'GENRE']],
    test_df[['ID', 'Predicted_Genre']],
    on='ID',
    validate='one_to_one',
)
merged_df

Unnamed: 0,ID,GENRE,Predicted_Genre
0,1,thriller,drama
1,2,comedy,drama
2,3,documentary,documentary
3,4,drama,drama
4,5,drama,drama
...,...,...,...
54195,54196,horror,drama
54196,54197,western,drama
54197,54198,adult,comedy
54198,54199,drama,drama


In [None]:
# Score the Logistic Regression predictions against the ground-truth genres
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(merged_df['GENRE'], merged_df['Predicted_Genre'])
print(f'Accuracy:{accuracy:.4f}')

# Per-genre precision / recall / F1 / support
print("\nClassification report")
lr_report = classification_report(merged_df['GENRE'], merged_df['Predicted_Genre'])
print(lr_report)


Accuracy:0.5945

Classification report
               precision    recall  f1-score   support

      action        0.51      0.29      0.37      1314
       adult        0.65      0.24      0.35       590
   adventure        0.67      0.16      0.25       775
   animation        0.59      0.04      0.08       498
   biography        0.00      0.00      0.00       264
      comedy        0.54      0.60      0.57      7446
       crime        0.41      0.03      0.06       505
 documentary        0.68      0.87      0.76     13096
       drama        0.55      0.79      0.65     13612
      family        0.49      0.08      0.14       783
     fantasy        0.65      0.03      0.06       322
   game-show        0.90      0.49      0.64       193
     history        0.00      0.00      0.00       243
      horror        0.66      0.57      0.61      2204
       music        0.68      0.46      0.55       731
     musical        0.44      0.01      0.03       276
     mystery        0.33 

In [None]:
# Fit a multinomial Naive Bayes model on the same TF-IDF features for comparison
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)


In [None]:
# Predict integer class ids with Naive Bayes and map them back to genre names
y_pred_nb = nb_model.predict(X_test_tfidf)
predicted_genres_nb = label_encoder.inverse_transform(y_pred_nb)

# Store predictions in the DataFrame
test_df['Predicted_Genre_NB'] = predicted_genres_nb

# Merge actual and predicted genres to evaluate Naive Bayes.
# validate='one_to_one' makes the merge fail loudly if an ID is repeated on either
# side, instead of silently multiplying rows and skewing the metrics.
merged_df_nb = pd.merge(
    test_solution_df,
    test_df[['ID', 'Predicted_Genre_NB']],
    on='ID',
    validate='one_to_one',
)


In [None]:
from sklearn.metrics import accuracy_score,classification_report
# Accuracy and classification report for Naive Bayes
accuracy_nb = accuracy_score(merged_df_nb['GENRE'], merged_df_nb['Predicted_Genre_NB'])
print(f'Naive Bayes Accuracy: {accuracy_nb}')
print("Naive Bayes Classfication Report")
# Both columns already hold genre strings, so the report labels its rows from the
# data itself. Passing target_names=label_encoder.classes_ was redundant and would
# raise (or mislabel rows) whenever the evaluated label set differs from the set
# of training classes.
print(classification_report(merged_df_nb['GENRE'], merged_df_nb['Predicted_Genre_NB']))


Naive Bayes Accuracy: 0.5092435424354244
Naive Bayes Classfication Report
               precision    recall  f1-score   support

      action        0.56      0.03      0.06      1314
       adult        0.46      0.02      0.04       590
   adventure        0.77      0.04      0.08       775
   animation        0.00      0.00      0.00       498
   biography        0.00      0.00      0.00       264
      comedy        0.53      0.40      0.46      7446
       crime        0.00      0.00      0.00       505
 documentary        0.56      0.89      0.69     13096
       drama        0.44      0.84      0.58     13612
      family        0.00      0.00      0.00       783
     fantasy        0.00      0.00      0.00       322
   game-show        1.00      0.02      0.04       193
     history        0.00      0.00      0.00       243
      horror        0.77      0.23      0.35      2204
       music        0.89      0.02      0.05       731
     musical        0.00      0.00      0.00 