In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

In [24]:
# Load the training set: four ':::'-separated fields per record, no header row.
# The progress bar only brackets the single read_csv call; it jumps to 50/50
# once the read has returned.
try:
    with tqdm(total=50, desc="Loading Train Data") as progress:
        train_data = pd.read_csv(
            "train_data.txt",
            sep=':::',
            header=None,
            names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'],
            engine='python',  # multi-character separators require the python parser
        )
        progress.update(50)
except Exception as load_error:
    # Report the problem, then let the original exception propagate.
    print(f"Error loading train_data: {load_error}")
    raise

Loading Train Data: 100%|██████████| 50/50 [00:00<00:00, 108.53it/s]


In [25]:
# Distinct values found in the GENRE column of the training data.
genre_list = train_data.loc[:, "GENRE"].unique()
print(genre_list)

[' drama ' ' thriller ' ' adult ' ' documentary ' ' comedy ' ' crime '
 ' reality-tv ' ' horror ' ' sport ' ' animation ' ' action ' ' fantasy '
 ' short ' ' sci-fi ' ' music ' ' adventure ' ' talk-show ' ' western '
 ' family ' ' mystery ' ' history ' ' news ' ' biography ' ' romance '
 ' game-show ' ' musical ' ' war ']


In [26]:
# Fallback genre for movies the model finds hard to predict
# (used when no genre label is predicted for a movie).
fallback_genre = "Unknown"

In [6]:
# data preprocessing
# Plots are cast to str first (so missing values become text) and lower-cased.
X_train = train_data['MOVIE_PLOT'].astype(str).str.lower()

# The raw GENRE values keep the blanks that surround the ':::' separator
# (the printed genre list shows e.g. ' drama '). Each label is therefore stripped
# before binarizing so the learned classes are clean names. Splitting on ',' and
# stripping accepts both "a, b" and "a,b"; empty fragments are dropped.
genre_labels = [
    [label.strip() for label in genre.split(',') if label.strip()]
    for genre in train_data['GENRE']
]

# One binary indicator column per distinct genre label.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)

In [7]:
# TF-IDF vectorizer used for both the training and the test plots.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # vocabulary size cap; adjust as needed
)

In [8]:
# Fit the vectorizer on the training plots and encode them in the same step.
# The bar is advanced once, after fit_transform has returned.
progress = tqdm(total=50, desc="Vectorizing Training Data")
with progress:
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    progress.update(50)

Vectorizing Training Data:   0%|          | 0/50 [00:00<?, ?it/s]

Vectorizing Training Data: 100%|██████████| 50/50 [00:05<00:00,  8.78it/s]


In [9]:
# Multinomial Naive Bayes wrapped in MultiOutputClassifier, fitted on the
# TF-IDF matrix against the binarized genre matrix.
with tqdm(total=50, desc="Training Model") as progress:
    naive_bayes = MultinomialNB()
    multi_output_classifier = MultiOutputClassifier(estimator=naive_bayes)
    multi_output_classifier.fit(X_train_tfidf, y_train)
    progress.update(50)

Training Model: 100%|██████████| 50/50 [00:00<00:00, 64.96it/s]


In [10]:
# Load the test set: three ':::'-separated fields per record (no GENRE column),
# no header row. The bar jumps to 50/50 once read_csv has returned.
try:
    with tqdm(total=50, desc="Loading Test Data") as progress:
        test_data = pd.read_csv(
            "test_data.txt",
            sep=':::',
            header=None,
            names=['SerialNumber', 'MOVIE_NAME', 'MOVIE_PLOT'],
            engine='python',  # multi-character separators require the python parser
        )
        progress.update(50)
except Exception as load_error:
    # Report the problem, then let the original exception propagate.
    print(f"Error loading test_data: {load_error}")
    raise

Loading Test Data: 100%|██████████| 50/50 [00:00<00:00, 101.30it/s]


In [11]:
# Same text normalisation as the training plots: cast to str, then lower-case.
X_test = test_data['MOVIE_PLOT'].astype(str).str.lower()

In [12]:
# Encode the test plots with the already-fitted vectorizer (transform only,
# no refit). The bar is advanced once, after transform has returned.
progress = tqdm(total=50, desc="Vectorizing Test Data")
with progress:
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    progress.update(50)

Vectorizing Test Data:   0%|          | 0/50 [00:00<?, ?it/s]

Vectorizing Test Data: 100%|██████████| 50/50 [00:05<00:00,  9.16it/s]


In [13]:
# Predict the genre indicator matrix for the vectorized test plots.
progress = tqdm(total=50, desc="Predicting on Test Data")
with progress:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    progress.update(50)

Predicting on Test Data: 100%|██████████| 50/50 [00:00<00:00, 86.08it/s]


In [14]:
# Decode the predicted indicator rows back into genre labels and pair them
# with the test movie names in a results frame.
predicted_genres = mlb.inverse_transform(y_pred)
test_movie_names = test_data['MOVIE_NAME']
test_results = pd.DataFrame(
    {
        'MOVIE_NAME': test_movie_names,
        'PREDICTED_GENRES': predicted_genres,
    }
)

In [15]:
# Replace empty predictions with the fallback label, then copy the id and plot
# columns over from the test data.
def _genres_or_fallback(genres):
    """Return `genres` unchanged, or `[fallback_genre]` when it is empty."""
    if len(genres) == 0:
        return [fallback_genre]
    return genres


test_results['PREDICTED_GENRES'] = test_results['PREDICTED_GENRES'].apply(_genres_or_fallback)
test_results['Serial_Number'] = test_data['SerialNumber']
test_results['MOVIE_PLOT'] = test_data['MOVIE_PLOT']

In [16]:
# Write one "id ::: title ::: genres ::: plot" line per test movie.
#
# Values parsed with sep=':::' retain the blanks that surrounded the separator
# (the printed genre list shows e.g. ' drama '), so every text field is stripped
# here; otherwise the output would contain doubled spaces around ':::'.
# Columns are iterated directly instead of via iterrows(), and no loop variable
# is called `id`, so the builtin is not shadowed in the kernel namespace.
with open("test_data_solution.txt", "w", encoding="utf-8") as output_file:
    result_rows = zip(
        test_results['Serial_Number'],
        test_results['MOVIE_NAME'],
        test_results['PREDICTED_GENRES'],
        test_results['MOVIE_PLOT'],
    )
    for serial_number, movie_name, genres, plot in result_rows:
        genre_str = ', '.join(str(genre).strip() for genre in genres)
        fields = (
            str(serial_number).strip(),
            str(movie_name).strip(),
            genre_str,
            str(plot).strip(),
        )
        output_file.write(' ::: '.join(fields) + "\n")

In [17]:
# Predictions on the same matrix the model was fitted on; the score cell
# compares them with y_train.
y_train_pred = multi_output_classifier.predict(X=X_train_tfidf)

In [18]:
# Scores of the training-set predictions against the training labels.
# Precision, recall and F1 are micro-averaged over all genre columns.
accuracy = accuracy_score(y_train, y_train_pred)
precision, recall, f1 = (
    metric(y_train, y_train_pred, average='micro')
    for metric in (precision_score, recall_score, f1_score)
)

In [27]:
# Print the scores.
# They were computed from y_train versus predictions on X_train_tfidf, i.e. on
# the data the model was fitted on, so the heading says so explicitly. They are
# not an estimate of performance on unseen movies.
print("\n\nModel Evaluation Metrics (training set):\n")
print(f"Accuracy: {accuracy * 100:.2f}%\n")
print(f"Precision: {precision:.2f}\n")
print(f"Recall: {recall:.2f}\n")
print(f"F1-score: {f1:.2f}\n")



Model Evaluation Metrics:

Accuracy: 27.73%

Precision: 0.72

Recall: 0.28

F1-score: 0.40

