In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Genre names known to this notebook, in alphabetical order.
# NOTE(review): 'scifi' is spelled without a hyphen -- confirm it matches the
# label spelling used in the data before comparing against it.
genre_list = [
    "action", "adult", "adventure", "animation", "biography",
    "comedy", "crime", "documentary", "family", "fantasy",
    "game-show", "history", "horror", "music", "musical",
    "mystery", "news", "reality-tv", "romance", "scifi",
    "sport", "talk-show", "thriller", "war", "western",
]

# Placeholder genre name for rows that cannot be assigned one of the above.
fallback_genre = "Unknown"

In [3]:
# Read the ':::'-delimited training file into four named columns.
# The progress bar only marks completion: read_csv is a single call, so the
# bar jumps straight to 100 once the file has been parsed.
try:
    with tqdm(total=100, desc="Loading Train Data") as progress:
        train_data = pd.read_csv(
            'train_data.txt',
            sep=':::',
            engine='python',  # multi-character separators need the python parser
            header=None,
            names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'],
        )
        progress.update(100)
except Exception as load_error:
    # Report the failure, then re-raise so the notebook stops here.
    print(f"Error loading train_data: {load_error}")
    raise


Loading Train Data: 100%|███████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 172.68it/s]


In [4]:
# Lower-case the plot text; astype(str) guarantees every entry is a string
# before the vectorised .str accessor is applied.
X_train = train_data['MOVIE_PLOT'].astype(str).str.lower()

# One list of genre names per row. The fields keep the spaces that surround the
# ':::' delimiter in the file, so each name is stripped; otherwise the padding
# would become part of the class names learned by the binarizer.
genre_labels = [
    [label.strip() for label in genre.split(',')]
    for genre in train_data['GENRE']
]

# Encode the label lists as a binary indicator matrix (one column per genre).
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
# Display the lower-cased training plots; pandas abbreviates the long Series
# to its first and last rows.
X_train

0         listening in to a conversation between his do...
1         a brother and sister with a past incestuous r...
2         as the bus empties the students for their fie...
3         to help their unemployed father make ends mee...
4         the film's title refers not only to the un-re...
                               ...                        
54209     this short-lived nbc live sitcom centered on ...
54210     the next generation of exploitation. the sist...
54211     ze bestaan echt, is a stand-up comedy about g...
54212     walter and vivian live in the country and hav...
54213     on labor day weekend, 1935, the most intense ...
Name: MOVIE_PLOT, Length: 54214, dtype: object

In [6]:
# TF-IDF features (vocabulary capped at 5000 terms) feeding one multinomial
# naive Bayes classifier per genre column.
tfidf_pipeline = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer(max_features=5000)),
        ('classifier', MultiOutputClassifier(estimator=MultinomialNB())),
    ]
)


In [7]:
# Raw term counts (vocabulary capped at 5000 terms) feeding one multinomial
# naive Bayes classifier per genre column.
count_pipeline = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(max_features=5000)),
        ('classifier', MultiOutputClassifier(estimator=MultinomialNB())),
    ]
)

In [8]:
# Fit both pipelines on the same training data. The bar has one step per
# model and is advanced as soon as each fit finishes, so it reflects real
# progress instead of jumping from 0 to 100% after all the work is done.
with tqdm(total=2, desc="Training Models") as pbar:
    tfidf_pipeline.fit(X_train, y_train)
    pbar.update(1)
    count_pipeline.fit(X_train, y_train)
    pbar.update(1)

Training Models: 100%|███████████████████████████████████████████████████████████████| 100/100 [00:14<00:00,  6.79it/s]


In [9]:
# Read the ':::'-delimited test file into three named columns (no GENRE column).
# The progress bar only marks completion: read_csv is a single call, so the
# bar jumps straight to 100 once the file has been parsed.
try:
    with tqdm(total=100, desc="Loading Test Data") as progress:
        test_data = pd.read_csv(
            'test_data.txt',
            sep=':::',
            engine='python',  # multi-character separators need the python parser
            header=None,
            names=['SerialNumber', 'MOVIE_NAME', 'MOVIE_PLOT'],
        )
        progress.update(100)
except Exception as load_error:
    # Report the failure, then re-raise so the notebook stops here.
    print(f"Error loading test_data: {load_error}")
    raise


Loading Test Data: 100%|████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 238.30it/s]


In [10]:
# Apply the same text preparation as for training: coerce every plot to a
# string, then lower-case it.
test_plots = test_data['MOVIE_PLOT'].astype(str)
X_test = test_plots.str.lower()


In [11]:
# Predict genre indicators for the test plots with both models. The bar has
# one step per model and is advanced after each prediction, so it reflects
# real progress instead of jumping from 0 to 100% at the very end.
with tqdm(total=2, desc="Predicting on Test Data") as pbar:
    tfidf_pred = tfidf_pipeline.predict(X_test)
    pbar.update(1)
    count_pred = count_pipeline.predict(X_test)
    pbar.update(1)


Predicting on Test Data: 100%|███████████████████████████████████████████████████████| 100/100 [00:13<00:00,  7.34it/s]


In [12]:
# Evaluate the TF-IDF model.
# NOTE: the scores are computed on the training data itself (the test file is
# loaded without a GENRE column), so they measure fit to the training set and
# are an optimistic estimate of performance on unseen plots.
# Predict once and reuse the result; the model is already fitted, so repeating
# the prediction for every metric only repeats the same work.
tfidf_train_pred = tfidf_pipeline.predict(X_train)

# For multilabel targets accuracy_score is subset accuracy: a row counts as
# correct only if every genre indicator matches.
tfidf_accuracy = accuracy_score(y_train, tfidf_train_pred)
# Micro averaging pools the decisions of all genres before computing the score.
tfidf_precision = precision_score(y_train, tfidf_train_pred, average='micro')
tfidf_recall = recall_score(y_train, tfidf_train_pred, average='micro')
tfidf_f1 = f1_score(y_train, tfidf_train_pred, average='micro')

In [13]:
# Evaluate the count-vectorizer model.
# NOTE: the scores are computed on the training data itself, so they measure
# fit to the training set rather than performance on unseen plots.
# Predict once and reuse the result; the model is already fitted, so repeating
# the prediction for every metric only repeats the same work.
count_train_pred = count_pipeline.predict(X_train)

# For multilabel targets accuracy_score is subset accuracy: a row counts as
# correct only if every genre indicator matches.
count_accuracy = accuracy_score(y_train, count_train_pred)
# Micro averaging pools the decisions of all genres before computing the score.
count_precision = precision_score(y_train, count_train_pred, average='micro')
count_recall = recall_score(y_train, count_train_pred, average='micro')
count_f1 = f1_score(y_train, count_train_pred, average='micro')

In [None]:
import csv

In [14]:
# Report the TF-IDF scores, one per line, rounded to two decimals.
print(
    "TF-IDF Vectorizer Model Evaluation:",
    f"Accuracy: {tfidf_accuracy:.2f}",
    f"Precision: {tfidf_precision:.2f}",
    f"Recall: {tfidf_recall:.2f}",
    f"F1-score: {tfidf_f1:.2f}",
    sep="\n",
)


TF-IDF Vectorizer Model Evaluation:
Accuracy: 0.28
Precision: 0.72
Recall: 0.28
F1-score: 0.40


In [15]:
# Report the count-vectorizer scores, one per line, rounded to two decimals.
# The heading starts with a newline to leave a blank line above the section.
print(
    "\nCount Vectorizer Model Evaluation:",
    f"Accuracy: {count_accuracy:.2f}",
    f"Precision: {count_precision:.2f}",
    f"Recall: {count_recall:.2f}",
    f"F1-score: {count_f1:.2f}",
    sep="\n",
)


Count Vectorizer Model Evaluation:
Accuracy: 0.19
Precision: 0.32
Recall: 0.78
F1-score: 0.45


In [29]:
# TF-IDF scores to persist. All four values stay on the same 0-1 scale used by
# the printed evaluation; scaling only accuracy to a percentage would put
# mixed units side by side with nothing in the output to tell them apart.
metrics = {
    "Accuracy": tfidf_accuracy,
    "Precision": tfidf_precision,
    "Recall": tfidf_recall,
    "F1-score": tfidf_f1,
}


In [34]:
# Append the metrics to the report file as a plain-text section: a heading
# followed by one "name: value" line per metric (two decimals).
# The lines are written directly to the file handle, so no csv.writer is
# needed; the content is not comma-separated despite the .csv extension.
# Append mode keeps earlier sections; newline='' writes "\n" untranslated.
with open("movie_genre_model_evaluation.csv", "a", newline='', encoding="utf-8") as output_file:
    output_file.write("\n\nModel Evaluation Metrics: \n")
    for metric, value in metrics.items():
        output_file.write(f"{metric}: {value:.2f}\n")