In [2]:
# -----------------------------
# 1️⃣ Import Required Libraries
# -----------------------------

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


# -----------------------------
# 2️⃣ Load the Dataset
# -----------------------------

df = pd.read_csv(
    "/content/train_data.txt",
    sep=" ::: ",
    engine="python",
    names=["ID", "Genre", "Description"]
)

print("Dataset Loaded Successfully!")
print(df.head())


# -----------------------------
# 3️⃣ Separate Input and Output
# -----------------------------

X = df["Description"]   # Text data
y = df["Genre"]         # Target labels


# -----------------------------
# 4️⃣ Convert Text to Numerical Form (TF-IDF)
# -----------------------------

vectorizer = TfidfVectorizer(stop_words='english')

X_tfidf = vectorizer.fit_transform(X)

print("Text Converted to TF-IDF Features")


# -----------------------------
# 5️⃣ Split Data into Train & Test
# -----------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42
)

print("Data Split Completed")


# -----------------------------
# 6️⃣ Train the Model (Naive Bayes)
# -----------------------------

model = MultinomialNB()

model.fit(X_train, y_train)

print("Model Training Completed")


# -----------------------------
# 7️⃣ Make Predictions
# -----------------------------

y_pred = model.predict(X_test)


# -----------------------------
# 8️⃣ Evaluate Model
# -----------------------------

accuracy = accuracy_score(y_test, y_pred)

print("Model Accuracy:", accuracy)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


# -----------------------------
# 9️⃣ Test with Custom Input
# -----------------------------

sample = ["A detective investigates a mysterious murder case in a small town"]

sample_vector = vectorizer.transform(sample)

prediction = model.predict(sample_vector)

print("\nCustom Prediction:")
print("Predicted Genre:", prediction[0])


Dataset Loaded Successfully!
                                 ID     Genre  \
1      Oscar et la dame rose (2009)     drama   
2                      Cupid (1997)  thriller   
3  Young, Wild and Wonderful (1980)     adult   
4             The Secret Sin (1915)     drama   
5            The Unrecovered (2007)     drama   

                                         Description  
1  Listening in to a conversation between his doc...  
2  A brother and sister with a past incestuous re...  
3  As the bus empties the students for their fiel...  
4  To help their unemployed father make ends meet...  
5  The film's title refers not only to the un-rec...  
Text Converted to TF-IDF Features
Data Split Completed
Model Training Completed
Model Accuracy: 0.445540901964401

Classification Report:

              precision    recall  f1-score   support

      action       0.00      0.00      0.00       263
       adult       0.00      0.00      0.00       112
   adventure       0.00      0.00      0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import os
print(os.listdir('/content/'))

['.config', 'sample_data']
