<a href="https://colab.research.google.com/github/AVANTHIREDDY1214/CODSOFT.1/blob/main/movie_genre_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

# ----------------------
# 1. Load Dataset
# ----------------------
# Expected format: CSV/TSV with columns ["plot", "genre"] for train_data.txt
# test_data.txt: contains ["plot"]
# test_data_solution.txt: contains ["genre"]

# Read the data, specifying no header and using ":::" as separator, then split into columns
train_df = pd.read_csv("train_data.txt", sep=':::', engine='python', header=None, names=['id', 'title', 'genre', 'plot'])
test_df = pd.read_csv("test_data.txt", sep=':::', engine='python', header=None, names=['id', 'title', 'plot'])
test_labels = pd.read_csv("test_data_solution.txt", sep=':::', engine='python', header=None, names=['id', 'title', 'genre', 'plot'])


print("Train Shape:", train_df.shape)
print("Test Shape:", test_df.shape)
print(train_df.head())

# ----------------------
# 2. Preprocessing
# ----------------------
# Use only 'plot' and 'genre' for training data, and 'plot' for test data
train_df.dropna(subset=["plot", "genre"], inplace=True)
test_df.dropna(subset=["plot"], inplace=True)

# Align test_labels with test_df by ID
test_labels = test_labels[test_labels['id'].isin(test_df['id'])]

# Filter test_df to keep only the rows with IDs present in the filtered test_labels
test_df = test_df[test_df['id'].isin(test_labels['id'])]


X_train = train_df["plot"]
y_train = train_df["genre"]

X_test = test_df["plot"]
y_test = test_labels["genre"]  # true genres

# ----------------------
# 3. Feature Extraction with TF-IDF
# ----------------------
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ----------------------
# 4. Models
# ----------------------
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=200),
    "SVM": LinearSVC()
}

# ----------------------
# 5. Training & Evaluation
# ----------------------
for name, model in models.items():
    print(f"\n----- {name} -----")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# ----------------------
# 6. Example Prediction
# ----------------------
example_plot = ["A young boy discovers he has magical powers and attends a school of wizardry."]
example_vec = vectorizer.transform(example_plot)
pred_genre = models["Logistic Regression"].predict(example_vec)
print("\nExample Prediction:", pred_genre[0])

Train Shape: (54214, 4)
Test Shape: (54200, 3)
   id                               title       genre  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                                plot  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  

----- Naive Bayes -----
Accuracy: 0.5239483394833948


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

      action        0.54      0.11      0.18      1314
       adult        0.50      0.06      0.11       590
   adventure        0.81      0.07      0.13       775
   animation        0.00      0.00      0.00       498
   biography        0.00      0.00      0.00       264
      comedy        0.51      0.42      0.46      7446
       crime        0.00      0.00      0.00       505
 documentary        0.57      0.87      0.69     13096
       drama        0.46      0.82      0.59     13612
      family        0.50      0.00      0.00       783
     fantasy        0.00      0.00      0.00       322
   game-show        0.98      0.32      0.48       193
     history        0.00      0.00      0.00       243
      horror        0.69      0.36      0.47      2204
       music        0.74      0.15      0.25       731
     musical        0.00      0.00      0.00       276
     mystery        0.00      0.00      0.00       318
        n

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

      action        0.48      0.29      0.36      1314
       adult        0.60      0.24      0.34       590
   adventure        0.59      0.17      0.26       775
   animation        0.52      0.07      0.12       498
   biography        0.00      0.00      0.00       264
      comedy        0.53      0.58      0.55      7446
       crime        0.37      0.04      0.08       505
 documentary        0.67      0.85      0.75     13096
       drama        0.54      0.77      0.64     13612
      family        0.50      0.09      0.15       783
     fantasy        0.56      0.06      0.10       322
   game-show        0.90      0.51      0.65       193
     history        0.00      0.00      0.00       243
      horror        0.64      0.57      0.60      2204
       music        0.67      0.45      0.54       731
     musical        0.33      0.02      0.04       276
     mystery        0.36      0.02      0.03       318
        n