# 🎬 Movie Genre Classifier using SVM + TF-IDF
This notebook trains a linear Support Vector Machine (SVM) classifier to predict a movie's genre from its plot description, using TF-IDF features built from unigrams and bigrams.

In [None]:
# 📦 Install required libraries if needed
# !pip install pandas scikit-learn

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# 📂 Load training data
def load_train_data(filepath):
    """Load the labelled training set into a DataFrame.

    Each line is expected to look like
    ``ID ::: TITLE ::: GENRE ::: DESCRIPTION``. Lines that do not contain
    all four fields (including blank lines) are skipped.

    Args:
        filepath: Path to the UTF-8 encoded training file.

    Returns:
        A DataFrame with the columns ``id`` (int), ``title``, ``genre``
        (lower-cased) and ``description``.

    Raises:
        ValueError: If the ID field of a well-formed line is not an integer.
    """
    rows = []
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            # Split on the first three delimiters only: a description that
            # itself contains " ::: " must stay in one piece, otherwise the
            # extra fields would make the whole row fail the length check
            # and be dropped silently.
            parts = line.strip().split(" ::: ", 3)
            if len(parts) == 4:
                id_, title, genre, desc = parts
                rows.append((int(id_), title, genre.lower(), desc))
    return pd.DataFrame(rows, columns=["id", "title", "genre", "description"])

# Parse the labelled training file (path is relative to the working directory)
# into a DataFrame with id, title, genre and description columns.
train_df = load_train_data("train.txt")

In [None]:
# 🧹 Clean text
def clean_text(text):
    """Return *text* lower-cased with non-word runs collapsed to one space.

    Any run of characters outside ``[A-Za-z0-9_]`` (and their Unicode
    equivalents) becomes a single space, so punctuation and repeated
    whitespace disappear. Leading or trailing runs leave a single space
    behind; the result is not stripped.
    """
    return re.sub(r'\W+', ' ', text.lower())

# Store a normalised copy of every plot description next to the raw text,
# which stays untouched in the 'description' column.
train_df['cleaned_description'] = train_df['description'].map(clean_text)

In [None]:
# 📊 Train/Test split
# Features are the cleaned plot texts; targets are the genre labels.
X, y = train_df['cleaned_description'], train_df['genre']

# Hold out 20% of the rows for validation. The fixed seed keeps the split
# identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 🤖 Train SVM model
# TF-IDF features feed directly into a linear SVM. Wrapping both steps in one
# pipeline means the vectorizer fitted here is re-applied automatically
# whenever the model predicts on new text.
model = make_pipeline(
    # Unigrams and bigrams, with the vocabulary capped at 10,000 terms.
    TfidfVectorizer(max_features=10000, ngram_range=(1, 2)),
    # 'balanced' re-weights classes inversely to their frequency in y_train.
    # NOTE(review): no random_state is set here, unlike the train/val split --
    # confirm whether exact run-to-run reproducibility of the fit matters.
    LinearSVC(class_weight='balanced', max_iter=10000)
)
# Fit both the vectorizer and the classifier on the training split only.
model.fit(X_train, y_train)

In [None]:
# 📈 Evaluate model
# Score the held-out validation split: overall accuracy first, then the
# per-genre report produced by classification_report.
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))

In [None]:
# 🔍 Load and predict on test data
def load_test_data(filepath):
    """Load the unlabelled data set into a DataFrame.

    Each line is expected to look like ``ID ::: TITLE ::: DESCRIPTION``.
    Lines that do not contain all three fields (including blank lines) are
    skipped.

    Args:
        filepath: Path to the UTF-8 encoded data file.

    Returns:
        A DataFrame with the columns ``id`` (int), ``title`` and
        ``description``.

    Raises:
        ValueError: If the ID field of a well-formed line is not an integer.
    """
    rows = []
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            # Split on the first two delimiters only: a description that
            # itself contains " ::: " must stay in one piece, otherwise the
            # extra fields would make the whole row fail the length check
            # and be dropped silently.
            parts = line.strip().split(" ::: ", 2)
            if len(parts) == 3:
                id_, title, desc = parts
                rows.append((int(id_), title, desc))
    return pd.DataFrame(rows, columns=["id", "title", "description"])

# Read the unlabelled file, normalise its descriptions with the same
# clean_text used on the training data, then let the fitted pipeline assign
# a genre to every row.
test_df = load_test_data("data.txt")
test_df['cleaned_description'] = test_df['description'].map(clean_text)
test_df['predicted_genre'] = model.predict(test_df['cleaned_description'])

In [None]:
# 💾 Save predictions
# Export only the identifying columns plus the prediction; the DataFrame's
# row index is left out of the file.
test_df.to_csv(
    "predicted_genres.csv",
    columns=['id', 'title', 'predicted_genre'],
    index=False,
)
print("Predictions saved to predicted_genres.csv")