In [None]:
!pip install pandas
!pip install sklearn
!pip install joblib
!pip install matplotlib

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.utils.multiclass import unique_labels

# Load dataset
df = pd.read_csv("data.csv")

# Step 1: Drop rows with missing descriptions or categories
df_clean = df.dropna(subset=["description", "categories"]).copy()

# Step 2: Keep only the first genre if multiple are present.
# "categories" is a ";"-separated list; only the text before the first ";"
# is needed, so split once and strip surrounding whitespace.
df_clean["main_category"] = (
    df_clean["categories"].str.split(";", n=1).str[0].str.strip()
)

# Discard rows whose first genre is blank or missing (e.g. ";Fiction"),
# otherwise an empty string could be counted as a genre below.
has_genre = df_clean["main_category"].notna() & df_clean["main_category"].ne("")
df_clean = df_clean[has_genre]

# Step 3: Keep only the top 10 most frequent genres
top_categories = df_clean["main_category"].value_counts().nlargest(10).index.tolist()
df_top10 = df_clean[df_clean["main_category"].isin(top_categories)]

# Step 4: Prepare features (X) and labels (y)
X = df_top10["description"]
y = df_top10["main_category"]

# Encode string labels into integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step 5: Split into training and test sets (80% / 20%, fixed seed).
# Stratify on the encoded label so both splits keep the genre proportions
# and rare genres are not left out of the test set by chance.
split_options = {"test_size": 0.2, "random_state": 42}
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, stratify=y_encoded, **split_options
    )
except ValueError:
    # train_test_split rejected the stratified request (for example, some
    # genre has too few samples); fall back to a plain random split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, **split_options
    )

# Step 6: Vectorize descriptions using TF-IDF.
# The vocabulary and IDF weights are learned from the training split only;
# the test split is transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# === Train Models ===
# Every classifier is fitted on the TF-IDF training matrix, used to predict
# the test matrix, and its test accuracy is printed.

model_nb = MultinomialNB()
log_reg = LogisticRegression(max_iter=1000)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# Seeded like the train/test split and the random forest so that repeated
# runs of the notebook produce the same SVM.
svm_model = LinearSVC(random_state=42)

# Display name -> estimator; insertion order is the training/printing order.
models = {
    "Naive Bayes": model_nb,
    "Logistic Regression": log_reg,
    "Random Forest": rf_model,
    "SVM": svm_model,
}

# Display name -> predicted (integer-encoded) labels for the test split.
predictions = {}
for model_label, estimator in models.items():
    estimator.fit(X_train_vec, y_train)
    predictions[model_label] = estimator.predict(X_test_vec)
    print(
        f"{model_label} Accuracy:",
        accuracy_score(y_test, predictions[model_label]),
    )

# Per-model prediction arrays under the names the evaluation code reads.
y_pred_nb = predictions["Naive Bayes"]
y_pred_lr = predictions["Logistic Regression"]
y_pred_rf = predictions["Random Forest"]
y_pred_svm = predictions["SVM"]

# === Detailed Evaluation for Naive Bayes ===
# Restrict the report to labels that occur in the true or predicted test
# labels, and map them back to their genre names for display.
present_labels = unique_labels(y_test, y_pred_nb)
present_class_names = label_encoder.inverse_transform(present_labels)

report = classification_report(
    y_test, y_pred_nb,
    labels=present_labels,
    target_names=present_class_names,
    output_dict=True,
    zero_division=0
)
report_df = pd.DataFrame(report).transpose()

# report_df also contains summary rows (accuracy / averaged scores). Rank
# only the per-genre rows so those summaries cannot show up as "genres".
genre_report_df = report_df.loc[list(present_class_names)]
top_f1 = genre_report_df.sort_values(by="f1-score", ascending=False).head(10)

print("\nTop 10 genres by F1-score (Naive Bayes):")
print(top_f1)

# === Confusion Matrix for Naive Bayes ===
# Counts of test samples per (true genre, predicted genre) pair, limited to
# the labels present in the test labels or predictions, drawn as an
# annotated heatmap.
cm = confusion_matrix(y_test, y_pred_nb, labels=present_labels)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=present_class_names,
    yticklabels=present_class_names,
    ax=ax,
)
ax.set_xlabel("Predicted Genre")
ax.set_ylabel("True Genre")
ax.set_title("Confusion Matrix (Top 10 Genres) - Naive Bayes")

# Slant the column labels so long genre names stay readable; keep the row
# labels horizontal.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)

fig.tight_layout()
plt.show()

# === Compare F1-score (macro) for all models ===
# Display name -> test-set predictions of each trained model.
predictions_by_model = {
    "Naive Bayes": y_pred_nb,
    "Logistic Regression": y_pred_lr,
    "Random Forest": y_pred_rf,
    "SVM": y_pred_svm,
}

# Macro-averaged F1 per model; undefined per-class scores count as 0.
f1_scores = {
    label: f1_score(y_test, predicted, average="macro", zero_division=0)
    for label, predicted in predictions_by_model.items()
}

# Print F1-scores
print("\n🔍 F1-score (macro) for all models:")
for model_name, score in f1_scores.items():
    print(f"{model_name}: {score:.4f}")

# Find the best model (on ties, the first one listed wins)
best_model_name, best_f1_score = max(f1_scores.items(), key=lambda item: item[1])
print(f"\n🏆 Best model based on macro F1-score: {best_model_name} ({best_f1_score:.4f})")

# Plot F1-score comparison
# NOTE(review): recent seaborn releases may warn about `palette` being passed
# without `hue` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    x=list(f1_scores),
    y=list(f1_scores.values()),
    palette="viridis",
    ax=ax,
)
ax.set_title("Model Comparison by F1-score (macro)")
ax.set_ylabel("F1-score")
ax.set_ylim(0, 1)
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
fig.tight_layout()
plt.show()

# === Save all models and encoders ===
# Output file -> fitted object. The vectorizer and label encoder are saved
# next to the classifiers; they are the objects that turned descriptions
# into features and genre names into integer labels above.
artifacts = {
    "genre_classifier_nb.pkl": model_nb,
    "genre_classifier_logreg.pkl": log_reg,
    "genre_classifier_rf.pkl": rf_model,
    "genre_classifier_svm.pkl": svm_model,
    "tfidf_vectorizer.pkl": vectorizer,
    "label_encoder.pkl": label_encoder,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("\n✅ All models and preprocessing objects saved successfully.")