# In [None]:
# model_exploration.ipynb
# Model Exploration for Multi-Product Recommendation System

# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Load processed datasets
def load_json_data(file_path):
    """Read a JSON file into a pandas DataFrame.

    Args:
        file_path: Location of the JSON document; it is handed unchanged
            to ``pandas.read_json``.

    Returns:
        The DataFrame that ``pandas.read_json`` builds from the file.
    """
    frame = pd.read_json(file_path)
    return frame

# Load datasets
# All processed files share one naming scheme, so each path is built from a
# single template and only the dataset name varies.
_PROCESSED_PATH = '../data/processed/processed_{}.json'

books_df = load_json_data(_PROCESSED_PATH.format('books'))
songs_df = load_json_data(_PROCESSED_PATH.format('songs'))
movies_df = load_json_data(_PROCESSED_PATH.format('movies'))
clothes_df = load_json_data(_PROCESSED_PATH.format('clothes'))

# Step 3: Preprocess data for modeling
# Only the books dataset is modeled in this notebook; the same steps can be
# repeated for the other datasets.

# Target: a book counts as "highly rated" when its average rating exceeds 4.0.
# NOTE(review): a missing average_rating compares as False here, so such rows
# are labelled "not highly rated" -- confirm that this is intended.
books_df['is_highly_rated'] = books_df['average_rating'] > 4.0

# Take an explicit copy of the feature columns so later operations never act
# on a selection of books_df (the pattern pandas warns about as chained
# assignment).
X_books = books_df[['ratings_count', 'release_year', 'page_count']].copy()
y_books = books_df['is_highly_rated']

# Split BEFORE imputing so that no statistic computed from the test rows can
# influence training (X_books itself is left un-imputed).
X_train, X_test, y_train, y_test = train_test_split(
    X_books, y_books, test_size=0.2, random_state=42
)

# Handle missing values (if any): fill both splits with the column means of
# the training split only.
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

# Standardize numerical features; the scaler is likewise fit on the training
# split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Model Selection and Training
# Baselines with default hyperparameters: Logistic Regression, SVM, KNN and
# Random Forest.


def _fit_and_report(label, model, train_features, test_features):
    """Fit ``model``, print its classification report, return its predictions.

    Labels are read from the notebook-level ``y_train`` and ``y_test``.

    Args:
        label: Model name used in the printed "<label> Performance:" header.
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        train_features: Feature matrix the model is fitted on.
        test_features: Feature matrix the predictions are made for.

    Returns:
        The predictions of the fitted model for ``test_features``.
    """
    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    print("{} Performance:".format(label))
    print(classification_report(y_test, predictions))
    return predictions


# Logistic Regression
log_reg = LogisticRegression()
y_pred_lr = _fit_and_report("Logistic Regression", log_reg, X_train_scaled, X_test_scaled)

# Support Vector Machine (SVM)
svm_clf = SVC()
y_pred_svm = _fit_and_report("SVM", svm_clf, X_train_scaled, X_test_scaled)

# K-Nearest Neighbors (KNN)
knn_clf = KNeighborsClassifier()
y_pred_knn = _fit_and_report("KNN", knn_clf, X_train_scaled, X_test_scaled)

# Random Forest Classifier
# Trained on the unscaled features, as in the original notebook. The forest is
# seeded so this baseline is reproducible from run to run, matching the seeded
# train/test split.
rf_clf = RandomForestClassifier(random_state=42)
y_pred_rf = _fit_and_report("Random Forest", rf_clf, X_train, X_test)

# Step 5: Hyperparameter Tuning using GridSearchCV
# Tune Random Forest and SVM with a 3-fold cross-validated grid search that
# selects the candidate with the best mean accuracy.

# Random Forest Hyperparameter Tuning
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}
# The forest is seeded so that differences between candidates come from the
# hyperparameters rather than from random tree construction.
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    cv=3,
    scoring='accuracy',
)
grid_search_rf.fit(X_train, y_train)

print("Best Random Forest Parameters:", grid_search_rf.best_params_)
best_rf = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
print("Best Random Forest Performance:")
print(classification_report(y_test, y_pred_best_rf))

# SVM Hyperparameter Tuning
# One sub-grid per kernel: `gamma` has no effect on the linear kernel, so
# crossing it with 'linear' would fit every linear candidate three times over.
# As a consequence, best_params_ only contains 'gamma' when 'rbf' wins.
param_grid_svm = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]},
]
# NOTE(review): X_train_scaled was scaled with statistics of the whole
# training split, so the CV folds share scaling information; wrap scaler and
# SVC in a Pipeline if stricter cross-validation estimates are needed.
grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=3, scoring='accuracy')
grid_search_svm.fit(X_train_scaled, y_train)

print("Best SVM Parameters:", grid_search_svm.best_params_)
best_svm = grid_search_svm.best_estimator_
y_pred_best_svm = best_svm.predict(X_test_scaled)
print("Best SVM Performance:")
print(classification_report(y_test, y_pred_best_svm))

# Step 6: Evaluate Model Performance
# Put the test-set accuracy of every baseline and tuned model side by side.

# Each label is paired with the predictions it describes, so the bar order
# and the accuracy order cannot drift apart.
labelled_predictions = [
    ('Logistic Regression', y_pred_lr),
    ('SVM', y_pred_svm),
    ('KNN', y_pred_knn),
    ('Random Forest', y_pred_rf),
    ('Tuned RF', y_pred_best_rf),
    ('Tuned SVM', y_pred_best_svm),
]
models = [label for label, _ in labelled_predictions]
accuracies = [accuracy_score(y_test, preds) for _, preds in labelled_predictions]

# Draw one bar per model on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=models, y=accuracies, ax=ax)
ax.set_title('Model Comparison: Accuracy')
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
plt.show()

# Step 7: Conclusion
# After trying multiple models, we can see that the Random Forest and Tuned Random Forest provide the best performance.
# Based on these results, we would consider using the Random Forest model for the recommendation system's core prediction task.
