In [1]:
# Imports used by this cell. They are declared here so the notebook runs from a
# fresh kernel (Restart Kernel -> Run All) without relying on leftover state.
import pandas as pd
from sklearn.model_selection import train_test_split

# Load datasets (paths are relative to the notebook's working directory)
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

# Merge ratings with movies. The default inner join keeps only ratings whose
# movieId also appears in movies.csv.
merged_data = pd.merge(ratings, movies, on='movieId')

# Create a user-item matrix: one row per userId, one column per movieId, with
# missing ratings filled with 0. `pivot` raises ValueError if any
# (userId, movieId) pair occurs more than once.
# NOTE(review): this matrix is built from *all* merged ratings, including the
# rows that end up in `test_data` below — confirm that is intended.
user_movie_ratings = merged_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Split the data into train and test sets (80/20, fixed seed for reproducibility)
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a TF-IDF Vectorizer for movie genres.
# The vectorizer's default token pattern keeps only runs of two or more word
# characters, so a hyphenated label such as "Sci-Fi" would be split into the
# separate tokens "sci" and "fi". Treat each '|'-separated field as one token
# instead, so every genre label maps to exactly one feature.
# NOTE(review): assumes `genres` is a '|'-delimited string column — confirm
# against movies.csv.
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Function to get content-based recommendations
def get_content_based_recommendations(movie_id, num_recommendations=5):
    """Return the movieIds whose genre TF-IDF vectors are closest to ``movie_id``.

    Similarity is the cosine similarity between the query movie's row of the
    module-level ``tfidf_matrix`` and every other row. The query movie itself
    is never part of the result.

    Args:
        movie_id: Value to look up in ``movies['movieId']``.
        num_recommendations: Maximum number of movieIds to return. Values of
            zero or less yield an empty list.

    Returns:
        A list of movieIds ordered from most to least similar.

    Raises:
        ValueError: If ``movie_id`` does not occur in ``movies['movieId']``.
    """
    # Locate the query row by *position*: both the TF-IDF matrix rows and the
    # `.iloc` lookup below are positional, so index labels must not be used.
    positions = (movies['movieId'] == movie_id).to_numpy().nonzero()[0]
    if positions.size == 0:
        raise ValueError(f"movieId {movie_id!r} not found in movies")
    query_pos = int(positions[0])

    # Slice (rather than integer-index) so the query stays a 2-D, single-row matrix.
    scores = cosine_similarity(tfidf_matrix[query_pos:query_pos + 1], tfidf_matrix)[0]

    # Rank every movie from most to least similar, then drop the query movie
    # explicitly. Dropping "the last argsort entry" is not safe: any movie with
    # an identical genre vector ties with the query at similarity 1.0, so the
    # query is not guaranteed to be sorted last.
    ranked = scores.argsort(kind='stable')[::-1]
    ranked = ranked[ranked != query_pos]
    top = ranked[:max(num_recommendations, 0)]
    return movies['movieId'].iloc[top].tolist()

In [3]:
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Function to create KNN model
def create_knn_model():
    """Build an unfitted nearest-neighbour search that uses cosine distance.

    NOTE(review): ``NearestNeighbors`` is an unsupervised neighbour index and
    does not provide a ``predict`` method — confirm how this model is meant to
    be scored and evaluated by its callers.
    """
    neighbour_index = NearestNeighbors(metric='cosine')
    return neighbour_index

# Function to create Random Forest model
def create_random_forest_model(random_state=42):
    """Build an unfitted random-forest regressor.

    Args:
        random_state: Seed forwarded to ``RandomForestRegressor`` so repeated
            runs grow the same forest. Defaults to 42, the same seed used for
            the train/test split. Pass ``None`` to restore unseeded behaviour.

    Returns:
        A ``RandomForestRegressor`` with otherwise default hyper-parameters.
    """
    return RandomForestRegressor(random_state=random_state)

# Function to create SVM model
def create_svm_model():
    """Build an unfitted two-step pipeline: feature standardisation, then SVR.

    The steps are assembled with ``make_pipeline``, which names each step after
    its lower-cased class name, so the regressor is the ``'svr'`` step.
    """
    scaler = StandardScaler()
    regressor = SVR(kernel='linear')
    return make_pipeline(scaler, regressor)

In [5]:
from sklearn.pipeline import Pipeline

# Define the pipelines: each factory's model is wrapped in a single-step
# Pipeline. The step names ('knn', 'rf', 'svm') are the prefixes used when
# addressing hyper-parameters as '<step>__<param>'. The 'svm' step is itself
# a pipeline (scaler followed by SVR).
knn_pipeline = Pipeline(steps=[('knn', create_knn_model())])

rf_pipeline = Pipeline(steps=[('rf', create_random_forest_model())])

svm_pipeline = Pipeline(steps=[('svm', create_svm_model())])

In [7]:
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import GridSearchCV

# Parameter grid for KNN.
# Only 'auto' and 'brute' are searched: the KNN estimator is created with
# metric='cosine', which the tree-based algorithms ('ball_tree', 'kd_tree')
# do not support and reject at fit time.
knn_param_grid = {
    'knn__n_neighbors': [5, 10, 15, 20],
    'knn__algorithm': ['auto', 'brute'],
}

# Parameter grid for Random Forest
rf_param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20],
}

# Parameter grid for SVM.
# The 'svm' step is itself a make_pipeline(StandardScaler(), SVR(...)) whose
# regressor step is auto-named 'svr', so SVR hyper-parameters must be addressed
# as 'svm__svr__<param>'. 'svm__kernel' would target the inner pipeline object,
# which has no such parameter.
svm_param_grid = {
    'svm__svr__kernel': ['linear', 'rbf'],
    'svm__svr__C': [0.1, 1, 10],
}

# All three searches use 3-fold cross-validation scored by negated mean
# squared error.

# KNN search: fitted on the user-item matrix only.
# NOTE(review): no target is passed to this fit while the scorer is an error
# metric — confirm the resulting scores are meaningful.
knn_grid_search = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=knn_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
knn_grid_search.fit(user_movie_ratings)

# Random Forest search: predicts the rating from the (userId, movieId) pair.
rf_grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
rf_grid_search.fit(train_data[['userId', 'movieId']], train_data['rating'])

# SVM search: same features and target as the Random Forest search.
svm_grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=svm_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
svm_grid_search.fit(train_data[['userId', 'movieId']], train_data['rating'])

# Keep the winning hyper-parameter combination of each search
best_knn_params = knn_grid_search.best_params_
best_rf_params = rf_grid_search.best_params_
best_svm_params = svm_grid_search.best_params_

In [8]:
def evaluate_model(model, test_data):
    """Return the root-mean-squared error (RMSE) of ``model`` on ``test_data``.

    Args:
        model: A fitted estimator exposing ``predict``.
        test_data: DataFrame containing the feature columns ``'userId'`` and
            ``'movieId'`` and the target column ``'rating'``.

    Returns:
        The RMSE between ``test_data['rating']`` and the model's predictions.
    """
    # Predict ratings based on the trained model
    features = test_data[['userId', 'movieId']]
    predictions = model.predict(features)
    # Take the square root explicitly instead of passing `squared=False`: that
    # keyword was deprecated in scikit-learn 1.4 and removed in later releases,
    # whereas this form works on every version.
    return mean_squared_error(test_data['rating'], predictions) ** 0.5

# Score the refitted best estimator of each search on the held-out test set
# and report its RMSE to three decimals.

# KNN
knn_rmse = evaluate_model(model=knn_grid_search.best_estimator_, test_data=test_data)
print(f"RMSE for KNN model: {knn_rmse:.3f}")

# Random Forest
rf_rmse = evaluate_model(model=rf_grid_search.best_estimator_, test_data=test_data)
print(f"RMSE for Random Forest model: {rf_rmse:.3f}")

# SVM
svm_rmse = evaluate_model(model=svm_grid_search.best_estimator_, test_data=test_data)
print(f"RMSE for SVM model: {svm_rmse:.3f}")

RMSE for KNN model: 0.978
RMSE for Random Forest model: 0.863
RMSE for SVM model: 1.025


In [9]:
import joblib

# Persist the best estimator found by each grid search to the working
# directory, one pickle file per model.
joblib.dump(value=knn_grid_search.best_estimator_, filename='knn_model.pkl')
joblib.dump(value=rf_grid_search.best_estimator_, filename='rf_model.pkl')
joblib.dump(value=svm_grid_search.best_estimator_, filename='svm_model.pkl')

from IPython.display import FileLink, display

# Create download links for the models.
# A notebook cell only auto-renders the value of its final expression, so each
# link is passed to display() explicitly; as bare expressions, only the last
# of the three links would be shown.
display(FileLink('knn_model.pkl'))  # For KNN model
display(FileLink('rf_model.pkl'))   # For Random Forest model
display(FileLink('svm_model.pkl'))  # For SVM model
