<a href="https://colab.research.google.com/github/Deelaw15/Movie-Recommendation-System/blob/main/Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Modelling - KNNBasic, KNNWithMeans and KNNBaseline

## Collaborative filtering - Comparing all 3 KNN Models

In [14]:
from surprise import Dataset, Reader, KNNBasic, KNNWithMeans, KNNBaseline
import joblib
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd
import time

In [15]:
train = pd.read_csv('train_movies_rating.csv')
train.head()

Unnamed: 0,movieId,title,genres,userId,rating,dt,year,(no genres listed),Action,Adventure,...,Romance,Sci-Fi,Thriller,War,Western,avg_movie_rating,rating_count,user_avg_rating,user_rating_count,user_std_rating
0,804,She's the One,"['Comedy', 'Romance']",1,4.0,2000-07-30 18:08:19,1996.0,0,0,0,...,1,0,0,0,0,3.25,8,4.367965,231,0.80142
1,1210,Star Wars: Episode VI - Return of the Jedi,"['Action', 'Adventure', 'Sci-Fi']",1,5.0,2000-07-30 18:08:19,1983.0,0,1,1,...,0,1,0,0,0,4.137755,196,4.367965,231,0.80142
2,2018,Bambi,"['Animation', 'Children', 'Drama']",1,5.0,2000-07-30 18:08:43,1942.0,0,0,0,...,0,0,0,0,0,3.361111,36,4.367965,231,0.80142
3,2628,Star Wars: Episode I - The Phantom Menace,"['Action', 'Adventure', 'Sci-Fi']",1,4.0,2000-07-30 18:08:43,1999.0,0,1,1,...,0,1,0,0,0,3.086957,138,4.367965,231,0.80142
4,2826,"13th Warrior, The","['Action', 'Adventure', 'Fantasy']",1,4.0,2000-07-30 18:08:43,1999.0,0,1,1,...,0,0,0,0,0,2.903846,26,4.367965,231,0.80142


In [16]:
# Function to compare KNN models

def compare_knn_models(df, algo_class, rating_scale=(0.5, 5)):
    # Load data into Surprise format
    reader = Reader(rating_scale=rating_scale)
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
    
    # Split data into training and testing sets
    trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
    
    results = []
    sim_options_list = [
        {'name': 'cosine', 'user_based': True},
        {'name': 'cosine', 'user_based': False},
        {'name': 'pearson', 'user_based': True},
        {'name': 'pearson', 'user_based': False}
    ]

    for sim_opt in sim_options_list:
        sim_name = sim_opt['name']
        mode = 'User-based' if sim_opt['user_based'] else 'Item-based'
        start = time.time()

        algo = algo_class(sim_options=sim_opt)
        algo.fit(trainset)
        predictions = algo.test(testset)
        rmse = accuracy.rmse(predictions, verbose=False)

        results.append({
            'Model': algo_class.__name__,
            'Similarity': sim_name,
            'Mode': mode,
            'RMSE': round(rmse, 4),
            'Train Time (s)': round(time.time() - start, 2)
        })

    return pd.DataFrame(results)

In [17]:
# Example for KNNWithMeans baseline
results_knnmeans = compare_knn_models(train, KNNWithMeans)

# View result table
print(results_knnmeans)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
          Model Similarity        Mode    RMSE  Train Time (s)
0  KNNWithMeans     cosine  User-based  0.9051            2.09
1  KNNWithMeans     cosine  Item-based  0.9126           23.90
2  KNNWithMeans    pearson  User-based  0.9006            1.52
3  KNNWithMeans    pearson  Item-based  0.9119           15.99


In [18]:
results = pd.concat([
    compare_knn_models(train, KNNBasic),
    compare_knn_models(train, KNNWithMeans),
    compare_knn_models(train, KNNBaseline)
])

results.sort_values(by='RMSE', ascending=True)
results.reset_index(drop=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity ma

Unnamed: 0,Model,Similarity,Mode,RMSE,Train Time (s)
0,KNNBasic,cosine,User-based,0.9828,1.18
1,KNNBasic,cosine,Item-based,0.9799,11.22
2,KNNBasic,pearson,User-based,0.9821,1.16
3,KNNBasic,pearson,Item-based,0.9754,13.25
4,KNNWithMeans,cosine,User-based,0.9051,1.13
5,KNNWithMeans,cosine,Item-based,0.9126,10.06
6,KNNWithMeans,pearson,User-based,0.9006,1.3
7,KNNWithMeans,pearson,Item-based,0.9119,12.49
8,KNNBaseline,cosine,User-based,0.8835,1.59
9,KNNBaseline,cosine,Item-based,0.8947,10.58


In [22]:
# Saving the best model based on lowest RMSE

best = results.sort_values('RMSE').iloc[0]

# Map text name -> class
algo_map = {'KNNBasic': KNNBasic, 'KNNWithMeans': KNNWithMeans, 'KNNBaseline': KNNBaseline}
AlgoClass = algo_map[best['Model']]
sim_name = best['Similarity']
user_based = (best['Mode'] == 'User-based')

# build and train on full train set (no test split) to get final model
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)
full_trainset = data.build_full_trainset()

algo = AlgoClass(sim_options={'name': sim_name, 'user_based': user_based})
algo.fit(full_trainset)

# save fitted model
joblib.dump(algo, r"d:\Data\Movie-Recommendation-System\best_knn_model.pkl")
print("✅ Fitted best model saved to d:\\Data\\Movie-Recommendation-System\\best_knn_model.pkl")

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
✅ Fitted best model saved to d:\Data\Movie-Recommendation-System\best_knn_model.pkl


In [23]:
print("🏆 Best Model Selected:")
print(f"  Model: {best['Model']}")
print(f"  Similarity: {best['Similarity']}")
print(f"  Mode: {best['Mode']}")
print(f"  RMSE: {best['RMSE']:.4f}")


🏆 Best Model Selected:
  Model: KNNBaseline
  Similarity: pearson
  Mode: User-based
  RMSE: 0.8818
