# Collaborative Filtering and Hybrid Recommender System


In [2]:
import numpy as np
import pandas as pd

from surprise import Dataset, SVD, Reader, model_selection
from surprise.model_selection import cross_validate, KFold

from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
moviesData = pd.read_csv("../data/NewMoviesMetadata.csv")

In [4]:
movie_com = pd.read_csv("../data/MovieBasedRecommenderData.csv")

In [5]:
links = pd.read_csv("../data/links.csv").dropna()
links["tmdbId"] = links["tmdbId"].astype("int64")

In [6]:
reader = Reader()
ratings = pd.read_csv("../data/ratings_small.csv")

In [7]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()

cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8936  0.8933  0.8999  0.8942  0.8982  0.8959  0.0027  
MAE (testset)     0.6874  0.6872  0.6919  0.6891  0.6934  0.6898  0.0025  
Fit time          0.59    0.53    0.50    0.51    0.52    0.53    0.03    
Test time         0.11    0.06    0.06    0.11    0.06    0.08    0.03    


{'test_rmse': array([0.89363286, 0.89325089, 0.8999437 , 0.89423122, 0.89823211]),
 'test_mae': array([0.68743875, 0.68721097, 0.69194576, 0.68909482, 0.69339287]),
 'fit_time': (0.588336706161499,
  0.5319705009460449,
  0.5037281513214111,
  0.5127511024475098,
  0.515866756439209),
 'test_time': (0.11266064643859863,
  0.05774354934692383,
  0.06286931037902832,
  0.10944461822509766,
  0.05880379676818848)}

In [8]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1cb83f166e0>

In [9]:
movie_com["model_feature"] = movie_com["model_feature"].fillna("")

tf = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=1,
    stop_words="english"
)

tfidf_matrix = tf.fit_transform(movie_com["model_feature"])
tfidf_matrix.shape

(45453, 450099)

In [10]:
cosine_sim = linear_kernel(tfidf_matrix[:15000], tfidf_matrix)

In [11]:
indices = pd.Series(moviesData.index, index=moviesData['title'])
movie_id = links[["movieId", "tmdbId"]]
movie_id.to_csv("../data/movie_id.csv", index=False)

In [12]:
def hybrid(userId, title):
    idx = indices[title]

    try:
        sim_scores = list(enumerate(cosine_sim[int(idx[0])]))
    except:
        sim_scores = list(enumerate(cosine_sim[int(idx)]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:21]
    movie_indices = [i[0] for i in sim_scores]

    movies = movie_com.iloc[movie_indices][["title", "id"]]
    tempLinks = links[links["tmdbId"].isin(movies["id"].tolist())]

    preds = [svd.predict(userId, item).est for item in tempLinks["movieId"]]

    tempData = pd.DataFrame({
        "title": movies["title"],
        "est": preds,
        "id": movies["id"]
    })

    tempData = tempData.sort_values("est", ascending=False)
    tempData = tempData[tempData["est"] >= 2.5]

    return tempData.head(10)


In [13]:
def getUserRecommender(userID):
    tempRatings = ratings[ratings["userId"] == userID]
    tempRatings = tempRatings[tempRatings["rating"] >= 4]

    tempLinks = links[links["tmdbId"].isin(tempRatings["movieId"].tolist())]
    titlesData = moviesData[moviesData["id"].isin(tempLinks["tmdbId"])]

    resultDataFrame = pd.DataFrame()

    for title in titlesData["title"]:
        resultDataFrame = pd.concat([hybrid(userID, title), resultDataFrame], ignore_index=True)

    return resultDataFrame.sort_values("est", ascending=False)

In [14]:
hybrid(4, "Lumumba")

Unnamed: 0,title,est,id
1593,Deceiver,4.427872,14583
17383,Dreamkiller,4.381692,36663
14748,Defenseless,4.318567,41787
37110,404: Error Not Found,4.194964,71481
575,Café au Lait,4.184341,47507
19668,Alex Cross,4.146009,94348
9588,A Show of Force,4.146009,26426
16407,Bad Ronald,4.146009,56787
13073,JCVD,4.146009,13672
42974,Population Zero,4.146009,408219


In [15]:
hybrid(1, "Leaving Las Vegas")

Unnamed: 0,title,est,id
208,The Browning Version,3.30532,49805
1086,Drunks,3.250819,116844
6294,Barton Fink,2.937608,290
493,Mr. Jones,2.883664,2625
2676,Airplane!,2.866538,813
35721,Aşk Kırmızı,2.811172,210408
13709,Shall We Kiss?,2.797021,10541
7083,La Strada,2.783242,405
9721,Frozen Land,2.744549,18279
5210,The Temp,2.707615,17168


In [16]:
getUserRecommender(1)

Unnamed: 0,title,est,id
0,Cold Water,2.905315,64901
1,Les Destinées sentimentales,2.809735,64310
2,Lumumba,2.707615,55551
3,Dil Dosti Etc,2.707615,20454
4,Stevie,2.707615,51927
5,The Blue Room,2.707615,266034
6,Delbaran,2.707615,43765
7,Under The Domim Tree,2.707615,117730
8,The Saddest Boy in the World,2.707615,118015
9,Bolivia,2.707615,49913


In [None]:
import pickle
import joblib

print("Saving models and data...")

# Save the trained SVD model
with open('svd_model.pkl', 'wb') as f:
    pickle.dump(svd, f)
print(" SVD model saved")

# Save the cosine similarity matrix (first 15000 rows only to manage file size)
cosine_sim_subset = cosine_sim[:15000] if cosine_sim.shape[0] > 15000 else cosine_sim
with open('cosine_similarity.pkl', 'wb') as f:
    pickle.dump(cosine_sim_subset, f)
print(" Cosine similarity matrix saved")

# Save the TF-IDF matrix (optional, for future use)
with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)
print(" TF-IDF matrix saved")

# Save indices for movie titles
indices = pd.Series(moviesData.index, index=moviesData['title'])
with open('movie_indices.pkl', 'wb') as f:
    pickle.dump(indices, f)
print(" Movie indices saved")

# Save movie_id mapping
movie_id_mapping = links[["movieId", "tmdbId"]]
movie_id_mapping.to_csv("../data/movie_id_mapping.csv", index=False)
print(" Movie ID mapping saved")

# Create a preprocessed dataset for faster loading
preprocessed_data = {
    'movies_data': moviesData,
    'movie_com': movie_com,
    'ratings': ratings,
    'links': links
}

with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump(preprocessed_data, f)
print(" Preprocessed data saved")

print("\n All models and data saved successfully!")
print("Files created:")
print("- svd_model.pkl")
print("- cosine_similarity.pkl") 
print("- tfidf_matrix.pkl")
print("- movie_indices.pkl")
print("- movie_id_mapping.csv")
print("- preprocessed_data.pkl")

Saving models and data...
 SVD model saved
 Cosine similarity matrix saved
 TF-IDF matrix saved
 Movie indices saved
 Movie ID mapping saved
 Preprocessed data saved

 All models and data saved successfully!
Files created:
- svd_model.pkl
- cosine_similarity.pkl
- tfidf_matrix.pkl
- movie_indices.pkl
- movie_id_mapping.csv
- preprocessed_data.pkl
