In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from surprise import Dataset, Reader, KNNBasic
from surprise import accuracy
from surprise.model_selection import train_test_split
import pickle
import os

### Content-Based Recommendation System

In [2]:
# Read the processed movie table used by the content-based recommender
# and preview a handful of random rows.
content_based_path = '../data/processed/content_based_data.csv'
content_based_data = pd.read_csv(content_based_path)
content_based_data.sample(n=7)

Unnamed: 0,movie_id,movie_title,genres
715,716,Home for the Holidays (1995),"Drama, Romance"
328,329,Desperate Measures (1998),"Crime, Drama, Thriller"
1285,1286,Shall We Dance? (1937),"Comedy, Musical, Romance"
626,627,Robin Hood: Prince of Thieves (1991),Drama
501,502,Bananas (1971),"Comedy, War"
417,418,Cinderella (1950),"Animation, Children's, Musical"
787,788,Relative Fear (1994),"Horror, Thriller"


In [3]:
# Replace missing genres with an empty string and force a string dtype so the
# vectorizer below never receives NaN / non-text values.
content_based_data['genres'] = (
    content_based_data['genres']
    .fillna('')
    .astype(str)
)

In [4]:
# Turn each movie's genre string into a TF-IDF vector (one row per movie).
genre_text = content_based_data['genres']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(genre_text)

In [5]:
# Pairwise cosine similarity between every pair of movies' genre vectors.
# With a single argument, scikit-learn compares the matrix against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [6]:
# Persist the similarity matrix (wrapped in a DataFrame) under ../models,
# creating the directory first if it does not exist yet.
output_dir = '../models'
os.makedirs(output_dir, exist_ok=True)

cosine_sim_file = os.path.join(output_dir, 'cosine_similarity_matrix.pkl')
cosine_sim_frame = pd.DataFrame(cosine_sim)
cosine_sim_frame.to_pickle(cosine_sim_file)

In [7]:
# Recommendation function
def recommend_movies(movie_title, cosine_sim=cosine_sim, n=10):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value from ``content_based_data['movie_title']``.
    cosine_sim : 2-D similarity matrix, optional
        Square matrix whose row/column ``i`` corresponds to the ``i``-th row
        of ``content_based_data``. Defaults to the matrix computed above.
    n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to ``n`` movie titles, most similar first.

    Raises
    ------
    IndexError
        If ``movie_title`` is not present in ``content_based_data``.
    """
    # Resolve the title to a *positional* row offset: the similarity matrix is
    # positional, so an index label is only correct for a default RangeIndex.
    title_mask = (content_based_data['movie_title'] == movie_title).to_numpy()
    positions = np.flatnonzero(title_mask)
    if positions.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    idx = int(positions[0])

    # Drop the queried movie by position rather than assuming it sorts first:
    # many movies share an identical genre string (similarity 1.0), and a
    # stable sort would otherwise keep an earlier tie ahead of the query and
    # discard the wrong entry.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:n]]
    return content_based_data['movie_title'].iloc[movie_indices]

In [8]:
# Smoke-test the content-based recommender on a well-known title.
toy_story_recs = recommend_movies('Toy Story (1995)')
toy_story_recs.tolist()

['Aladdin and the King of Thieves (1996)',
 'Aristocats, The (1970)',
 'Pinocchio (1940)',
 'Sword in the Stone, The (1963)',
 'Fox and the Hound, The (1981)',
 'Winnie the Pooh and the Blustery Day (1968)',
 'Balto (1995)',
 'Oliver & Company (1988)',
 'Swan Princess, The (1994)',
 'Land Before Time III: The Time of the Great Giving (1995) (V)']

### Collaborative Filtering Recommendation System

In [9]:
# Read the processed ratings table used for collaborative filtering.
collaborative_path = '../data/processed/collaborative_filtering_data.csv'
collaborative_data = pd.read_csv(collaborative_path)

In [10]:
# Wrap the (user, item, rating) columns in a Surprise Dataset; ratings are
# declared to lie on a 1-5 scale.
rating_columns = ['user_id', 'item_id', 'rating']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(collaborative_data[rating_columns], reader)

In [11]:
# Split the dataset into train (75%) and test (25%).
# A fixed random_state makes the split - and therefore every RMSE computed
# from it - reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

#### KNNBasic

In [12]:
# User-user collaborative filtering with cosine similarity (baseline model).
cosine_sim_options = dict(name='cosine', user_based=True)
algo = KNNBasic(sim_options=cosine_sim_options)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1581f4d50>

In [13]:
# Score the held-out ratings with the baseline model and report RMSE.
predictions = algo.test(testset)
accuracy.rmse(predictions, verbose=True)

RMSE: 1.0106


1.0106390820686117

#### Tuning for KNNBasic

In [14]:
# Second configuration: user-based Pearson-baseline similarity with explicit
# neighbourhood sizes (k=35 neighbours at most, min_k=2 at least).
sim_options = dict(name='pearson_baseline', user_based=True)

# Note: this rebinds `algo`, so later cells use the tuned model.
algo = KNNBasic(k=35, min_k=2, sim_options=sim_options)
algo.fit(trainset)

# Re-evaluate on the same held-out ratings.
predictions = algo.test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9969


0.9968647670039027

In [15]:
# Pickle the fitted model next to the similarity matrix for deployment.
model_file = os.path.join(output_dir, 'collaborative_filtering_model.pkl')
with open(model_file, mode='wb') as model_handle:
    pickle.dump(algo, model_handle)

In [16]:
# Recommendation function
def recommend_for_user(user_id, algo, n=10):
    """Return the ids of the ``n`` unrated items with the highest predicted rating.

    Parameters
    ----------
    user_id
        Raw user id as it appears in ``collaborative_data['user_id']``.
    algo
        Fitted model exposing ``predict(user_id, item_id)`` whose result has
        ``est`` (predicted rating) and ``iid`` (item id) attributes.
    n : int, optional
        Maximum number of item ids to return (default 10).

    Returns
    -------
    list
        Item ids ordered by predicted rating, highest first. Ties keep the
        order in which the items first appear in ``collaborative_data``.
    """
    # A set gives O(1) membership tests; checking `in` against a NumPy array
    # rescans the whole array for every candidate item.
    is_user_row = collaborative_data['user_id'] == user_id
    rated_ids = set(collaborative_data.loc[is_user_row, 'item_id'])

    candidate_ids = [
        item_id
        for item_id in collaborative_data['item_id'].unique()
        if item_id not in rated_ids
    ]

    predictions = [algo.predict(user_id, item_id) for item_id in candidate_ids]
    # list.sort is stable, so equal estimates stay in first-appearance order.
    predictions.sort(key=lambda pred: pred.est, reverse=True)
    return [pred.iid for pred in predictions[:n]]

In [17]:
# Smoke-test: top recommendations for one example user with the tuned model.
example_user_id = 196
recommend_for_user(example_user_id, algo)

[1189, 1594, 1639, 357, 603, 134, 64, 1449, 1512, 479]