In [12]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD, KNNBasic, accuracy
from surprise.model_selection import train_test_split

If you are using Anaconda, install the Surprise package with `conda install -c conda-forge scikit-surprise`.

Surprise is the *Simple Python Recommendation System Engine*. It provides KNN-based algorithms, one of which is used here as the baseline model.

In [7]:
def _read_movielens(path, columns, **read_kwargs):
    """Read one '::'-delimited, header-less .dat file into a DataFrame.

    `columns` supplies the column names; any extra keyword arguments are
    forwarded unchanged to `pd.read_csv`. The python parsing engine is
    requested explicitly, as in every read of these files.
    """
    return pd.read_csv(path, sep='::', names=columns, engine='python', **read_kwargs)


ratings = _read_movielens(
    'ml-1m/ratings.dat',
    ['user_id', 'movie_id', 'rating', 'timestamp'],
)
users = _read_movielens(
    'ml-1m/users.dat',
    ['user_id', 'gender', 'age', 'occupation', 'zip'],
)
# movies.dat is the only file read with an explicit latin-1 encoding.
movies = _read_movielens(
    'ml-1m/movies.dat',
    ['movie_id', 'title', 'genres'],
    encoding='latin-1',
)

# Left-join user attributes, then movie attributes, onto every rating row.
data = (
    ratings
    .merge(users, on='user_id', how='left')
    .merge(movies, on='movie_id', how='left')
)

In [10]:
# Preview the first five rows of the merged ratings/users/movies frame.
data.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


### Setting up Recommendation system (Surprise)

In [13]:
# Keep only the user / item / rating columns that are handed to Surprise.
df_ratings = ratings.loc[:, ['user_id', 'movie_id', 'rating']].copy()

# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(df_ratings, reader)

# Hold out 20% of the ratings; the fixed seed makes the split repeatable.
train_set, test_set = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Similarity configuration shared by every KNN model in this notebook:
# cosine similarity with user_based switched off.
sim_options = dict(name='cosine', user_based=False)

# Fit the KNN baseline on the training split and score it on the held-out split.
model_knn = KNNBasic(sim_options=sim_options)
model_knn.fit(train_set)
pred_knn = model_knn.test(test_set)

rmse_knn = accuracy.rmse(pred_knn)
print('KNN RMSE:', rmse_knn)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9988
KNN RMSE: 0.9988005727431559


In [15]:
# Pin the seed: SVD starts from randomly initialised factors, so without a
# fixed random_state the RMSE below changes on every Restart & Run All.
# 42 matches the seed already used for the train/test split.
model_svd = SVD(random_state=42)
model_svd.fit(train_set)

# Score the fitted model on the same held-out split used for the KNN baseline.
pred_svd = model_svd.test(test_set)
rmse_svd = accuracy.rmse(pred_svd)
print('SVD RMSE:', rmse_svd)

RMSE: 0.8731
SVD RMSE: 0.8730587435087336


### Data Sparsity 

In [19]:
# Fractions of the full ratings table to keep; smaller means a sparser matrix.
fractions = [0.1, 0.3, 0.5, 0.7]
results = {}

for f in fractions:
    # Randomly keep a fraction `f` of all ratings, then split that sample 80/20.
    df_sample = df_ratings.sample(frac=f, random_state=42)
    dataset_sample = Dataset.load_from_df(df_sample, reader)
    train_set_sample, test_set_sample = train_test_split(dataset_sample, test_size=0.2, random_state=42)

    # Loop-specific names: the full-data baseline objects (model_knn,
    # pred_knn, rmse_knn) must not be overwritten by a subsample run.
    knn_sample = KNNBasic(sim_options=sim_options)
    knn_sample.fit(train_set_sample)
    pred_sample = knn_sample.test(test_set_sample)

    # verbose=False: the labeled summary below is the only RMSE report.
    results[f] = accuracy.rmse(pred_sample, verbose=False)

for f, val in results.items():
    print(f'Fraction: {f} RMSE: {val}')


Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0994
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0479
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0206
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0090
Fraction: 0.1 RMSE: 1.0994407902790586
Fraction: 0.3 RMSE: 1.0479165161276993
Fraction: 0.5 RMSE: 1.020628624860206
Fraction: 0.7 RMSE: 1.0090022068848437


### Cold start

In [31]:
# Reload the raw ratings file and keep only the three modelling columns
# (the timestamp is dropped for now).
df = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python',
)[['user_id', 'movie_id', 'rating']]

In [32]:
# Rebuild the Surprise dataset from the three-column ratings frame and make
# an 80/20 split with a fixed seed.
# NOTE(review): `data` held the merged pandas DataFrame earlier in the
# notebook; from here on it refers to a Surprise Dataset. Confirm nothing
# later expects the DataFrame.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df, reader)
train_set, test_set = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Synthetic (user_id, movie_id, rating) rows for four user ids, two each.
# NOTE(review): ids 6041-6044 are presumably absent from ratings.dat, which
# is what makes them "new" users; verify against the data.
new_users = [
    (6041, 1, 5), (6041, 50, 1),
    (6042, 10, 1), (6042, 100, 5),
    (6043, 33, 2), (6043, 55, 5),
    (6044, 66, 1), (6044, 77, 5),
]
new_users_df = pd.DataFrame(new_users, columns=['user_id', 'movie_id', 'rating'])

# Put the held-out test ratings in a frame and append the synthetic rows.
test_df = pd.concat(
    [
        pd.DataFrame(test_set, columns=['user_id', 'movie_id', 'rating']),
        new_users_df,
    ],
    ignore_index=True,
)


In [36]:
# Turn the augmented test frame back into plain (user, item, rating) tuples,
# the form passed to the model's test() method.
test_set_new = list(test_df[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None))

model_knn = KNNBasic(sim_options=sim_options)
model_knn.fit(train_set)
pred_knn = model_knn.test(test_set_new)

# verbose=False throughout: accuracy.rmse would otherwise print three
# unlabeled "RMSE:" lines. The labeled prints below are the only report.
rmse_knn = accuracy.rmse(pred_knn, verbose=False)

# Take the new-user ids from the injected rows so they cannot drift from a
# second hard-coded copy.
new_user_ids = {uid for uid, _iid, _rating in new_users}

# Split the predictions into injected (new) users and everyone else.
# NOTE(review): the new users presumably have no ratings in train_set, so the
# model likely falls back to a default estimate for them. Each prediction's
# `details` should show whether it was impossible; confirm before reading
# much into rmse_new, which is computed on only a handful of synthetic rows.
preds_new = [p for p in pred_knn if p.uid in new_user_ids]
preds_old = [p for p in pred_knn if p.uid not in new_user_ids]

rmse_new = accuracy.rmse(preds_new, verbose=False)
rmse_old = accuracy.rmse(preds_old, verbose=False)
print('RMSE for all users:', rmse_knn)
print('RMSE for new users:', rmse_new)
print('RMSE for old users:', rmse_old)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9989
RMSE: 1.9539
RMSE: 0.9988
RMSE for new users: 1.9539083898500522
RMSE for old users: 0.9988005727431559
