In [1]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/772.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m768.0/772.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3156217 sha256=ac70a8d4698904257c956046725a2eb6db8ee8422e22eac8329180c9f4fb2dbb
  Stored in directory: /root/.cache/pip/wheels/a5/

In [10]:
pip install scikit-surprise



In [11]:
import numpy as np
import pandas as pd
from collections import defaultdict
from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

In [14]:
df= pd.read_csv('/content/jokes-data.csv')
df.head()

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.75
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438
3,14273_86,14273,86,4.406
4,18419_134,18419,134,9.375


DATA PREPROCESSING

In [15]:
#dropping the datas with a rating of 9.0

df= df[df["Rating"]!=9.0]

#sorting the datas

df= df.sort_values(by= ['user_id', 'joke_id'])

df= df.reset_index(drop=True)


In [19]:
reader= Reader(rating_scale= (10,-10))

data= Dataset.load_from_df(df[['user_id', 'joke_id', 'Rating']], reader)

Training the model

In [24]:
# determining the optimal algorithm parameters with GridSearchCV

sim_options= {
    "name": ["msd", "cosine"],
    "min_support": [3,4,5],
    "user_based": [False],
}

param_grid= {"sim_options": sim_options}

gs= GridSearchCV(KNNWithMeans, param_grid, measures= ["rmse", "mae"], cv=2)
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [25]:
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

9.769151420452015
{'sim_options': {'name': 'msd', 'min_support': 3, 'user_based': False}}


In [27]:
algo= gs.best_estimator['rmse']
trainset= data.build_full_trainset()
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x796a55377250>

In [28]:
uid=1
iid=1


pred= algo.predict(uid, iid, r_ui=7.82, verbose=True)

user: 1          item: 1          r_ui = 7.82   est = 10.00   {'actual_k': 40, 'was_impossible': False}


In [29]:
uid=24983
iid=87

pred= algo.predict(uid, iid, r_ui=7.23, verbose=True)

user: 24983      item: 87         r_ui = 7.23   est = 10.00   {'actual_k': 40, 'was_impossible': False}


In [30]:
trainset, testset = train_test_split(data, test_size=0.2)
algo_test = gs.best_estimator['rmse']
predictions = algo_test.fit(trainset).test(testset)
accuracy.rmse(predictions)


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 9.7681


9.768079785640284

In [31]:
def get_num_user_ratings(uid):

    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0

def get_num_item_ratings(iid):

    try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0

In [32]:
trainset = algo_test.trainset
predictions_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
predictions_df['# of user ratings'] = predictions_df.uid.apply(get_num_user_ratings)
predictions_df['# of item ratings'] = predictions_df.iid.apply(get_num_item_ratings)
predictions_df['error'] = abs(predictions_df.est - predictions_df.rui)
best_predictions = predictions_df.sort_values(by='error')[:10]
worst_predictions = predictions_df.sort_values(by='error')[-10:]

In [33]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,# of user ratings,# of item ratings,error
167566,18563,95,10.0,10,"{'actual_k': 28, 'was_impossible': False}",28,11730,0.0
35824,9796,116,10.0,10,"{'actual_k': 40, 'was_impossible': False}",70,8863,0.0
10957,14911,124,10.0,10,"{'actual_k': 38, 'was_impossible': False}",38,8480,0.0
109836,29646,81,10.0,10,"{'actual_k': 40, 'was_impossible': False}",72,5863,0.0
27033,22315,82,10.0,10,"{'actual_k': 23, 'was_impossible': False}",23,5772,0.0
16321,36302,98,10.0,10,"{'actual_k': 40, 'was_impossible': False}",66,8917,0.0
160131,26233,40,10.0,10,"{'actual_k': 14, 'was_impossible': False}",14,8852,0.0
153631,18004,56,10.0,10,"{'actual_k': 16, 'was_impossible': False}",16,8560,0.0
115447,30270,37,10.0,10,"{'actual_k': 36, 'was_impossible': False}",36,8093,0.0
133089,901,125,10.0,10,"{'actual_k': 40, 'was_impossible': False}",70,5530,0.0


In [34]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,# of user ratings,# of item ratings,error
124131,30357,2,-10.0,10,"{'actual_k': 14, 'was_impossible': False}",14,21759,20.0
20572,35932,5,-10.0,10,"{'actual_k': 13, 'was_impossible': False}",13,21833,20.0
142120,7282,8,-10.0,10,"{'actual_k': 6, 'was_impossible': False}",6,22054,20.0
1564,23181,4,-10.0,10,"{'actual_k': 12, 'was_impossible': False}",12,21794,20.0
71709,40318,31,-10.0,10,"{'actual_k': 40, 'was_impossible': False}",67,3562,20.0
163113,10871,132,-10.0,10,"{'actual_k': 40, 'was_impossible': False}",68,4801,20.0
12253,35911,134,-10.0,10,"{'actual_k': 18, 'was_impossible': False}",18,5196,20.0
13171,33532,2,-10.0,10,"{'actual_k': 24, 'was_impossible': False}",24,21759,20.0
163703,2483,2,-10.0,10,"{'actual_k': 40, 'was_impossible': False}",62,21759,20.0
41665,23305,123,-10.0,10,"{'actual_k': 40, 'was_impossible': False}",62,4948,20.0


In [35]:

def get_top_n(predictions, n=5):


    # First map the predictions to each user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Predicting ratings for all pairs (u, i) that are NOT in the training set
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions)

# Printing the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
34511 [39, 105, 25, 9, 3]
2314 [39, 105, 25, 3, 117]
852 [39, 105, 25, 9, 3]
20675 [39, 105, 25, 9, 117]
36282 [39, 105, 9, 117, 6]
27479 [39, 105, 25, 9, 3]
11118 [39, 105, 117, 44, 96]
2518 [39, 25, 9, 3, 117]
34504 [39, 105, 25, 9, 117]
3841 [39, 105, 25, 117, 6]
25517 [39, 105, 25, 117, 104]
36817 [39, 105, 25, 117, 103]
2082 [39, 105, 25, 9, 3]
35648 [39, 105, 25, 3, 117]
14571 [39, 105, 25, 9, 117]
5167 [105, 25, 117, 6, 104]
21834 [39, 105, 25, 117, 104]
9266 [39, 105, 25, 104, 44]
18995 [39, 105, 25, 9, 3]
24536 [39, 105, 25, 117, 6]
14253 [39, 105, 3, 117, 6]
17282 [39, 105, 3, 117, 6]
21500 [39, 105, 25, 117, 6]
20300 [39, 105, 25, 9, 3]
2366 [39, 105, 25, 3, 117]
9512 [39, 105, 25, 117, 44]
7515 [39, 105, 25, 9, 117]
1717 [39, 105, 25, 9, 3]
32150 [39, 105, 25, 44, 103]
38799 [39, 105, 25, 3, 117]
5317 [39, 105, 25, 9, 3]
28514 [39, 105, 25, 117, 104]
15267 [39, 105, 25, 117, 104]
39327 [105, 25, 3, 117, 44]
84