In [56]:
import pandas as pd
rating_csv = "ratings.csv"
rating_df = pd.read_csv(rating_csv)

In [57]:
rating_df.head()

Unnamed: 0,user,item,rating
0,1889878,CC0101EN,3.0
1,1342067,CL0101EN,3.0
2,1990814,ML0120ENv3,3.0
3,380098,BD0211EN,3.0
4,779563,DS0101EN,3.0


In [58]:
#Note that this matrix is presented as the dense or vertical form, and you may convert it to a sparse matrix using pivot :
rating_sparse_df = rating_df.pivot(index='user', columns='item', values='rating').fillna(0).reset_index().rename_axis(index=None, columns=None)
rating_sparse_df.head()


Unnamed: 0,user,AI0111EN,BC0101EN,BC0201EN,BC0202EN,BD0101EN,BD0111EN,BD0115EN,BD0121EN,BD0123EN,...,SW0201EN,TA0105,TA0105EN,TA0106EN,TMP0101EN,TMP0105EN,TMP0106,TMP107,WA0101EN,WA0103EN
0,2,0.0,3.0,0.0,0.0,3.0,2.0,0.0,2.0,2.0,...,0.0,2.0,0.0,3.0,0.0,2.0,2.0,0.0,3.0,0.0
1,4,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,...,0.0,2.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,2.0
2,5,2.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,2.0,...,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [201]:
#Implementation Option 1: Use Surprise library 

In [None]:
!pip install scikit-surprise==1.1.1

In [59]:
from surprise import KNNBasic
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy


In [60]:
# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k', prompt=False)


In [61]:
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)


In [62]:
# We'll use the famous KNNBasic algorithm.
algo = KNNBasic()


In [63]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)


Computing the msd similarity matrix...
Done computing similarity matrix.


In [64]:
# Then compute RMSE# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.rmse(predictions)


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9808


0.9808236064676727

In [65]:
#let's load our own course rating dataset
rating_df.to_csv("course_ratings.csv", index=False)

In [66]:
# Read the course rating dataset with columns user item rating
reader = Reader(
        line_format='user item rating', sep=',', skip_lines=1, rating_scale=(2, 3))
coruse_dataset = Dataset.load_from_file("course_ratings.csv", reader=reader)


In [67]:
#We split it into trainset and testset:
trainset, testset = train_test_split(coruse_dataset, test_size=.3)

In [68]:
#check how many users and items we can use to fit a KNN model
print(f"Total {trainset.n_users} users and {trainset.n_items} items in the trainingset")

Total 31329 users and 126 items in the trainingset


In [69]:
#TASK: Perform KNN-based collaborative filtering on the user-item interaction matrix
#TODO: Fit the KNN-based collaborative filtering model using the trainset and evaluate the results using the testset:

In [70]:
#Define a KNNBasic() model
model_knn = KNNBasic()


In [71]:
# Define the KNNBasic model with cosine similarity measure
model_knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})


In [72]:
#Fit the model on the trainset
model_knn.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1f98543c880>

In [73]:
#Use the testset to make predictions
predictions = model_knn.test(testset)

In [74]:
#Calculate RMSE
rmse = accuracy.rmse(predictions)

RMSE: 0.1937


In [75]:
#Implementation Option 2: Use numpy, pandas, and sklearn

In [111]:
import pandas as pd

df = pd.read_csv("course_ratings.csv")

df

Unnamed: 0,user,item,rating
0,1889878,CC0101EN,3.0
1,1342067,CL0101EN,3.0
2,1990814,ML0120ENv3,3.0
3,380098,BD0211EN,3.0
4,779563,DS0101EN,3.0
...,...,...,...
233301,1540125,DS0101EN,3.0
233302,1250651,PY0101EN,3.0
233303,1003832,CB0105ENv1,3.0
233304,922065,BD0141EN,3.0


In [112]:
rating_sparse_df = df.pivot(index='user', columns='item', values='rating').fillna(0).reset_index().rename_axis(index=None, columns=None)

rating_sparse_df

Unnamed: 0,user,AI0111EN,BC0101EN,BC0201EN,BC0202EN,BD0101EN,BD0111EN,BD0115EN,BD0121EN,BD0123EN,...,SW0201EN,TA0105,TA0105EN,TA0106EN,TMP0101EN,TMP0105EN,TMP0106,TMP107,WA0101EN,WA0103EN
0,2,0.0,3.0,0.0,0.0,3.0,2.0,0.0,2.0,2.0,...,0.0,2.0,0.0,3.0,0.0,2.0,2.0,0.0,3.0,0.0
1,4,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,...,0.0,2.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,2.0
2,5,2.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,2.0,...,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33896,2102054,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33897,2102356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33898,2102680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33899,2102983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm = 'brute', n_neighbors=30, n_jobs=-1)

model_knn.fit(rating_sparse_df)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=30)

In [98]:
import numpy as np
distances, indices = model_knn.kneighbors(rating_sparse_df.to_numpy(), n_neighbors=30)



In [99]:
distances



array([[0.00000000e+00, 4.29825472e-01, 4.53433699e-01, ...,
        8.86858818e-01, 8.89077104e-01, 8.89817181e-01],
       [0.00000000e+00, 4.29825472e-01, 5.27622507e-01, ...,
        6.86899361e-01, 6.90611026e-01, 6.91491224e-01],
       [0.00000000e+00, 4.53433699e-01, 5.89023528e-01, ...,
        6.85619350e-01, 6.85716520e-01, 6.85727375e-01],
       ...,
       [1.11022302e-16, 4.52426985e-12, 4.56978899e-12, ...,
        7.26696481e-12, 7.57782725e-12, 7.58948460e-12],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [100]:
indices

array([[    0,     1,     2, ...,    12,    41,    77],
       [    1,     0,     5, ...,    31,    77,    41],
       [    2,     0,     1, ...,    90,    35,    19],
       ...,
       [33898, 33742, 32411, ..., 29788, 33894, 33809],
       [33783, 33773, 33713, ..., 33781, 33778, 33779],
       [33856, 33854, 33774, ..., 33787, 33783, 33699]], dtype=int64)

In [101]:
def get_rating(userid, itemid):
    # list of k nearset neigbors to user id
    idx = rating_sparse_df[rating_sparse_df.user==userid].index.values[0]
    list_k_neighbors_indx = indices[idx,:]
    user_id= rating_sparse_df[rating_sparse_df.index.isin(list_k_neighbors_indx)].user.tolist()
    # get their corrlation with user
    corr = distances[idx,:]
    # get their rating for that itemid
    score_item = rating_sparse_df[['user',itemid]]
    selected = score_item[score_item.user.isin(user_id)][itemid].tolist()
#     r = selected[(selected.user != userid) & (selected[itemid]!=0)]
#     i = r.index
    df= pd.DataFrame({'user_id':user_id,'rating':selected,'correlation':corr})
    df = df[(df['user_id'] != userid) & (df['rating'] != 0)]
    df['score']= df.apply(lambda x: x['rating'] * x['correlation'], axis=1)
    num = df['score'].sum()
    deno = df['correlation'].sum()
    if int(deno) == 0:
        return None
    final_score = num/deno
    return final_score

In [106]:
def rmse_error(y_hat, y_true):
    y_hat = np.array(y_hat)
    y_true = np.array(y_true)
    return np.sqrt(((y_true-y_hat)**2).mean())

In [107]:
rmse_error(y_hat, y)


0.6150402642813929