In [42]:
import numpy as np
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans

In [2]:
ratings_dict = {
    'item':[1,2,1,2,1,2,1,2,1],
    'user':['A','A','B','B','C','C','D','D','E'],
    'rating':[1,2,2,4,2.5,4,4.5,5,3]
}

In [3]:
ratings_dict

{'item': [1, 2, 1, 2, 1, 2, 1, 2, 1],
 'user': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
 'rating': [1, 2, 2, 4, 2.5, 4, 4.5, 5, 3]}

In [5]:
df = pd.DataFrame(ratings_dict)

In [8]:
df

Unnamed: 0,item,user,rating
0,1,A,1.0
1,2,A,2.0
2,1,B,2.0
3,2,B,4.0
4,1,C,2.5
5,2,C,4.0
6,1,D,4.5
7,2,D,5.0
8,1,E,3.0


In [7]:
reader = Reader(rating_scale=(1,5))

In [9]:
data = Dataset.load_from_df(df[['user','item','rating']],reader)

- To use item based cosine similarity
- Name contains the similarity metric to use. Options are cosine, msd, pearson or pearson_baseline. The default is msd

- user based is a boolean that tells whether the approach will be user-based or item-based. The default is true, which means the user-based approach will be used.
- min_support is the minimum number of common items needed between users to consider them for similarity
- For the item-based approach, this corresponds to the minimum number of common users for two items.

In [12]:
sim_options = {
    'name':'cosine',
    'user_based':False # Compute similarities between items
}

In [13]:
algo = KNNWithMeans(sim_options=sim_options)

# Build training data

In [14]:
training_data = data.build_full_trainset()

In [15]:
algo.fit(training_data)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fbba139d570>

In [16]:
pred = algo.predict('A',3)

In [17]:
pred

Prediction(uid='A', iid=3, r_ui=None, est=3.111111111111111, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [18]:
pred.est

3.111111111111111

In [19]:
pred = algo.predict('A',2)

In [20]:
pred

Prediction(uid='A', iid=2, r_ui=None, est=2.0738403438383686, details={'actual_k': 2, 'was_impossible': False})

In [21]:
pred.est

2.0738403438383686

---

In [23]:
sim_options = {
    'name':'pearson',
    'user_based':True # Compute similarities between items
}

In [24]:
algo = KNNWithMeans(sim_options=sim_options)

In [25]:
training_data = data.build_full_trainset()

In [26]:
algo.fit(training_data)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fbba1151480>

In [30]:
pred = algo.predict('E',2)

In [31]:
pred

Prediction(uid='E', iid=2, r_ui=None, est=3.0, details={'actual_k': 0, 'was_impossible': False})

In [32]:
pred.est

3.0

---

# Building collaborative filtering with the built-in data set movielens

In [33]:
data = Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] 

 


Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /home/eshaan/.surprise_data/ml-100k


In [34]:
df = pd.DataFrame(data.raw_ratings)

In [35]:
df.head()

Unnamed: 0,0,1,2,3
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [38]:
df = df[[0,1,2]]

In [40]:
df.columns = ['user_id','item_id','rating']

In [41]:
df.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0


## Splitting into train and test

In [43]:
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

In [44]:
reader = Reader(rating_scale=(1,5))

In [51]:
data = Dataset.load_from_df(train,reader)

In [57]:
sim_options = {
    'name':'cosine',
    'user_based':False # Compute similarities between items
}

In [58]:
algo = KNNWithMeans(sim_options=sim_options)

In [59]:
training_data = data.build_full_trainset()

In [60]:
algo.fit(training_data)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fbb9d3e7d30>

In [67]:
pred = algo.predict('500','1600')

In [68]:
pred

Prediction(uid='500', iid='1600', r_ui=None, est=3.309642727088844, details={'actual_k': 40, 'was_impossible': False})

In [69]:
pred.est

3.309642727088844

In [71]:
predicted = []
for _, row in test.iterrows():
    predicted.append(algo.predict(row.user_id,row.item_id))

In [75]:
test['predicted'] = [i.est for i in predicted]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predicted'] = [i.est for i in predicted]


In [77]:
test

Unnamed: 0,user_id,item_id,rating,predicted
0,196,242,3.0,4.140810
7,253,465,5.0,3.800859
18,291,1042,4.0,3.595976
20,119,392,4.0,4.100358
22,299,144,4.0,3.631697
...,...,...,...,...
99942,363,181,5.0,3.554705
99949,823,134,5.0,4.727691
99953,655,913,4.0,3.530915
99984,654,370,2.0,2.965667


In [84]:
test['error'] = test['rating'] - test['predicted']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['error'] = test['rating'] - test['predicted']


In [88]:
test['error'] = np.square(test['error'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['error'] = np.square(test['error'])


In [92]:
r_mean_abs_error = np.sqrt(np.mean(test['error']))

In [93]:
r_mean_abs_error

0.9384361943355847

In [87]:
test.head(10)

Unnamed: 0,user_id,item_id,rating,predicted,error
0,196,242,3.0,4.14081,-1.14081
7,253,465,5.0,3.800859,1.199141
18,291,1042,4.0,3.595976,0.404024
20,119,392,4.0,4.100358,-0.100358
22,299,144,4.0,3.631697,0.368303
23,291,118,2.0,3.592201,-1.592201
29,160,234,5.0,4.104074,0.895926
32,225,193,4.0,4.635102,-0.635102
38,276,796,1.0,3.247553,-2.247553
42,201,979,2.0,2.658667,-0.658667
