# User-Based Collaborative Filtering - Cosine Similarity

## 1. Set-up
import dependent packages and declare consts

In [1]:
# package initialization

import pandas as pd
import numpy as np
from surprise import BaselineOnly, KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from sklearn import metrics

In [18]:
# consts

# DATASET = '../../data/extracted/LON-A/London_Attractions_Complete_Review.csv'
DATASET = '../../data/extracted/NYC-R/New_York_City_Restaurant_Complete_Review.csv'
OCCURENCE_THRESHOLD = 5

## 2. Read Dataset
read dataset in csv format into pandas.DataFrame

In [19]:
# read dataset

df = pd.read_csv(DATASET, sep='\t')

In [20]:
# print dataset information

print("Columns: \n", list(df.columns))
print("\nShape: \n", df.shape)

Columns: 
 ['Unnamed: 0', 'Unnamed: 0.1', 'rtime', 'rquote', 'iid', 'rrate', 'rid', 'uage', 'ucity', 'ucountry', 'ugender', 'uhometown', 'uid_index', 'ulevel', 'uname_y', 'usince', 'ustate', 'ustyle', 'iattribute', 'ilocality', 'iname', 'ipopularity', 'ipost', 'iprice', 'irating', 'iregion', 'istreet', 'itag']

Shape: 
 (129964, 28)


## 3. Data Preprocessing

* Retain users/items with at least five ratings only
* Data splitting
  - the latest 20% interactions (by time)
  - Randomly split the remaining data into training (70%) and validation (10%) sets
* Transform the ratings into binary implicit feedback as ground truth, indicating whether the user has interacted with the specific item

In [5]:
def sort_by_time(df):
    
    # here we use 'rid' for sorting becaz it's auto incrementing
    return df.sort_values(by=['rid'], ascending=True)

In [6]:
def filter_by_occurrence(df, column, threshold):
    return df.groupby(column).filter(lambda x: len(x) >= threshold)

In [7]:
def convert_binary(df):
    df.loc[df['rrate'] != "None", 'rrate'] = 1.0
    df.loc[df['rrate'] == "None", 'rrate'] = 0.0
    return df

In [8]:
def data_preprocess(dataframe):
    
    # sort by time (ascending order)
    df = sort_by_time(dataframe)
    
    # retrieve needed columns
    df = df[['uid_index', 'iid', 'rrate']]
    
    # convert ratings into binarys
    df = convert_binary(df)
    
    df['rrate'] = pd.to_numeric(df['rrate'])
    
    # Retain users/items with at least five ratings only
    df = filter_by_occurrence(df, 'iid', OCCURENCE_THRESHOLD)
    df = filter_by_occurrence(df, 'uid_index', OCCURENCE_THRESHOLD)
    
    # split dataset into training set, validation set and test set
    users = df.groupby('uid_index')
    
    test_df = pd.DataFrame()
    train_validation_df = pd.DataFrame()
    
    # for each user, get its latest 20% rating as test set
    for uid in users.size().to_dict().keys():
        user = users.get_group(uid)
        split_idx = int(len(user)*0.8)
        test_df = test_df.append(user.iloc[split_idx:])
        train_validation_df = train_validation_df.append(user.iloc[:split_idx])
    
    train_validation_df = train_validation_df.reindex(np.random.permutation(train_validation_df.index)) # shuffle
    train_df = train_validation_df.iloc[:int(len(train_validation_df)*0.875)]
    validation_df = train_validation_df.iloc[int(len(train_validation_df)*0.875):]
    
    return (train_df, validation_df, test_df)

In [21]:
# dataset preprocessing

train_df, validation_df, test_df = data_preprocess(df)
print("training set size: ", train_df.shape)
print("validation set size: ", validation_df.shape)
print("test set size: ", test_df.shape)

training set size:  (80722, 3)
validation set size:  (11532, 3)
test set size:  (29517, 3)


## 4. Load into Surprise

In [22]:
reader = Reader(rating_scale=(0, 1))
train_dataset = Dataset.load_from_df(train_df, reader).build_full_trainset()

## 5. Model declaration & Fitting

Collaborative Filtering - KNN:

\begin{equation}
\hat{r}_{ui} = \frac{
\sum\limits_{j \in N^k_u(i)} \text{sim}(i, j) \cdot r_{uj}}
{\sum\limits_{j \in N^k_u(i)} \text{sim}(i, j)}
\end{equation}

**Parameters**
- `k` – The max number of neighbors to take into account for aggregation, defaults to `40`

In [11]:
# compute consine similarity between users
sim_options = {'name': 'cosine', 'user_based': True}

K_MAX = 40

In [12]:
algorithm = KNNBasic(k=K_MAX, sim_options=sim_options)

In [13]:
algorithm.fit(train_dataset)

Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1ce93a1bd88>

## 6. Prediction

In [11]:
def predict(algorithm, dataframe):
    z = []
    y = []
    for i in range(len(dataframe)):
        user = dataframe.iloc[i][0]
        item = dataframe.iloc[i][1]
        rating = dataframe.iloc[i][2]
        prediction = algorithm.predict(user, item, r_ui=rating, verbose=False)
        
        if prediction.details['was_impossible'] == False:
            z.append(prediction.est)
            y.append(rating)
    return (np.array(z, dtype=np.float32), np.array(y, dtype=np.int))

In [15]:
validation_z, validation_y = predict(algorithm, validation_df)

In [16]:
test_z, test_y = predict(algorithm, test_df)

## 7. Evaluation

AUC metric

In [12]:
def evaluate_auc(z, y):
    return metrics.roc_auc_score(y, z)

In [18]:
print("validation AUC: ", evaluate_auc(validation_z, validation_y))
print("test AUC: ", evaluate_auc(test_z, test_y))

validation AUC:  0.9990755580317022
test AUC:  0.9965578358481358


LogLoss metric

In [13]:
# assume parameters z & y are ndarray
def evaluate_logloss(z, y):
    zz = np.ones((z.shape[0], 2))
    zz[:, 0] -= z
    zz[:, 1] = z
    return metrics.log_loss(y, zz)

In [20]:
print("validation LogLoss: ", evaluate_logloss(validation_z, validation_y))
print("test LogLoss: ", evaluate_logloss(test_z, test_y))

validation LogLoss:  0.030589202335382898
test LogLoss:  0.07819096228111398


NDCG metric

In [14]:
# assume parameters z & y are ndarray
def evaluate_ndcg(z, y):
    return metrics.ndcg_score(np.expand_dims(y, axis=0), np.expand_dims(z, axis=0), k=5)

In [22]:
print("validation NDCG@5: ", evaluate_ndcg(validation_z, validation_y))
print("test NDCG@5: ", evaluate_ndcg(test_z, test_y))

validation NDCG@5:  0.9998972356386803
test NDCG@5:  0.9991284022027651


## Experiments

In [15]:
def train_ks(start=3, end=20, step=1):

    history = []
    for k in range(start, end+step, step):
        
        print("Using K =", k)
        sim_options = {'name': 'cosine', 'user_based': False}
        algorithm = KNNBasic(k=k, sim_options=sim_options)
        
        algorithm.fit(train_dataset)
        validation_z, validation_y = predict(algorithm, validation_df)
        test_z, test_y = predict(algorithm, test_df)
        
        history.append({
            'k': k,
            'val_auc': evaluate_auc(validation_z, validation_y),
            'test_auc': evaluate_auc(test_z, test_y),
            'val_logloss': evaluate_logloss(validation_z, validation_y),
            'test_logloss': evaluate_logloss(test_z, test_y),
            'val_ndcg': evaluate_ndcg(validation_z, validation_y),
            'test_ndcg': evaluate_ndcg(test_z, test_y)
        })
    return history

In [23]:
history = train_ks(start=3, end=30, step=1)

Using K = 3
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 4
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 5
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 6
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 7
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 8
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 9
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 10
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 11
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 12
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 13
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 14
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 15
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 16
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 17
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 18
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 19
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 20
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 21
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 22
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 23
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 24
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 25
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 26
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 27
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 28
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 29
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Using K = 30
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.


In [24]:
print("| K | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |")
print("|:-- | -- | -- | -- | -- | -- | -- |")
for his in history:
    print("| k={} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |".format(
        his['k'],
        his['val_auc'],
        his['val_logloss'],
        his['val_ndcg'],
        his['test_auc'],
        his['test_logloss'],
        his['test_ndcg'],
    ))

| K | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| k=3 | 0.87087 | 0.19725 | 0.99563 | 0.95051 | 0.10281 | 0.99832 |
| k=4 | 0.89208 | 0.19477 | 0.99567 | 0.95869 | 0.09371 | 0.99834 |
| k=5 | 0.89979 | 0.19321 | 0.99570 | 0.96278 | 0.09050 | 0.99835 |
| k=6 | 0.90913 | 0.18589 | 0.99591 | 0.96559 | 0.09036 | 0.99835 |
| k=7 | 0.90873 | 0.19237 | 0.99571 | 0.96730 | 0.08763 | 0.99835 |
| k=8 | 0.91617 | 0.18576 | 0.99590 | 0.96846 | 0.08754 | 0.99835 |
| k=9 | 0.91793 | 0.19190 | 0.99571 | 0.96914 | 0.08741 | 0.99835 |
| k=10 | 0.92028 | 0.19175 | 0.99571 | 0.96951 | 0.08984 | 0.99835 |
| k=11 | 0.92248 | 0.19133 | 0.99572 | 0.96983 | 0.08977 | 0.99836 |
| k=12 | 0.92454 | 0.19118 | 0.99572 | 0.97015 | 0.08969 | 0.99836 |
| k=13 | 0.92502 | 0.19118 | 0.99572 | 0.97046 | 0.08964 | 0.99836 |
| k=14 | 0.92838 | 0.18447 | 0.99592 | 0.97330 | 0.08467 | 0.99851 |
| k=15 | 0.92553 | 0.19097 | 

## Experiment Results

LON-A dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| **k=3*** | 0.49596 | 2.91065 | 0.91598 | 0.50762 | 2.85695 | 0.91625 |
| k=4 | 0.48061 | 2.96409 | 0.91064 | 0.48836 | 2.85471 | 0.90929 |
| k=5 | 0.47056 | 2.97350 | 0.90730 | 0.47924 | 2.84698 | 0.90612 |
| k=6 | 0.45843 | 3.00391 | 0.90385 | 0.47298 | 2.83998 | 0.90377 |
| k=7 | 0.45564 | 2.98360 | 0.90260 | 0.46843 | 2.83392 | 0.90174 |
| k=8 | 0.44970 | 2.98132 | 0.90084 | 0.46186 | 2.82954 | 0.89948 |
| k=9 | 0.44448 | 2.98455 | 0.89912 | 0.45849 | 2.82626 | 0.89813 |
| k=10 | 0.44197 | 2.97846 | 0.89787 | 0.45565 | 2.82376 | 0.89705 |

NYC-R dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| k=3 | 0.87087 | 0.19725 | 0.99563 | 0.95051 | 0.10281 | 0.99832 |
| k=4 | 0.89208 | 0.19477 | 0.99567 | 0.95869 | 0.09371 | 0.99834 |
| k=5 | 0.89979 | 0.19321 | 0.99570 | 0.96278 | 0.09050 | 0.99835 |
| k=6 | 0.90913 | 0.18589 | 0.99591 | 0.96559 | 0.09036 | 0.99835 |
| k=7 | 0.90873 | 0.19237 | 0.99571 | 0.96730 | 0.08763 | 0.99835 |
| k=8 | 0.91617 | 0.18576 | 0.99590 | 0.96846 | 0.08754 | 0.99835 |
| k=9 | 0.91793 | 0.19190 | 0.99571 | 0.96914 | 0.08741 | 0.99835 |
| k=10 | 0.92028 | 0.19175 | 0.99571 | 0.96951 | 0.08984 | 0.99835 |
| k=11 | 0.92248 | 0.19133 | 0.99572 | 0.96983 | 0.08977 | 0.99836 |
| k=12 | 0.92454 | 0.19118 | 0.99572 | 0.97015 | 0.08969 | 0.99836 |
| k=13 | 0.92502 | 0.19118 | 0.99572 | 0.97046 | 0.08964 | 0.99836 |
| k=14 | 0.92838 | 0.18447 | 0.99592 | 0.97330 | 0.08467 | 0.99851 |
| k=15 | 0.92553 | 0.19097 | 0.99572 | 0.97259 | 0.08953 | 0.99843 |
| k=16 | 0.92698 | 0.19084 | 0.99572 | 0.97322 | 0.08702 | 0.99843 |
| k=17 | 0.92698 | 0.19080 | 0.99572 | 0.97322 | 0.08700 | 0.99843 |
| k=18 | 0.92740 | 0.19087 | 0.99572 | 0.97329 | 0.08699 | 0.99843 |
| k=19 | 0.92787 | 0.19078 | 0.99572 | 0.97335 | 0.08696 | 0.99843 |
| k=20 | 0.92832 | 0.19074 | 0.99572 | 0.97342 | 0.08694 | 0.99843 |
| k=21 | 0.92832 | 0.19073 | 0.99572 | 0.97355 | 0.08691 | 0.99843 |
| k=22 | 0.92878 | 0.19070 | 0.99572 | 0.97374 | 0.08689 | 0.99843 |
| k=23 | 0.92878 | 0.19069 | 0.99572 | 0.97386 | 0.08687 | 0.99843 |
| k=24 | 0.92878 | 0.19068 | 0.99572 | 0.97386 | 0.08687 | 0.99843 |
| k=25 | 0.92922 | 0.19064 | 0.99572 | 0.97386 | 0.08687 | 0.99843 |
| k=26 | 0.92921 | 0.19065 | 0.99572 | 0.97392 | 0.08685 | 0.99843 |
| k=27 | 0.92921 | 0.19065 | 0.99572 | 0.97392 | 0.08686 | 0.99843 |
| k=28 | 0.93009 | 0.19058 | 0.99572 | 0.97404 | 0.08685 | 0.99843 |
| k=29 | 0.93009 | 0.19058 | 0.99572 | 0.97404 | 0.08685 | 0.99843 |
| **k=30*** | 0.93009 | 0.19058 | 0.99572 | 0.97404 | 0.08684 | 0.99843 |