In [1]:
# package initialization

import pandas as pd
import numpy as np
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

In [2]:
# consts

# Path to the review dataset. It is loaded with a tab separator even though the
# file name ends in .csv.
DATASET = '../../data/extracted/LON-A/London_Attractions_Complete_Review.csv'
# Minimum number of rows an item / a user must have to survive the occurrence
# filter in data_preprocess.
# NOTE: the identifier is spelled "OCCURENCE"; data_preprocess references this
# exact spelling, so a rename has to touch both places.
OCCURENCE_THRESHOLD = 5

In [14]:
# read dataset

# Parsed as tab-separated text (sep='\t') despite the .csv extension of DATASET.
df = pd.read_csv(DATASET, sep='\t')

In [15]:
# print dataset information

# Show the column labels and the (rows, columns) shape of the raw frame.
print("Columns: \n", list(df.columns))
print("\nShape: \n", df.shape)

Columns: 
 ['Unnamed: 0', 'Unnamed: 0.1', 'iid', 'rid', 'rimages', 'rquote', 'rrate', 'rtime', 'uprofile', 'uage', 'ucity', 'ucountry', 'ugender', 'uhometown', 'uid_index', 'ulevel', 'uname_y', 'usince', 'ustate', 'ustyle', 'iattribute', 'ilocality', 'iname', 'ipopularity', 'ipost', 'irating', 'iregion', 'istreet', 'itag']

Shape: 
 (136978, 29)


In [16]:
def sort_by_time(df):
    """Return the rows of ``df`` ordered by ascending ``rtime``.

    A new frame is returned; the input is left untouched and the original
    index labels travel with their rows.
    """
    ordered = df.sort_values('rtime')
    return ordered

In [17]:
def filter_by_occurrence(df, column, threshold):
    """Keep only the rows whose ``column`` value occurs at least ``threshold`` times.

    Rows are grouped by ``column``; any group with fewer than ``threshold``
    rows is dropped as a whole.
    """
    def is_frequent(group):
        return threshold <= len(group)

    grouped = df.groupby(column)
    return grouped.filter(is_frequent)

In [18]:
def filter_none(df):
    """Drop the rows whose ``rrate`` equals the literal string ``"None"``.

    Only that text marker is matched: rows where ``rrate`` is any other
    value (including a missing value such as NaN) are kept.
    """
    keep = df['rrate'].ne("None")
    return df[keep]

In [19]:
def data_preprocess(df, train_validation_fraction=0.8, train_fraction=0.875, seed=None):
    """Clean the review frame and split it into train / validation / test sets.

    Steps, in order:
      1. Sort the rows by ascending ``rtime`` (``sort_by_time``).
      2. Keep only the ``uid_index``, ``iid`` and ``rrate`` columns, in that order.
      3. Drop rows whose rating is the string ``"None"`` (``filter_none``).
      4. Keep items, then users, that have at least ``OCCURENCE_THRESHOLD`` rows.
         This is a single pass: item counts are not re-checked after the user
         filter, so an item may end up below the threshold.
      5. The trailing part of the time-ordered rows becomes the test set; the
         leading part is shuffled and cut into training and validation sets.

    With the default fractions the result is roughly a 70% / 10% / 20% split
    (0.8 * 0.875 = 0.7).

    Args:
        df: raw review frame containing at least ``rtime``, ``uid_index``,
            ``iid`` and ``rrate``.
        train_validation_fraction: share of the filtered, time-ordered rows
            that goes to training + validation; the rest is the test set.
        train_fraction: share of the shuffled training + validation rows that
            goes to training; the rest is the validation set.
        seed: optional seed for the shuffle. ``None`` (default) draws from
            NumPy's global random state, exactly as before; pass an integer
            to make the train / validation membership reproducible.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``.

    Raises:
        ValueError: if either fraction lies outside ``[0, 1]``.
    """
    for label, fraction in (('train_validation_fraction', train_validation_fraction),
                            ('train_fraction', train_fraction)):
        if not 0 <= fraction <= 1:
            raise ValueError("%s must be within [0, 1], got %r" % (label, fraction))

    # sort by time (ascending order)
    df = sort_by_time(df)

    # retrieve needed columns
    df = df[['uid_index', 'iid', 'rrate']]

    # filter out rows whose rating is the string "None"
    df = filter_none(df)

    # retain items, then users, with at least OCCURENCE_THRESHOLD ratings (single pass)
    df = filter_by_occurrence(df, 'iid', OCCURENCE_THRESHOLD)
    df = filter_by_occurrence(df, 'uid_index', OCCURENCE_THRESHOLD)

    # the test set is the tail of the time-ordered rows; compute the cut once
    test_start = int(len(df) * train_validation_fraction)
    test_df = df.iloc[test_start:]
    train_validation_df = df.iloc[:test_start]

    # shuffle the remaining rows before cutting training / validation
    rng = np.random if seed is None else np.random.RandomState(seed)
    shuffled_index = rng.permutation(train_validation_df.index)
    train_validation_df = train_validation_df.reindex(shuffled_index)

    validation_start = int(len(train_validation_df) * train_fraction)
    train_df = train_validation_df.iloc[:validation_start]
    validation_df = train_validation_df.iloc[validation_start:]

    return (train_df, validation_df, test_df)

In [20]:
# dataset preprocessing

# Each returned frame holds the three columns uid_index, iid, rrate.
# The train / validation shuffle inside data_preprocess draws from NumPy's
# global random state, so the membership of those two sets differs between
# runs unless that state is seeded.
train_df, validation_df, test_df = data_preprocess(df)
print("training set size: ", train_df.shape)
print("validation set size: ", validation_df.shape)
print("test set size: ", test_df.shape)

training set size:  (63162, 3)
validation set size:  (9024, 3)
test set size:  (18047, 3)


In [31]:
# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
# Build a trainset from every row of train_df (columns: uid_index, iid, rrate);
# nothing is held out at this stage.
train_dataset = Dataset.load_from_df(train_df, reader).build_full_trainset()

In [32]:
# Baseline predictor created with the library's default options (no arguments passed).
algorithm = BaselineOnly()

In [33]:
# Fit on the training set. The call's return value is the algorithm object,
# which is why its repr is echoed as the cell output below.
algorithm.fit(train_dataset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x15220822bc8>

In [36]:
def predict(algorithm, dataframe):
    """Predict a rating for every row of ``dataframe`` and pair it with the truth.

    The first three columns are read positionally as user id, item id and
    true rating. Ids are handed to ``algorithm.predict`` exactly as they are
    stored in the frame, which is also how ``Dataset.load_from_df`` reads raw
    ids when a trainset is built from a frame. Casting them to ``str`` (as
    this function previously did) makes ids that were loaded as numbers look
    unknown to the fitted model.

    Args:
        algorithm: fitted object exposing
            ``predict(uid, iid, r_ui=..., verbose=...)`` whose result has
            ``est`` and ``details['was_impossible']``.
        dataframe: frame whose first three columns are user, item, rating.

    Returns:
        Tuple ``(z, y)`` of equal-length lists: ``z`` holds the estimates and
        ``y`` the corresponding true ratings as stored in the frame. Rows for
        which the prediction is flagged ``was_impossible`` are skipped.
    """
    z = []
    y = []
    # One pass over plain tuples instead of three positional row look-ups per row.
    rows = dataframe.iloc[:, :3].itertuples(index=False, name=None)
    for user, item, rating in rows:
        prediction = algorithm.predict(user, item, r_ui=rating, verbose=False)

        if not prediction.details['was_impossible']:
            z.append(prediction.est)
            y.append(rating)
    return (z, y)

In [37]:
# validation_z: estimated ratings; validation_y: the matching true ratings from validation_df.
validation_z, validation_y = predict(algorithm, validation_df)

In [38]:
# test_z: estimated ratings; test_y: the matching true ratings from test_df.
test_z, test_y = predict(algorithm, test_df)