In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the reviews written out by the review_filtering step; the bare `data`
# on the last line renders the frame as this cell's output.
# NOTE(review): the rendered output shows 'Unnamed: 0' and 'Unnamed: 0.1' columns,
# which look like row indexes written by earlier to_csv calls - confirm whether
# they are needed downstream.
data = pd.read_csv('filtered_reviews.csv')
data

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,text,date,name,city,state,category_1,category_2,category_3
0,2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4.0,The place is cute and the staff was very frien...,2017-08-08 00:58:18,Turning Point of North Wales,North Wales,PA,Restaurants,Breakfast & Brunch,Food
1,3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,We came on a Saturday morning after waiting a ...,2017-11-19 02:20:23,Turning Point of North Wales,North Wales,PA,Restaurants,Breakfast & Brunch,Food
2,10,wfJS0o2_absa6lYYuVnz4g,qdneUwCll9ADHOp7i3wq3Q,XQfwVwDr-v0ZS3_CbbE5Xw,4.0,This place is conveniently located at English ...,2017-05-14 14:20:27,Turning Point of North Wales,North Wales,PA,Restaurants,Breakfast & Brunch,Food
3,13,YX2cFHDxlUfGnQ8bHPq4cA,QD9xhB-261YIQIFI5sRBtQ,XQfwVwDr-v0ZS3_CbbE5Xw,1.0,Not Impressed at all. Ordered a omelette and b...,2017-07-20 17:22:04,Turning Point of North Wales,North Wales,PA,Restaurants,Breakfast & Brunch,Food
4,15,yT87_j7RyUTdHMuG0hvHXA,tqJxu8-N-PWiuZn30HTwng,XQfwVwDr-v0ZS3_CbbE5Xw,5.0,"Went here over the weekend, got my sugar rush...",2017-05-16 12:58:44,Turning Point of North Wales,North Wales,PA,Restaurants,Breakfast & Brunch,Food
...,...,...,...,...,...,...,...,...,...,...,...,...,...
259815,1123095,vQ8YcrMBv47IuMHpP0asKA,TLh0QKpUeT15PRSGI1uNWQ,kD03gNnQrHarCz97s1BNDQ,5.0,"Great, fresh and authentic tasting food! We ha...",2021-11-06 23:21:44,Mandola's Italian Kitchen,Tampa,FL,Restaurants,Grocery,Italian
259816,1123105,8wSRJxGG2Tc5vmwj4cs69w,bCn8JIdnXyEFROm9iqQVdg,kD03gNnQrHarCz97s1BNDQ,2.0,I visited on Sunday and had a bad experience w...,2021-10-31 02:38:45,Mandola's Italian Kitchen,Tampa,FL,Restaurants,Grocery,Italian
259817,1123106,sc-W-A8B1Hfy72Aujp9arw,Sp2GV7D-_JLZMPQmDanzPQ,kD03gNnQrHarCz97s1BNDQ,4.0,Came in for lunch with my friend. A bit confus...,2021-12-02 01:26:33,Mandola's Italian Kitchen,Tampa,FL,Restaurants,Grocery,Italian
259818,1123107,LF5nuJGuQYprbQZpAYJM_w,S_VUtqoT9eHYrcb4qTVdIw,kD03gNnQrHarCz97s1BNDQ,4.0,Had a great time at Mandolas for my sons birth...,2022-01-13 02:42:17,Mandola's Italian Kitchen,Tampa,FL,Restaurants,Grocery,Italian


In [3]:
def _stack_ratings(pieces):
    '''
    concatenate per-user slices into one ratings table
    @ pieces: list of DataFrames that contain user_id, business_id and stars
    returns a DataFrame with exactly those three columns and a fresh 0..k-1 index
    '''
    columns = ['user_id', 'business_id', 'stars']
    # empty slices appear when m or n is 0; leave them out of the concat
    non_empty = [piece for piece in pieces if len(piece) > 0]
    if not non_empty:
        return pd.DataFrame({column: [] for column in columns})
    return pd.concat(non_empty, ignore_index=True)[columns]


def _fill_rating_matrix(ratings_df, user_pos, busi_pos, shape):
    '''
    build a dense user x business array holding the stars of ratings_df
    @ ratings_df: DataFrame with user_id, business_id and stars columns
    @ user_pos: dict mapping user_id -> row index
    @ busi_pos: dict mapping business_id -> column index
    @ shape: (number of users, number of businesses)
    '''
    matrix = np.zeros(shape=shape)
    triples = zip(ratings_df['user_id'], ratings_df['business_id'], ratings_df['stars'])
    # explicit loop: if a (user, business) pair occurs more than once, the later row wins
    for user_id, busi_id, stars in triples:
        matrix[user_pos[user_id], busi_pos[busi_id]] = stars
    return matrix


def _sparsity_percent(matrix):
    '''percentage of zero entries in matrix (nan when the matrix has no entries)'''
    if matrix.size == 0:
        return float('nan')
    return (1 - np.count_nonzero(matrix) / matrix.size) * 100


def train_valid_test_split(data, m, n):

    '''
    split every user's ratings chronologically into train / validation / test
    and build one rating matrix per split
    the rows of each matrix represent user_id
    the columns of each matrix represent business_id
    the values are the corresponding ratings (0 where there is no rating)

    for a user with more than m + n ratings, ordered by date:
        test set       : the last n ratings
        validation set : the m ratings before those
        training set   : everything earlier
    a user with m + n ratings or fewer is not split; all of their ratings go
    to the training set (and therefore also to the train + validation set)

    @ data: filtered reviews with user_id, business_id, stars and date columns
            (assumes sorting the date column gives chronological order, e.g.
            ISO formatted strings or datetimes - TODO confirm for other inputs)
    @ m: count of ratings per user for validation (>= 0)
    @ n: count of ratings per user for test (>= 0)

    prints the user / business counts and the sparsity of the train,
    validation and test matrices

    returns train, valid, train + valid and test matrices (dense numpy arrays),
            train, valid, train + valid and test DataFrames
            (columns user_id, business_id, stars),
            user_id_lst (row order) and busi_id_lst (column order)
    raises ValueError if m or n is negative
    '''

    if m < 0 or n < 0:
        raise ValueError('m and n must be non-negative')

    user_id_lst = data['user_id'].unique().tolist() # rows of the rating matrices
    busi_id_lst = data['business_id'].unique().tolist() # columns of the rating matrices

    # dict lookups replace list.index(), which scanned the id list once per rating
    user_pos = {user_id: row for row, user_id in enumerate(user_id_lst)}
    busi_pos = {busi_id: col for col, busi_id in enumerate(busi_id_lst)}
    shape = (len(user_id_lst), len(busi_id_lst))

    train_parts = []
    valid_parts = []
    train_valid_parts = []
    test_parts = []

    # sort once, stably; groupby keeps the row order inside each group,
    # so every group_df below is already ordered by date
    ratings = data[['user_id', 'business_id', 'stars', 'date']].sort_values(by='date', kind='mergesort')

    for _, group_df in ratings.groupby('user_id'):
        total = len(group_df)

        if total > m + n:
            valid_start = total - m - n
            test_start = total - n
            train_parts.append(group_df.iloc[:valid_start, :])
            valid_parts.append(group_df.iloc[valid_start:test_start, :])
            train_valid_parts.append(group_df.iloc[:test_start, :])
            test_parts.append(group_df.iloc[test_start:, :])
        else:
            # too few ratings to hold any out: keep the whole history for training.
            # it is added to train + validation too, so that set stays a superset of train
            train_parts.append(group_df)
            train_valid_parts.append(group_df)

    train_df = _stack_ratings(train_parts)
    valid_df = _stack_ratings(valid_parts)
    train_valid_df = _stack_ratings(train_valid_parts)
    test_df = _stack_ratings(test_parts)

    train_sparse_matrix = _fill_rating_matrix(train_df, user_pos, busi_pos, shape)
    valid_sparse_matrix = _fill_rating_matrix(valid_df, user_pos, busi_pos, shape)
    train_valid_sparse_matrix = _fill_rating_matrix(train_valid_df, user_pos, busi_pos, shape)
    test_sparse_matrix = _fill_rating_matrix(test_df, user_pos, busi_pos, shape)

    # calculate sparsity of the matrices, as percentages
    # (only the percentages are scaled by 100; the rating matrices are returned unscaled)
    train_sparsity = _sparsity_percent(train_sparse_matrix)
    valid_sparsity = _sparsity_percent(valid_sparse_matrix)
    test_sparsity = _sparsity_percent(test_sparse_matrix)

    print (f'{len(user_id_lst)} users')
    print (f'{len(busi_id_lst)} business')

    print (f'Train_rating_matrix Sparsity: {round(train_sparsity,4)}%')
    print (f'Valid_rating_matrix Sparsity: {round(valid_sparsity,4)}%')
    print(f'Test_rating_matrix Sparsity:  {round(test_sparsity,4)}%')


    return train_sparse_matrix, valid_sparse_matrix, train_valid_sparse_matrix, test_sparse_matrix, \
           train_df, valid_df, train_valid_df, test_df, \
           user_id_lst, busi_id_lst

In [4]:
# Hold out each user's most recent rating for test (n=1) and the one before it
# for validation (m=1); everything earlier is used for training.
(
    train_sparse_matrix,
    valid_sparse_matrix,
    train_valid_sparse_matrix,
    test_sparse_matrix,
    train_df,
    valid_df,
    train_valid_df,
    test_df,
    user_id_lst,
    busi_id_lst,
) = train_valid_test_split(data=data, m=1, n=1)

25420 users
4312 business
Train_rating_matrix Sparsity: 99.8088%
Valid_rating_matrix Sparsity: 99.9771%
Test_rating_matrix Sparsity:  99.9771%


In [5]:
# Persist the rating matrices and the row / column id orderings as .npy files.
arrays_to_save = {
    'train_sparse_matrix.npy': train_sparse_matrix,
    'valid_sparse_matrix.npy': valid_sparse_matrix,
    'test_sparse_matrix.npy': test_sparse_matrix,
    'train_valid_sparse_matrix.npy': train_valid_sparse_matrix,
    'user_id_lst.npy': user_id_lst,
    'busi_id_lst.npy': busi_id_lst,
}
for npy_path, array in arrays_to_save.items():
    np.save(npy_path, array)

# Persist the per-split rating tables as pickles.
frames_to_pickle = {
    'train_df.pkl': train_df,
    'valid_df.pkl': valid_df,
    'test_df.pkl': test_df,
    'train_valid_df.pkl': train_valid_df,
}
for pkl_path, frame in frames_to_pickle.items():
    frame.to_pickle(pkl_path)

In [6]:
# Shape check for the test matrix: (number of users, number of businesses).
test_sparse_matrix.shape

(25420, 4312)

In [7]:
# Shape check for the train matrix: (number of users, number of businesses).
train_sparse_matrix.shape

(25420, 4312)

In [8]:
# Shape check for the train + validation matrix: (number of users, number of businesses).
train_valid_sparse_matrix.shape

(25420, 4312)