In [35]:
import numpy as np
import pandas as pd
import os
import torch

from pytorch_widedeep.preprocessing import WidePreprocessor, DeepPreprocessor, TextPreprocessor, ImagePreprocessor
from pytorch_widedeep.models import Wide, DeepDense, DeepText, DeepImage, WideDeep
from pytorch_widedeep.initializers import *
from pytorch_widedeep.callbacks import *
from pytorch_widedeep.optim import RAdam

In [36]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.feature_extraction import DictVectorizer
from pyfm import pylibfm
from sklearn.preprocessing import normalize
from scipy import sparse
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error

In [37]:
pd.set_option('display.max_columns', None)

In [38]:
dataset = pd.read_csv("../data/merged_data.csv")

In [39]:
dataset_small=pd.read_csv("../data/merged_data_small.csv")
dataset=dataset[dataset_small.columns]

In [40]:
test_dataset = dataset[dataset.groupby('user_id')['date'].transform('max') == dataset['date']].reset_index(drop=True)
print("test dataset created")
train_dataset = pd.concat([dataset, test_dataset]).drop_duplicates(keep=False).reset_index(drop=True)
print("train dataset created")

test dataset created
train dataset created


In [41]:
already_dummies=list(train_dataset.columns)[34:-11]+list(train_dataset.columns)[-10:]

In [42]:
wide_cols=['is_open']+already_dummies

In [43]:
target_col = 'rating'

In [44]:
target = train_dataset[target_col].values

In [45]:
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
X_wide_train = wide_preprocessor.fit_transform(train_dataset)
X_wide_test = wide_preprocessor.transform(test_dataset)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [46]:
cat_embed_cols = [(c, 16) for c in train_dataset.columns if 'city' in c]

In [47]:
continuous_cols = ['useful',
 'funny',
 'cool',
 'fans',
 'avg_user_rating',
 'compliment_hot',
 'compliment_more',
 'compliment_profile',
 'compliment_cute',
 'compliment_list',
 'compliment_note',
 'compliment_plain',
 'compliment_funny',
 'compliment_writer',
 'compliment_photos',
 'years_elite',
'review_cnt_x','review_cnt_y']

In [48]:
deep_preprocessor = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
X_deep_train = deep_preprocessor.fit_transform(train_dataset)
X_deep_test = deep_preprocessor.fit_transform(test_dataset) 

In [49]:
def deep_function(list1,list2):

    # Linear model
    wide = Wide(wide_dim=X_wide_train.shape[1], output_dim=1)
    # DeepDense: 2 Dense layers
    deepdense = DeepDense(hidden_layers=list1, dropout=list2,batchnorm=True, 
                      deep_column_idx=deep_preprocessor.deep_column_idx,
                      embed_input=deep_preprocessor.embeddings_input,
                      continuous_cols=continuous_cols)
    
    model = WideDeep(wide=wide, deepdense=deepdense)
    model.compile(method='regression')
    model.fit(X_wide=X_wide_train, X_deep=X_deep_train,target=target, n_epochs=5, batch_size=64, val_split=0.3)
    
    return(model)

In [50]:
model_1=deep_function([32,32],[0.5,0.5])
model_2=deep_function([32,64],[0.5,0.5])
model_3=deep_function([64,64],[0.5,0.5])
model_4=deep_function([64,128],[0.5,0.5])
model_5=deep_function([32,32,32],[0.5,0.5,0.5])

  0%|          | 0/4376 [00:00<?, ?it/s]

Training


epoch 1: 100%|██████████| 4376/4376 [00:51<00:00, 84.43it/s, loss=1.21]
valid: 100%|██████████| 1876/1876 [00:17<00:00, 106.23it/s, loss=1.18]
epoch 2: 100%|██████████| 4376/4376 [00:50<00:00, 86.25it/s, loss=1.18] 
valid: 100%|██████████| 1876/1876 [00:13<00:00, 135.49it/s, loss=1.18]
epoch 3: 100%|██████████| 4376/4376 [00:47<00:00, 92.42it/s, loss=1.18] 
valid: 100%|██████████| 1876/1876 [00:13<00:00, 137.56it/s, loss=1.18]
epoch 4: 100%|██████████| 4376/4376 [00:47<00:00, 113.12it/s, loss=1.18]
valid: 100%|██████████| 1876/1876 [00:13<00:00, 148.45it/s, loss=1.18]
epoch 5: 100%|██████████| 4376/4376 [00:47<00:00, 92.47it/s, loss=1.18] 
valid: 100%|██████████| 1876/1876 [00:13<00:00, 139.64it/s, loss=1.18]


In [51]:
def rmse_function(model):
    y_pred=model.predict(X_wide=X_wide_test, X_deep=X_deep_test)
    rmse = sqrt(mean_squared_error(test_dataset[target_col].values, y_pred))
    return(rmse)

In [None]:
l=[]
l.append(rmse_function(model_1))
l.append(rmse_function(model_2))
l.append(rmse_function(model_3))
l.append(rmse_function(model_4))
l.append(rmse_function(model_5))

In [None]:
l2=['32 x 32','32 x 64','64 x 64','64 x 128','32 x 32 x 32']

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
plt.plot(l2,l)
plt.title("Plot of RMSE vs Number of Neurons in each hidden layer")
plt.xlabel("No of Neurons")
plt.ylabel("RMSE")
plt.savefig('../output/' +'rmse_hidden_layers.png')

In [None]:
def mae_function(model):
    y_pred=model.predict(X_wide=X_wide_test, X_deep=X_deep_test)
    mae = sqrt(mean_absolute_error(test_dataset[target_col].values, y_pred))
    return(mae)

In [None]:
l=[]
l.append(mae_function(model_1))
l.append(mae_function(model_2))
l.append(mae_function(model_3))
l.append(mae_function(model_4))
l.append(mae_function(model_5))

In [None]:
plt.plot(l2,l)
plt.title("Plot of Mae vs Number of Neurons in each hidden layer")
plt.xlabel("No of Neurons")
plt.ylabel("MAE")
plt.savefig('../output/' +'mae_hidden_layers.png')

In [52]:
user_counts=dataset['user_id'].value_counts()
less_prolific_users = user_counts.loc[user_counts <= 5].index.tolist()
test_data_less_prolific=test_dataset[(test_dataset.user_id.isin(less_prolific_users))]

In [53]:
X_wide_test_less_prolific = wide_preprocessor.transform(test_data_less_prolific)
X_deep_test_less_prolific = deep_preprocessor.fit_transform(test_data_less_prolific) 

In [54]:
y_pred=model_5.predict(X_wide=X_wide_test_less_prolific, X_deep=X_deep_test_less_prolific)
rmse = sqrt(mean_squared_error(test_data_less_prolific[target_col].values, y_pred))
print("Root-mean-square error for less prolific users = " + str(rmse))

predict: 100%|██████████| 143/143 [00:00<00:00, 192.60it/s]


Root-mean-square error for less prolific users = 1.2515044661420935


In [55]:
business_counts=dataset['business_id'].value_counts()
less_popular_business = business_counts.loc[business_counts <= 100].index.tolist()
test_data_less_popular=test_dataset[(test_dataset.business_id.isin(less_popular_business))]

In [56]:
X_wide_test_less_popular = wide_preprocessor.transform(test_data_less_popular)
X_deep_test_less_popular = deep_preprocessor.fit_transform(test_data_less_popular) 

In [57]:
y_pred=model_5.predict(X_wide=X_wide_test_less_popular, X_deep=X_deep_test_less_popular)
rmse = sqrt(mean_squared_error(test_data_less_popular[target_col].values, y_pred))
print("Root-mean-square error for less popular business = " + str(rmse))

predict: 100%|██████████| 165/165 [00:00<00:00, 211.28it/s]


Root-mean-square error for less popular business = 1.371823946487629


In [70]:
user_data_1=dataset[['user_id','useful',
 'funny',
 'cool',
 'fans',
 'avg_user_rating',
 'compliment_hot',
 'compliment_more',
 'compliment_profile',
 'compliment_cute',
 'compliment_list',
 'compliment_note',
 'compliment_plain',
 'compliment_funny',
 'compliment_writer',
 'compliment_photos',
 'years_elite',
'review_cnt_y']]


user_data_1[user_data_1['user_id']=='3CJUJILq7CLHk_9OrvpvQg']
user_data_1=user_data_1.drop_duplicates(subset=None, keep='first', inplace=False)

business_data_1=dataset[['business_id']+['review_cnt_x']+['city']+['name']+['categories']+wide_cols]
business_data_1=business_data_1.drop_duplicates(subset=None, keep='first', inplace=False)

from itertools import product
combination=list(product(dataset['user_id'].unique()[1:1000], dataset['business_id'].unique()[1:500]))

df_pred_1=pd.DataFrame(data=combination,columns=['user_id','business_id'])


df_pred_1=df_pred_1.merge(user_data_1,how='left',on='user_id')

df_pred_1=df_pred_1.merge(business_data_1,how='left',on='business_id')

X_wide_pred = wide_preprocessor.transform(df_pred_1)
X_deep_pred = deep_preprocessor.fit_transform(df_pred_1) 

y_pred_1=model_5.predict(X_wide=X_wide_pred, X_deep=X_deep_pred)

df_pred_1['predicted_rating']=y_pred_1

predict: 100%|██████████| 7790/7790 [00:35<00:00, 234.49it/s]


In [71]:
df_pred_1=df_pred_1.sort_values(['user_id', 'predicted_rating'], ascending=[True, False])

df_pred_1['RN'] = df_pred_1.sort_values(['user_id','predicted_rating'], ascending=[True,False]) \
             .groupby(['user_id']) \
             .cumcount() + 1

df_pred_2=df_pred_1.loc[df_pred_1['RN'].isin(range(1,11))].reset_index(drop=True)

#User Coverage
def user_coverage(data,k,threshold):
    sum1=0
    l1=[]
    c=0
    for i in range(0,df_pred_2.shape[0]):
        if(df_pred_2['predicted_rating'][i] > 3.5):
            l1.append(1)
            c+=1
        else:
            l1.append(0)
            c+=1
        if(c == 10):
            if (np.sum(l1) > 5):
                sum1+=1
            c=0
            l1=[]
    user_coverage=sum1/df_pred_2['user_id'].nunique()*100
    return(user_coverage)
coverage_1_test=user_coverage(df_pred_2,5,3.5)
print('User coverage on test set:',coverage_1_test)


#Catalogue Coverage
def catalogue_coverage(predicted, catalog):
    predicted_flattened = [p for sublist in predicted for p in sublist]
    unique_predictions = len(set(predicted_flattened))
    prediction_coverage = round(unique_predictions/(len(catalog)* 1.0)*100,2)
    return prediction_coverage
coverage_2=catalogue_coverage(list(df_pred_2['business_id']),list(df_pred_1['business_id'].unique()))
print('Catalogue coverage on test set:',coverage_2)

User coverage on test set: 96.996996996997
Catalogue coverage on test set: 12.83


In [72]:
df_pred_3=df_pred_2.merge(business_data_1[['business_id','name']],how='left',on='business_id')

In [74]:
df_pred_3[['user_id','name_y','categories']].head(10)

Unnamed: 0,user_id,name_y,categories
0,-6sK3CL1g1OP1FMawX2hxA,Fountains of Bellagio,"Public Services & Government, Restaurants, Per..."
1,-6sK3CL1g1OP1FMawX2hxA,Blue Ribbon Brasserie - Las Vegas,"Cocktail Bars, Restaurants, Seafood, Comfort F..."
2,-6sK3CL1g1OP1FMawX2hxA,Eatt Gourmet Bistro,"Fast Food, Restaurants, Sandwiches, Bakeries, ..."
3,-6sK3CL1g1OP1FMawX2hxA,The Corndog Company LV,"Food Trucks, Street Vendors, Food, Hot Dogs, R..."
4,-6sK3CL1g1OP1FMawX2hxA,The Venetian Las Vegas,"Shopping Centers, Resorts, Arts & Entertainmen..."
5,-6sK3CL1g1OP1FMawX2hxA,Azuza Hookah Lounge & Cafe,"Beer, Wine & Spirits, Bars, Ethnic Food, Cafes..."
6,-6sK3CL1g1OP1FMawX2hxA,Tina's Gourmet Sausage House,"Specialty Food, Butcher, Delis, International ..."
7,-6sK3CL1g1OP1FMawX2hxA,Pinball Hall Of Fame,"Performing Arts, Amusement Parks, Museums, Arc..."
8,-6sK3CL1g1OP1FMawX2hxA,The Steakhouse at Treasures,"Cocktail Bars, Food, Wineries, Nightlife, Rest..."
9,-6sK3CL1g1OP1FMawX2hxA,Estiatorio Milos,"Seafood, Greek, Restaurants"


In [77]:
dataset[['user_id','name','categories']].loc[dataset['user_id']=='-6sK3CL1g1OP1FMawX2hxA']

Unnamed: 0,user_id,name,categories
412,-6sK3CL1g1OP1FMawX2hxA,Capriotti's Sandwich Shop,"Restaurants, Delis, Sandwiches"
45976,-6sK3CL1g1OP1FMawX2hxA,Wicked Spoon,"Buffets, Breakfast & Brunch, Restaurants"
167083,-6sK3CL1g1OP1FMawX2hxA,Taco Bell,"Food, Restaurants, Tex-Mex, Mexican, Fast Food"
224919,-6sK3CL1g1OP1FMawX2hxA,Lotus of Siam,"Car Dealers, Nightlife, Automotive, Buffets, W..."
338517,-6sK3CL1g1OP1FMawX2hxA,Delhi Indian Cuisine,"Halal, Restaurants, Buffets, Food, Indian, Foo..."
363125,-6sK3CL1g1OP1FMawX2hxA,Pho Vegas,"Vietnamese, Soup, Restaurants"
404365,-6sK3CL1g1OP1FMawX2hxA,The Buffet,"Beauty & Spas, Food, Event Planning & Services..."
418877,-6sK3CL1g1OP1FMawX2hxA,Mr Sandwich,"Juice Bars & Smoothies, Sandwiches, Vietnamese..."


In [79]:
df_pred_3[['user_id','name_y','categories']].head(10).to_csv('../output/recommended.csv')
dataset[['user_id','name','categories']].loc[dataset['user_id']=='-6sK3CL1g1OP1FMawX2hxA'].to_csv('../output/actual_visited.csv')