In [1]:
import matplotlib.pyplot as plt

import igraph as ig
import networkx as nx
import pandas as pd
import numpy as np
import re
import swifter

from sklearn import preprocessing, model_selection
from sklearn.model_selection import cross_validate
import lightgbm as lgbm

from settings import file_names

# Datasets

In [2]:
df_reviews = pd.read_csv(file_names['toronto_reviews_without_text'])
df_users = pd.read_csv(file_names['toronto_users'])
df_biz = pd.read_csv(file_names['toronto_businesses'])

In [3]:
df_tips = pd.read_json(file_names['tip'], encoding='utf-8', lines=True)
df_tips = df_tips.loc[df_tips['user_id'].isin(df_users.user_id.unique()) & df_tips['business_id'].isin(df_biz.business_id.unique())]
print('Number of Tips: ', df_tips.shape[0])
df_tips.head(3)

Number of Tips:  33192


Unnamed: 0,user_id,business_id,text,date,compliment_count
7,CaFbX7topxdLpBJ_SNDfGQ,r49iBfbnfoK7yt4rdsL_7g,Hands down best AYCE in town! Love it!,2016-01-05 06:39:33,0
10,8zXgNCkusOHMwh5Tj_8yCQ,qrSsS0pk7SL67MP5nN8tlg,More parking in the back of the restaurant,2013-12-22 05:10:12,0
13,oIHhfloe5VQamDFDB2dr1Q,wZhB4U003eg2GQ-F7D2D4A,When you receive the bag of seafood tie it up ...,2016-08-12 22:18:07,0


In [4]:
print('Number of reviews: ',df_reviews.shape[0])
print('Ratings describe: ', df_reviews['rating'].describe())
df_reviews.head(3)



Number of reviews:  228816
Ratings describe:  count    228816.000000
mean          3.643093
std           1.139687
min           1.000000
25%           3.000000
50%           4.000000
75%           4.000000
max           5.000000
Name: rating, dtype: float64


Unnamed: 0,user_id,business_id,rating,date
0,TpyOT5E16YASd7EWjLQlrw,AakkkTuGZA2KBodKi2_u8A,1.0,2012-07-16 00:37:14
1,_N7Ndn29bpll_961oPeEfw,y-Iw6dZflNix4BdwIyTNGA,3.0,2014-06-27 21:19:23
2,54kpqrxF9DEPpwa51hO_Bw,jzveTy7ogH7cg9axZ78ENg,4.0,2015-04-01 13:31:35


In [5]:
print('Number of users: ',df_users.shape[0])
df_users.head(3)

Number of users:  18804


Unnamed: 0,user_id,friends
0,gvXtMj3XuPr0xHjgmlmtng,"CfGCj80EdA-xS-mTWlAn4Q, JgD2Rk9K07MkZgG7Nb9YzA..."
1,pU6GoRTcl1rIOi6zMynjog,"SwK2Oo8sjOH0yXYVoyjJwQ, 1LpChUg-0EqRKRKzL4ogYA..."
2,AsYMm_O4H-mwmWbmeACDHw,"w-bVY-dhpvnNs8W3BTTgAA, o_OEO0ES7oywPqENQR0f3A..."


In [6]:
print('Number of businesses: ',df_biz.shape[0])
df_biz.head(3)

Number of businesses:  7965


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories
0,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,1170 Queen Street W,Toronto,ON,M6J 1J5,43.642889,-79.425429,3.0,57,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas..."
1,SP_YXIEwkFPPl_9anCYmpQ,The Steady Cafe & Bar,1051 Bloor Street W,Toronto,ON,M6H 1M4,43.660494,-79.432099,3.5,29,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, Nightlife, Breakfast & Brunch, Ve..."
2,mlHC2XcU9Bows6cnYEmRgg,Mad Crush Wine Bar,582 College Street,Toronto,ON,M6G 1B3,43.65542,-79.413352,4.0,9,"{'Alcohol': ""u'full_bar'"", 'Caters': 'False', ...","Restaurants, Breakfast & Brunch, Bars, Modern ..."


# Collaborative Filtering - Rating Prediction

In [7]:
from surprise import SVD, SVDpp, CoClustering, SlopeOne, KNNBasic, KNNWithZScore

from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate as surprise_cv, GridSearchCV as surprise_grid_search_cv, train_test_split as surprise_test_train_split

df_surprise = df_reviews[['business_id', 'user_id', 'rating']]
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df_surprise, reader)
algo = SVD()

# surprise_cv(algo, data, measures=['RMSE', 'MAE', 'fcp'], cv=3, verbose=3, n_jobs=3)

In [8]:
trainset, testset = surprise_test_train_split(data, test_size=.25)
algo.fit(trainset)
predictions = algo.test(testset)
print(accuracy.fcp(predictions, verbose=True), accuracy.mae(predictions, verbose=True))

FCP:  0.5787
MAE:  0.8202
0.5787407032750097 0.8202047652764817


In [9]:
predictions[:2]

[Prediction(uid='aTvzqD5Kot4vp3vOnDyrTw', iid='FWvAdvaQx02yZKdvr8HpLQ', r_ui=3.0, est=4.296484557991658, details={'was_impossible': False}),
 Prediction(uid='t6FCO31z5uYrCM26jCaIaA', iid='TFxeEvpjMNQ3AWL49iMwtA', r_ui=4.0, est=3.973022838440857, details={'was_impossible': False})]

# Baseline Binary Prediction - Collaborative Filtering

#### Build input data for recommender:

In [10]:
from networkx.algorithms import bipartite
from graph import *

g = make_user_business_bipartite_graph(weighted=False, minimum_rating=0)

In [11]:
binary_data = bipartite.biadjacency_matrix(g, df_users.user_id.unique()).todense()
df_binary = pd.DataFrame(binary_data)
df_binary['user_id'] = df_binary.index.values
df_binary = pd.melt(df_binary, id_vars='user_id')
df_binary.rename({'variable':'business_id', 'value':'has_reviewed'}, axis='columns', inplace=True)
df_binary.head(3)

Unnamed: 0,user_id,business_id,has_reviewed
0,0,0,0
1,1,0,0
2,2,0,0


In [12]:
def negative_sampling(df, ratio_zeros_on_ones : float):
    '''
    creates a dataframe for training of recommender. 
    Only keeps a fractions of non-positive labels, as input matrix is extremely sparse.
    params: 
    df: pd.DataFrame containing user_id, business_id, and binary label has_reviewed
    '''
    df_zeros = df_binary.loc[df_binary['has_reviewed'] == 0]
    df_ones = df_binary.loc[df_binary['has_reviewed'] != 0]
    df_zeros = df_zeros.sample(n=int(df_ones.shape[0]*ratio_zeros_on_ones))
    return pd.concat([df_zeros, df_ones]).sample(frac=1).reset_index(drop=True)

#### Build, train and evaluate recommender:

In [13]:
reader = Reader(rating_scale=(0, 1))

data = negative_sampling(df_binary, 1)
print(data.shape)
data = Dataset.load_from_df(data, reader)

(441456, 3)


In [14]:
pd.read_csv('results_grid_search_3.csv')[['param_n_factors', 'param_n_epochs', 'param_init_mean', 'param_lr_all',
       'param_reg_all','mean_test_mae', 'mean_test_fcp', 'rank_test_mae']]

Unnamed: 0,param_n_factors,param_n_epochs,param_init_mean,param_lr_all,param_reg_all,mean_test_mae,mean_test_fcp,rank_test_mae
0,15,20,0.5,0.007,0.02,0.286353,0.70754,2
1,15,30,0.5,0.007,0.02,0.284136,0.707604,1
2,20,20,0.5,0.007,0.02,0.5,0.0,3
3,20,30,0.5,0.007,0.02,0.5,0.0,4
4,25,20,0.5,0.007,0.02,0.5,0.0,5
5,25,30,0.5,0.007,0.02,0.5,0.0,6


In [15]:
algo = SVDpp(n_factors=10, n_epochs=30, init_mean=0.5)

trainset, testset = surprise_test_train_split(data, test_size=.25)
algo.fit(trainset)

predictions = algo.test(testset)
print(accuracy.fcp(predictions, verbose=True), accuracy.mae(predictions, verbose=True))

FCP:  0.6951
MAE:  0.2814
0.6951166612710331 0.28144363315763515


In [16]:
from helpers import get_top_n

top_pred = get_top_n(predictions, n=1)
user_ids = list(top_pred.keys())
business_ids = [top_pred[0][0] for top_pred in list(top_pred.values())]

df_top_pred_test = pd.DataFrame({'user_id':user_ids, 'business_id':business_ids})

df_top_pred_test = pd.merge(df_top_pred_test,df_binary, how='left', on=['user_id','business_id'])
df_top_pred_test.head(3)

Unnamed: 0,user_id,business_id,has_reviewed
0,14879,5132,0
1,350,7689,1
2,2929,348,0


In [17]:
print('precision of top 1 recommender: {}'.format(df_top_pred_test['has_reviewed'].mean()))
print('Note that a random recommender would have a precision of : {}'.format(df_binary['has_reviewed'].mean()))

precision of top 1 recommender: 0.5841525332181052
Note that a random recommender would have a precision of : 0.0014938093079409033


# Binary Prediction - Using Graph Analysis
- Community detection
    - User --[Friends]-->User
    - User --[Reviewed]-->Restaurant
    - User --[Friends]-->User & User --[Reviewed]-->Restaurant
    - User --[distance with reviewed restaurant using geolocalisation]-->Reastaurant
- Centrality measures on restaurants


# Friends Graph Clustering

In [18]:
from graph import *

g_friends = make_friends_graph()

In [28]:
import itertools
from networkx.algorithms.community.centrality import girvan_newman
gn_friends = girvan_newman(g_friends)

In [29]:
k = 10
partitions_gn = {}
limited = itertools.takewhile(lambda c: len(c) <= k, gn_friends)
for communities in limited:
    partitions_gn[k] = tuple(sorted(c) for c in communities)

KeyboardInterrupt: 

In [None]:
pd.DataFrame(partitions_gn).to_csv('friends_partitions_girven_newman.csv')

In [None]:
for k in exp(range(5)):
    print k 
k = 10
limited = itertools.takewhile(lambda c: len(c) <= k, partitions_gn)

x = np.array()
y = np.array()
for part in sorted(partitions_gn[10], key = len, reverse=True):
    x.append(i)
    y.append(len(part))
    
y = y / sum(y)
    
plt.scatter(x=x, y=y)
plt.show()

In [None]:
from graph import *
g_friends = make_friends_graph()
g_bipartite = make_user_business_bipartite_graph(weighted=False, minimum_rating=4)
g_full = make_frienships_and_reviews_graph(weight_ratio=1, minimum_rating=0)

In [None]:

for i, part in enumerate(sorted([partition for partition in partitions_gn][:10],reverse=True, key=len)):


In [None]:
users = []
communities = []
for i, part in ennumerate(sorted(partitions_gn[10],reverse=True, key=len)):
    for user in part:
        users.append(user)
        communities.append(i)
        



### Small focus on coordinates

In [None]:
coordinates = df_biz[['latitude', 'longitude']]
coordinates.plot.scatter('latitude','longitude')

In [None]:
from sklearn.cluster import KMeans

clf = KMeans(n_clusters=10)
clf.fit(coordinates)
coordinates_clusters = clf.labels_
plt.scatter(x = coordinates['latitude'], y = coordinates['longitude'], c=coordinates_clusters)