## Import the Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import scipy as sp
from scipy.stats.stats import pearsonr
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import LogNorm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

import seaborn as sns
sns.set_style()
sns.set_context("talk")
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os 

In [3]:
# Working directory of the kernel; the data file paths below are built relative to it.
current_dir = os.getcwd()

## Read in the Data Files

### Review Data

In [5]:
# Build the path with os.path.join so the notebook also runs on non-Windows
# systems (the previous hard-coded '\\' separator only resolves on Windows).
review_path = os.path.join(current_dir, 'review_and_business_data_cleaned_withdate.csv')

In [8]:
# Load only the review columns that the rest of the notebook uses.
review_columns = ['user_id', 'business_id', 'stars', 'date', 'text']
review_data = pd.read_csv(review_path, usecols=review_columns)

In [9]:
# Preview the first rows of the loaded review table.
review_data.head()

Unnamed: 0,business_id,stars,user_id,text,date
0,oiAlXZPIFm2nBCt0DHLu_Q,3.5,IMguz1Z9dp8HG0UfeLEdEg,I've been coming to this dry cleaner for almos...,2016-06-23 05:50:54
1,oiAlXZPIFm2nBCt0DHLu_Q,3.5,_TAVpa1Y2_5KZ5wWYeX_6g,They lost 2 pairs of my suitpants and told me ...,2011-10-27 23:35:10
2,oiAlXZPIFm2nBCt0DHLu_Q,3.5,45R6BBybzwDuJaL08d1myQ,I have been going to this dry cleaning since I...,2013-07-13 03:25:04
3,oiAlXZPIFm2nBCt0DHLu_Q,3.5,lYvUtZWr1gGv4vlwNcJXDQ,The staff is very nice and friendly. However ...,2016-05-07 16:01:55
4,oiAlXZPIFm2nBCt0DHLu_Q,3.5,b0eCjnqua4C3f0OLDhZHxA,DO NOT GO HERE. They are great until there i...,2013-01-28 10:40:43


In [11]:
# (rows, columns) of the review table before any rows are dropped.
review_data.shape

(679950, 5)

### Business data

In [17]:
# Build the path with os.path.join so the notebook also runs on non-Windows
# systems (the previous hard-coded '\\' separators only resolve on Windows).
business_path = os.path.join(current_dir, 'yelp_dataset', 'yelp_academic_dataset_business.json')

In [24]:
# The business file is read as JSON Lines (one JSON object per line).
business_data = pd.read_json(path_or_buf=business_path, lines=True)

In [25]:
# Drop the columns that are not used downstream. Rebinding (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# the in-place version raised KeyError on a second execution because the
# columns were already gone.
business_data = business_data.drop(
    columns=['attributes', 'hours', 'postal_code'],
    errors='ignore',
)

In [26]:
# Preview the business table after the unused columns were dropped.
business_data.head()

Unnamed: 0,business_id,name,address,city,state,latitude,longitude,stars,review_count,is_open,categories
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,35.462724,-80.852612,3.5,36,1,"Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh..."
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,33.569404,-111.890264,5.0,4,1,"Health & Medical, Fitness & Instruction, Yoga,..."
2,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,45.479984,-73.58007,5.0,5,1,"Pets, Pet Services, Pet Groomers"
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,1015 Sharp Cir,North Las Vegas,NV,36.219728,-115.127725,2.5,3,0,"Hardware Stores, Home Services, Building Suppl..."
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,33.428065,-111.726648,4.5,26,1,"Home Services, Plumbing, Electricians, Handyma..."


In [32]:
# (rows, columns) of the business table.
business_data.shape

(209393, 11)

### User data

In [27]:
# Build the path with os.path.join so the notebook also runs on non-Windows
# systems (the previous hard-coded '\\' separator only resolves on Windows).
user_path = os.path.join(current_dir, 'unique_user_df.csv')

In [28]:
# Load the user table from CSV.
user_data = pd.read_csv(filepath_or_buffer=user_path)

In [30]:
# Keep only the user attributes used later in the notebook.
user_columns = [
    'user_id', 'name', 'review_count', 'yelping_since',
    'useful', 'funny', 'cool', 'elite', 'fans', 'average_stars',
]
user_data = user_data.loc[:, user_columns]

In [31]:
# Preview the user table after the column selection.
user_data.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,fans,average_stars
0,ntlvfPzc8eglqvk92iDIAw,Rafael,553,2007-07-06 03:27:11,628,225,227,,14,3.57
1,FOBRPlBHa3WPHFB5qYDlVg,Michelle,564,2008-04-28 01:29:25,790,316,400,200820092010201120122013,27,3.84
2,zZUnPeh2hEp0WydbAZEOOg,Martin,60,2008-08-28 23:40:05,151,125,103,2010,5,3.44
3,QaELAmRcDc5TfJEylaaP8g,John,206,2008-09-20 00:08:14,233,160,84,2009,6,3.08
4,f4_MRNHvN-yRn7EA8YWRxg,Jennifer,822,2011-01-17 00:18:23,4127,2446,2878,20112012201320142015201620172018,137,3.63


In [38]:
# (rows, columns) of the user table.
user_data.shape

(285257, 10)

# Preprocessing Data

Select only restaurants and users from Las Vegas to limit the scope of the project.

In [65]:
# Remove reviews with any missing field. Rebinding instead of inplace=True
# avoids silently mutating the frame that earlier cells already displayed.
review_data = review_data.dropna(axis=0)

In [66]:
# (rows, columns) of the review table after rows with missing values were removed.
review_data.shape

(679947, 5)

In [67]:
# Per-column check that no missing values remain (every entry should be False).
review_data.isna().any()

business_id    False
stars          False
user_id        False
text           False
date           False
dtype: bool

In [69]:
# Separate (user, date) frame used only to locate each user's latest review.
ratings_user_date = review_data[['user_id', 'date']].copy()

In [70]:
# Parse the timestamps so the "latest review" comparison is chronological.
# Item assignment is used instead of attribute assignment (`frame.date = ...`),
# which pandas discourages for setting columns.
ratings_user_date['date'] = pd.to_datetime(ratings_user_date['date'])

In [73]:
# Boolean mask marking, for every user, the review(s) carrying that user's
# most recent date; those rows become the holdout set. The string alias 'max'
# replaces the builtin `max`, whose use in transform() is deprecated in
# recent pandas releases.
latest_review_date = ratings_user_date.groupby('user_id', sort=False)['date'].transform('max')
index_holdout = latest_review_date == ratings_user_date['date']

In [75]:
# Split by the mask: each user's latest review(s) go to the holdout set,
# everything else stays available for training / validation.
ratings_holdout_ = review_data.loc[index_holdout]
ratings_traincv_ = review_data.loc[~index_holdout]

In [79]:
# Repeat the "latest review per user" split on the remaining rows to carve out
# the validation set. Dates are parsed first so the comparison is chronological,
# matching the holdout split (the raw column still holds strings here), and the
# string alias 'max' replaces the deprecated builtin `max`.
ratings_user_date = ratings_traincv_.loc[:, ['user_id', 'date']]
ratings_user_date['date'] = pd.to_datetime(ratings_user_date['date'])
index_holdout = (
    ratings_user_date.groupby('user_id', sort=False)['date'].transform('max')
    == ratings_user_date['date']
)
ratings_cv_ = ratings_traincv_[index_holdout]
ratings_train_ = ratings_traincv_[~index_holdout]

In [81]:
def process(df):
    """Add calendar features to a review frame and join user and business data.

    The ``date`` column of ``df`` is parsed to datetimes and ``week_day``,
    ``month`` and ``hour`` columns are added to ``df`` itself (the argument is
    modified, so callers pass a copy). The result is then merged with the
    module-level ``user_data`` on ``user_id`` and ``business_data`` on
    ``business_id`` using pandas' default merge, and returned as a new frame.
    """
    timestamps = pd.to_datetime(df['date'])
    df['date'] = timestamps
    df['week_day'] = timestamps.dt.weekday
    df['month'] = timestamps.dt.month
    df['hour'] = timestamps.dt.hour

    enriched = df.merge(user_data, on='user_id').merge(business_data, on='business_id')

    # Strip the 'business_' prefix from location columns when such columns exist;
    # names that are absent are left untouched by rename().
    location_names = {
        'business_longitude': 'longitude',
        'business_latitude': 'latitude',
        'business_state': 'state',
        'business_city': 'city',
        'business_address': 'address',
    }
    return enriched.rename(columns=location_names)

# Enrich each split; copies are passed because process() writes columns onto
# the frame it receives.
ratings_train, ratings_holdout, ratings_val = (
    process(split.copy())
    for split in (ratings_train_, ratings_holdout_, ratings_cv_)
)

In [85]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the same frames in the same order and keeps the original indices.
ratings_train_final = pd.concat([ratings_train, ratings_val])
ratings_entire_df = pd.concat([ratings_train, ratings_val, ratings_holdout])

In [109]:
# `city` was not assigned in any earlier cell, so this cell raised NameError on
# a fresh kernel. The section introduction scopes the project to Las Vegas, so
# the filter value is pinned here.
# NOTE(review): confirm 'Las Vegas' is the value that was live in the kernel
# when this cell was last executed.
city = 'Las Vegas'

# Holdout reviews in the target city whose rating exceeds the user's average.
in_city = ratings_holdout['city'] == city
above_user_average = ratings_holdout['stars_x'] > ratings_holdout['average_stars']
rating_above_avg = ratings_holdout[in_city & above_user_average]

In [111]:
# Take an explicit copy: a column is added to this frame in the next cell, and
# writing to a slice of `rating_above_avg` triggers SettingWithCopyWarning.
predict_df = rating_above_avg[['user_id', 'city', 'state']].copy()

In [112]:
# Constant placeholder prediction for every row. assign() returns a new frame,
# so nothing is written into a slice of `rating_above_avg` and the cell can be
# re-run safely.
predict_df = predict_df.assign(predictions=2.5)

### Baseline Prediction

In [114]:
# NOTE(review): the `GridSearchCV` imported here rebinds the name imported from
# sklearn.model_selection in the first cell; once this cell has run,
# `GridSearchCV` refers to surprise's implementation.
from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise import Dataset
from surprise import BaselineOnly

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [116]:
# NOTE(review): this repeats the `process` definition from the preprocessing
# section verbatim; it is kept so this section can be run on its own.
def process(df):
    """Add calendar features to a review frame and join user and business data.

    The ``date`` column of ``df`` is parsed to datetimes and ``week_day``,
    ``month`` and ``hour`` columns are added to ``df`` itself (the argument is
    modified, so callers pass a copy). The result is then merged with the
    module-level ``user_data`` on ``user_id`` and ``business_data`` on
    ``business_id`` using pandas' default merge, and returned as a new frame.
    """
    timestamps = pd.to_datetime(df['date'])
    df['date'] = timestamps
    df['week_day'] = timestamps.dt.weekday
    df['month'] = timestamps.dt.month
    df['hour'] = timestamps.dt.hour

    enriched = df.merge(user_data, on='user_id').merge(business_data, on='business_id')

    # Strip the 'business_' prefix from location columns when such columns exist;
    # names that are absent are left untouched by rename().
    location_names = {
        'business_longitude': 'longitude',
        'business_latitude': 'latitude',
        'business_state': 'state',
        'business_city': 'city',
        'business_address': 'address',
    }
    return enriched.rename(columns=location_names)

# Enrich each split; copies are passed because process() writes columns onto
# the frame it receives.
ratings_train, ratings_test, ratings_val = (
    process(split.copy())
    for split in (ratings_train_, ratings_holdout_, ratings_cv_)
)

In [117]:
# Keep only test / validation reviews whose business also appears in training.
train_business_ids = ratings_train['business_id']
ratings_test = ratings_test[ratings_test['business_id'].isin(train_business_ids)]
ratings_val = ratings_val[ratings_val['business_id'].isin(train_business_ids)]

In [120]:
# List the merged column names; columns present in more than one source table
# carry the `_x` / `_y` suffixes added by the merges in process().
ratings_train.columns

Index(['business_id', 'stars_x', 'user_id', 'text', 'date', 'week_day',
       'month', 'hour', 'name_x', 'review_count_x', 'yelping_since', 'useful',
       'funny', 'cool', 'elite', 'fans', 'average_stars', 'name_y', 'address',
       'city', 'state', 'latitude', 'longitude', 'stars_y', 'review_count_y',
       'is_open', 'categories'],
      dtype='object')

In [121]:
# Reduce each split to the (user, item, rating) triple expected by surprise.
# The test frame is now built from `ratings_test`, the holdout split that was
# filtered to businesses seen in training, instead of the unfiltered
# `ratings_holdout`; this matches how the validation frame is built.
# NOTE(review): in the review preview every review of the same business shows
# the same half-star `stars` value, so `stars_x` may be the business average
# rather than the per-review rating; confirm before interpreting the RMSE.
surprise_names = {'user_id': 'userID', 'business_id': 'itemID', 'stars_x': 'rating'}
source_columns = list(surprise_names)

trainset = ratings_train.loc[:, source_columns].rename(columns=surprise_names)
valset = ratings_val.loc[:, source_columns].rename(columns=surprise_names)
testset = ratings_test.loc[:, source_columns].rename(columns=surprise_names)

In [122]:
# Wrap the three frames as surprise datasets on a 0-5 rating scale.
reader = Reader(rating_scale=(0.0, 5.0))
rating_fields = ['userID', 'itemID', 'rating']
train_data = Dataset.load_from_df(trainset[rating_fields], reader)
val_data = Dataset.load_from_df(valset[rating_fields], reader)
test_data = Dataset.load_from_df(testset[rating_fields], reader)

# The training split stays a Trainset for fitting; validation and test are
# turned into testsets (lists of ratings) for scoring.
train_sr = train_data.build_full_trainset()

val_sr_before = val_data.build_full_trainset()
val_sr = val_sr_before.build_testset()

test_sr_before = test_data.build_full_trainset()
test_sr = test_sr_before.build_testset()

In [123]:
# ALS bias baseline with 3 epochs: fit on the training split, report RMSE on
# the validation split.
bsl_options = dict(method='als', n_epochs=3)
bias_baseline = BaselineOnly(bsl_options=bsl_options)
bias_baseline.fit(train_sr)
predictions = bias_baseline.test(val_sr)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.3567


0.356690901643785

In [124]:

# ALS bias baseline with 5 epochs: fit on the training split, report RMSE on
# the validation split.
bsl_options = dict(method='als', n_epochs=5)
bias_baseline = BaselineOnly(bsl_options=bsl_options)
bias_baseline.fit(train_sr)
predictions = bias_baseline.test(val_sr)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.3568


0.3567896083494529

In [125]:
# ALS bias baseline with 9 epochs: fit on the training split, report RMSE on
# the validation split.
bsl_options = dict(method='als', n_epochs=9)
bias_baseline = BaselineOnly(bsl_options=bsl_options)
bias_baseline.fit(train_sr)
predictions = bias_baseline.test(val_sr)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.3568


0.35679632003693174

In [127]:
# Number of validation predictions produced by the last baseline run.
len(predictions)

93348

In [1]:
# Show only a few predictions; displaying the full list (tens of thousands of
# entries) floods the notebook output. This cell needs the baseline cell above
# to have been run in the same kernel session.
predictions[:5]

NameError: name 'predictions' is not defined