### Import Libraries

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import geopy
import folium

#plotting
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import numpy as np
import seaborn as sns
%matplotlib inline

### Loading Data

In [2]:
business = "Yelp_Updated_Data/business.json"
checkin = "Yelp_Updated_Data/checkin.json"
review = "Yelp_Updated_Data/review.json"
tip = "Yelp_Updated_Data/tip.json"
user = "Yelp_Updated_Data/user.json"
business_df = pd.read_json(business,lines=True)
checkin_df = pd.read_json(checkin,lines=True)
review_df = pd.read_json(review,lines=True)
tip_df = pd.read_json(tip,lines=True)
user_df = pd.read_json(user,lines=True)

In [3]:
#business_df.shape
x = business_df[['city','business_id']].groupby('city').count().reset_index()

#business_df('business_id',ascending=False)
x.sort_values('business_id',ascending=False).head(10)

Unnamed: 0,city,business_id
11,Las Vegas,585
20,Phoenix,121
22,Scottsdale,80
25,Tempe,34
26,Toronto,32
9,Henderson,29
5,Charlotte,26
21,Pittsburgh,19
7,Gilbert,14
6,Cleveland,12


### Training and Testing Split - 2016

In [4]:
city_df = pd.DataFrame()
city_df = business_df[['business_id','city']]
city_df = city_df[(city_df['city']=='Phoenix')]
city_df = city_df.drop(['city'],axis=1).drop_duplicates()
city_review = pd.merge(review_df,city_df, on='business_id',how="inner")
review_df = city_review


split_date =  np.datetime64('2016-01-01')
test_review = review_df[(review_df['date']>split_date)]
train_review = review_df[(review_df['date']<=split_date)]

test_user_id = pd.DataFrame()
train_user_id = pd.DataFrame()
test_business_id = pd.DataFrame()
train_business_id = pd.DataFrame()

test_user_id['user_id'] = test_review['user_id'].drop_duplicates()
train_user_id['user_id'] = train_review['user_id'].drop_duplicates()

test_business_id['business_id'] = test_review['business_id'].drop_duplicates()
train_business_id['business_id'] =  train_review['business_id'].drop_duplicates()

train_business = pd.merge(train_business_id,business_df, on='business_id',how="inner")
test_business = pd.merge(test_business_id,business_df, on='business_id',how="inner")

train_user = pd.merge(train_user_id,user_df, on='user_id',how="inner")
test_user = pd.merge(test_user_id,user_df, on='user_id',how="inner")



#using just training data
user_df = train_user
business_df = train_business
review_df = train_review

### Adding Yelp Age of every user

In [5]:
user_df['years'] = (pd.to_datetime('now') - pd.to_datetime(user_df['yelping_since'])) / np.timedelta64(1, 'Y')
user_df.head(2)

Unnamed: 0,user_id,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,...,cool,elite,fans,friends,funny,name,review_count,useful,yelping_since,years
0,8EtDC-WjdnjlBIbDugaC7w,3.0,0,0,0,0,0,0,0,0,...,0,,0,vh36RIa20MgpJZ_0W6eUWg,0,Jarom,13,2,2010-12-03,8.000054
1,f4GXxWC8rJRedtaBGSuy3A,4.0,0,0,0,0,0,0,0,0,...,0,,0,,0,Sharyl,1,0,2010-09-19,8.205398


### Adding Average Review Length of every user

In [7]:
#avg review length
def review_len(x):
    return len(x.split(' '))
review_df['review_len'] = review_df['text'].apply(review_len)
user_id_review = review_df[['review_len','user_id']].groupby('user_id').mean()

#join
user_df = pd.merge(user_df, user_id_review, on='user_id',how="inner")
user_df = user_df.rename(columns={'review_len': 'avg_review_len'})
user_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,user_id,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,...,fans,friends,funny,name,review_count,useful,yelping_since,years,avg_review_len,avg_review_len.1
0,8EtDC-WjdnjlBIbDugaC7w,3.0,0,0,0,0,0,0,0,0,...,0,vh36RIa20MgpJZ_0W6eUWg,0,Jarom,13,2,2010-12-03,8.000054,21.0,21.0
1,f4GXxWC8rJRedtaBGSuy3A,4.0,0,0,0,0,0,0,0,0,...,0,,0,Sharyl,1,0,2010-09-19,8.205398,58.0,58.0
2,d85a_py0ByqPWmRF_9Fgxg,3.67,0,0,0,0,0,0,0,0,...,0,,0,Jackie,3,0,2013-03-09,5.735805,40.0,40.0
3,-sSOPoIMX91uMm3Il6SGMQ,4.27,1,0,1,0,0,0,0,0,...,2,"MOM_bHxS7wmNE2hy_jgGVQ, eU77PxGTK58WVRWoXULlEw...",0,Dave,20,19,2012-12-11,5.976741,64.666667,64.666667
4,RaCSpqJUFU9GeAChmONY4w,3.25,0,0,0,0,0,0,0,0,...,0,"TFAoEidDa-_u0XvJEfb3hg, Rpmeia2w6E_JHMTdh4eHlg...",0,Shelly,4,0,2010-08-17,8.295748,249.0,249.0


### Adding Number of Friends of every user

In [8]:
def count_friends(x):
    return len(x.split(","))
user_df['num_friends'] = user_df['friends'].apply(count_friends)
user_df.head(10)

Unnamed: 0,user_id,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,...,friends,funny,name,review_count,useful,yelping_since,years,avg_review_len,avg_review_len.1,num_friends
0,8EtDC-WjdnjlBIbDugaC7w,3.0,0,0,0,0,0,0,0,0,...,vh36RIa20MgpJZ_0W6eUWg,0,Jarom,13,2,2010-12-03,8.000054,21.0,21.0,1
1,f4GXxWC8rJRedtaBGSuy3A,4.0,0,0,0,0,0,0,0,0,...,,0,Sharyl,1,0,2010-09-19,8.205398,58.0,58.0,1
2,d85a_py0ByqPWmRF_9Fgxg,3.67,0,0,0,0,0,0,0,0,...,,0,Jackie,3,0,2013-03-09,5.735805,40.0,40.0,1
3,-sSOPoIMX91uMm3Il6SGMQ,4.27,1,0,1,0,0,0,0,0,...,"MOM_bHxS7wmNE2hy_jgGVQ, eU77PxGTK58WVRWoXULlEw...",0,Dave,20,19,2012-12-11,5.976741,64.666667,64.666667,20
4,RaCSpqJUFU9GeAChmONY4w,3.25,0,0,0,0,0,0,0,0,...,"TFAoEidDa-_u0XvJEfb3hg, Rpmeia2w6E_JHMTdh4eHlg...",0,Shelly,4,0,2010-08-17,8.295748,249.0,249.0,6
5,0chzcy7PU1VLkDIU_MV4LQ,3.94,0,0,0,0,0,0,0,0,...,8VYOB9BAHnHB8HmhDB0QEQ,0,grace,13,2,2011-04-02,7.671506,20.0,20.0,1
6,EILgTvvIAquMcIFzjAtWbQ,5.0,0,0,0,0,0,0,1,0,...,"wMRkYiG_kDg243-7pMOgRg, Qsl1QiEs24U2NS8K7iNoNA...",1,Daniel,5,11,2008-11-20,10.034319,69.0,69.0,35
7,OYGNSKITvfg5zhwXfzjZPQ,4.5,0,0,0,0,0,0,0,0,...,,0,Travis,2,0,2014-10-12,4.142344,23.0,23.0,1
8,8XKJ2LLwUySN_eAi_9u6TA,5.0,0,0,0,0,0,0,0,0,...,,0,Marcus,2,0,2012-07-27,6.351834,26.0,26.0,1
9,2nk5zUt-mSxkQhBTGcMkeQ,4.24,0,0,0,0,0,0,1,0,...,"5517Yj-6hxE2IZl0xbRYYA, u9IyeSlfU9SwdWOymnFbbA...",3,Shivaas,17,3,2012-06-11,6.477778,126.0,126.0,5


### (Business, Category) Sets

In [10]:
category_explode = business_df[['categories','business_id']]
#category_explode['categories']=category_explode['categories'].apply(lambda x:x.split(","))
def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df

categories_business_df =splitDataFrameList(category_explode,'categories',',')


In [11]:
#categories_business_df.head(10)

df_category_top = categories_business_df[['business_id','categories']].groupby('categories').count().reset_index()
df_category_top = df_category_top.rename(columns={'business_id': 'total_count_c'})
df_category_top = df_category_top.sort_values(by=['total_count_c'],ascending=False)
#df_category_top[df_category_top['categories'].str.contains('American', flags = re.IGNORECASE)]
df_category_top.head(20)

Unnamed: 0,categories,total_count_c
60,Restaurants,88
54,Nightlife,35
1,American (New),31
7,Bars,30
11,Breakfast & Brunch,23
116,Restaurants,22
31,Food,19
52,Mexican,18
62,Sandwiches,15
63,Seafood,13


### Fetching all elites of a specific category

In [12]:
#assumption: In each restaurants category list; we have non repeating categories.
#categories_groupby = categories_business_df.groupby("categories")
import re
def elite_split(x):
    if(x == 'None'):
        return 0
    return 1

def get_category_data(category,df):
    #filtered_df = pd.DataFrame()
    
    filtered_df = df[df['categories'].str.contains(category, flags = re.IGNORECASE)]
    
    restaurant_category = pd.DataFrame()
    restaurant_category['business_id'] = filtered_df['business_id'].drop_duplicates()
    review_category = pd.merge(restaurant_category,review_df, on='business_id',how="inner")
    
    user_category = pd.DataFrame()
    user_category['user_id'] = review_category['user_id'].drop_duplicates()
    #user_category = pd.DataFrame()
    user_category_table = pd.merge(user_category,user_df, on='user_id',how="inner")
    user_category_table = user_category_table[['user_id','years','num_friends','elite']]
    
    df_avg_review = review_category[['review_len','user_id']].groupby('user_id').mean().reset_index()
    df_avg_review = df_avg_review.rename(columns={'review_len': 'avg_review_len_c'})
    
    df_tot_review = review_category[['review_id','user_id']].groupby('user_id').count().reset_index()
    df_tot_review = df_tot_review.rename(columns={'review_id': 'total_count_c'})
    
    df_avg_stars = review_category[['stars','user_id']].groupby('user_id').mean().reset_index()
    df_avg_stars = df_avg_stars.rename(columns={'stars': 'avg_rating_c'})
    
    df_funny = review_category[['funny','user_id']].groupby('user_id').sum().reset_index()
    df_cool = review_category[['cool','user_id']].groupby('user_id').sum().reset_index()
    df_useful = review_category[['useful','user_id']].groupby('user_id').sum().reset_index()
    
    df_tot_business = review_category[['business_id','user_id']].groupby('user_id').count().reset_index()
    df_tot_business = df_tot_business.rename(columns={'business_id': 'total_business_c'})   

    user_c = [df_avg_review,df_avg_stars,df_tot_review,df_funny,df_cool,df_useful,df_tot_business]
    final_df = user_c[0]
    for df_ in user_c[1:]:
        final_df = final_df.merge(df_, on='user_id')
    #final_df['imp_votes'] = df.apply(lambda row: row.funny + row.cool + row.useful, axis=1)
    #final_df = final_df.drop(['funny', 'useful','cool'], axis=1)
    final_df = pd.merge(user_category_table,final_df, on='user_id',how="inner")
    
    final_df['elite'] = final_df['elite'].apply(elite_split)
    return final_df
    
    
    

### Focusing on Specific Category

In [14]:
category = "Mexican"
res = get_category_data(category,categories_business_df)   
res.head(10)

Unnamed: 0,user_id,years,num_friends,elite,avg_review_len_c,avg_rating_c,total_count_c,funny,cool,useful,total_business_c
0,fdkfe69RTWTYws9oEcjRAQ,4.36959,8,0,39.0,5.0,1,0,0,0,1
1,OyDTDmQpFbXP0wVxjeVZMQ,3.909621,15,0,25.0,5.0,1,1,1,1,1
2,X6FR50d8SjB2d0JLfP1XKQ,5.83437,1,0,46.0,4.0,1,0,0,0,1
3,5ocTtFOxV_Roiaa4abWQlQ,7.41688,27,0,113.0,5.0,1,0,0,1,1
4,9gR51SDAgU0Gn8vwKT6M0Q,3.216931,36,0,51.0,5.0,1,0,0,0,1
5,4EoWEbMcLdZN6ugen-MlbQ,8.925467,14,0,29.0,5.0,1,0,0,0,1
6,SL7eCwzWXGQZJPsC61uB6Q,6.225891,1,0,69.0,5.0,1,0,0,0,1
7,s8WFGpMSkp4-ftPVwd6Xow,4.659808,1,0,79.0,5.0,1,0,0,1,1
8,TMaBWSv75DEuXyZlGJj6cg,7.477114,10,0,69.75,5.0,4,0,1,2,4
9,-A0Fjt-z3RY6k4xYaiIfLw,6.425758,1,0,15.0,4.0,1,0,0,0,1


### Classify elite individuals of specific category

In [15]:
X = res[['years', 'num_friends', 'avg_review_len_c',
       'avg_rating_c', 'total_count_c', 'funny', 'cool', 'useful',
       'total_business_c']]
y=res['elite']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier(random_state=12,class_weight="balanced")
c=clf.fit(X_train,y_train)
result = c.predict(X_test)
result
accuracy_score(result,y_test)

0.8229291716686674

### Fetching all elites

In [17]:
res = get_category_data(category,categories_business_df)   
category_elite = res[(res['elite']==1)]
category_elite_id = pd.DataFrame()
category_elite_id['user_id'] = category_elite['user_id']

#### Generating (user,restaurant) combinations by recommending all restuarants of elites to his friends

In [18]:
#yet to check positive reviews and within range location 
selected_users = pd.merge(category_elite_id,user_df, on='user_id',how="inner").drop_duplicates()
df= categories_business_df


filtered_df = df[df['categories'].str.contains(category, flags = re.IGNORECASE)]
restaurant_category = pd.DataFrame()
restaurant_category['business_id'] = filtered_df['business_id'].drop_duplicates()

review_category = pd.merge(restaurant_category,review_df, on='business_id',how="inner")  


selected_businesses = pd.merge(category_elite_id,review_category, on='user_id',how="inner")
selected_businesses[['user_id','business_id']]

  


import itertools 
final_set = set()
result = selected_businesses[['user_id','business_id']].groupby('user_id').agg({'business_id':lambda x: list(x)}).reset_index()
result
for index,row in result.iterrows():
    user_id = row['user_id']
    business_list = row['business_id']
    friend_list = user_df[(user_df['user_id']==user_id)].drop_duplicates().reset_index()['friends'][0].split(",")
    #print(user_id)
    #print(len(friend_list.split(",")))
    prod = itertools.product(business_list,friend_list)
    for tup in prod:
        final_set.add(tup)
    
    


### Removing combinations that appeared in Training

In [19]:
#final_set
df1 = pd.DataFrame(list(final_set), columns=['business_id', 'user_id']).drop_duplicates()
df2 = review_df[['business_id','user_id']].drop_duplicates()


result = pd.concat([df2,df2],axis=0)
result = pd.concat([result,df1],axis=0)
result = result.drop_duplicates(keep=False)
result




Unnamed: 0,business_id,user_id
0,JfHXzulF6yIKgA22YYPedw,tF0eV3uwRKDPvUZ52yumxQ
1,cHdJXLlKNWixBXpDwEGb_A,oOhl3jYCukuTI0d7Z8BqKw
2,7Q2cYSl5NBYytJzgjX0oKw,3NNo-cleFhrA_pRtKmKsDw
3,3C5Z9homtzkWHouH2BHXYQ,1iLlVGtL97sh6pviwAio5g
4,JfHXzulF6yIKgA22YYPedw,-eeWTHEBY27Whw6Yb6j_Ow
5,jsuUmIEefPjV__ads62Z5w,zYHMCzXkbORj_b7WhDtjGA
6,JfHXzulF6yIKgA22YYPedw,Q8quUdHYLQ0Nqho91Ebfng
7,89uU51kOiQXbJHVA3C6XMQ,crgWPc35LxjJccLFVQsQig
8,frCxZS7lPhEnQRJ3UY6m7A,zOxFYwqq8Iwhsqm_jm016A
9,jsuUmIEefPjV__ads62Z5w,GqzV1cMocEv-a0JkoyT30w


### Estimating User Location- Clustering all restaurants he has been to!

In [20]:
#location
ta = business_df[["business_id","latitude","longitude"]]
tb = review_df[["business_id","user_id"]]
user_location = pd.merge(ta,tb, on='business_id',how="inner")

user_loc = user_location[["user_id","latitude","longitude"]]
user_loc_map = dict()
for index,row in user_loc.iterrows():
    if row['user_id'] not in user_loc_map:
        user_loc_map[row['user_id']]=[]
        user_loc_map[row['user_id']].append([row['latitude'],row['longitude']])
    else:
        user_loc_map[row['user_id']].append([row['latitude'],row['longitude']])
        



In [21]:
from sklearn.cluster import KMeans
def get_location(X):
    if len(X)==1:
        n_c=1
    else:
        n_c=2
    kmeans = KMeans(n_clusters=n_c, random_state=0).fit(X)
    l=len(kmeans.labels_)
    ones=sum(kmeans.labels_)
    if ones > l/2:
        return kmeans.cluster_centers_[1]
    else:
        return kmeans.cluster_centers_[0]
   

answer= dict()
for key,value in user_loc_map.items():
        answer[key] = get_location(value)
    
#answer    


### Computing distance between user and restaurant

In [22]:
user_location = answer
user_restaurant_combination = result

#user_restaurant_combination
loc_list=[]
for key,value in user_location.items():
    list_part = []
    list_part.append(key)
    list_part.append(value[0])
    list_part.append(value[1])
    loc_list.append(list_part)
    

user_lat_long = pd.DataFrame(loc_list, columns=['user_id', 'u_lat','u_long']).drop_duplicates()
user_business_lat_long = pd.merge(user_lat_long,user_restaurant_combination, how="inner", on="user_id")
#user_business_lat_long

user_business_lat_long_ = pd.merge(user_business_lat_long,business_df[['business_id','latitude','longitude']], how="inner", on="business_id")
user_business_lat_long_

Unnamed: 0,user_id,u_lat,u_long,business_id,latitude,longitude
0,9w4vI3LRTA4JdJyzOxeEMw,33.522335,-112.071517,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835
1,FtswtnMxahDOBwCFt1un0g,33.500363,-112.034208,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835
2,UmXzwFM2OkZHWJ5cJZlqkg,33.505279,-112.068897,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835
3,asL1d04xS3y8YQtU5ota-g,33.499068,-112.028573,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835
4,V7B4KTBitlBs9YH_B00-9Q,33.481567,-112.073641,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835
5,ctXFXk9-m6PCMoI-Nz2_XQ,33.485290,-112.054091,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835
6,8SgjU2fID9VqKQooFt1DIQ,33.481994,-112.064377,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835
7,1UwJVPsIiQ1VpzRZhLftSw,33.500508,-112.045968,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835
8,N_IFBJrVN5CVRNi4UmKf1g,33.499424,-112.062931,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835
9,0mWRAT95QOSkc_PqKTURbg,33.489876,-112.051068,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835


### Selecting restaurants within 5 mile radius!

In [23]:
#17 minutes for food- 30 miles
import geopy.distance
res_list = []
for index,row in user_business_lat_long_.iterrows():
    user_loc = (row['u_lat'],row['u_long'])
    rest_loc = (row['latitude'],row['longitude'])
    dist = geopy.distance.vincenty(user_loc,rest_loc).miles
    part = []
    part.append(row['user_id'])
    part.append(row['business_id'])
    part.append(dist)
    res_list.append(part)

user_rest_dist = pd.DataFrame(res_list, columns=['user_id', 'business_id','dist'])
user_rest_dist = user_rest_dist[(user_rest_dist['dist']<=10)]
train_done = user_rest_dist[['user_id','business_id']]
train_done


Unnamed: 0,user_id,business_id
0,9w4vI3LRTA4JdJyzOxeEMw,Ns20WGWn6s6niKAGIQB4UQ
1,FtswtnMxahDOBwCFt1un0g,Ns20WGWn6s6niKAGIQB4UQ
2,UmXzwFM2OkZHWJ5cJZlqkg,Ns20WGWn6s6niKAGIQB4UQ
3,asL1d04xS3y8YQtU5ota-g,Ns20WGWn6s6niKAGIQB4UQ
4,V7B4KTBitlBs9YH_B00-9Q,Ns20WGWn6s6niKAGIQB4UQ
5,ctXFXk9-m6PCMoI-Nz2_XQ,Ns20WGWn6s6niKAGIQB4UQ
6,8SgjU2fID9VqKQooFt1DIQ,Ns20WGWn6s6niKAGIQB4UQ
7,1UwJVPsIiQ1VpzRZhLftSw,Ns20WGWn6s6niKAGIQB4UQ
8,N_IFBJrVN5CVRNi4UmKf1g,Ns20WGWn6s6niKAGIQB4UQ
9,0mWRAT95QOSkc_PqKTURbg,Ns20WGWn6s6niKAGIQB4UQ


### Assigning Scores to Pairings

In [24]:
result_1 =pd.merge(user_df[['user_id','num_friends','review_count']],train_done, how="inner",on='user_id')
result_2 =pd.merge(business_df[['business_id','review_count']],result_1, how="inner",on='business_id')
#user_df[['user_id','num_friends','review_count']]
#business_df[['business_id','review_count']]
result_2
result_2_list = []
for index,row in result_2.iterrows():
    res_part =[]
    res_part.append(row['business_id'])
    res_part.append(row['user_id'])
    res_part.append(0.5*row['review_count_x']+0.3*row['review_count_y']+0.2*row['num_friends'])
    #res_part.append(row['review_count_y'])
    result_2_list.append(res_part)

train_done_ranked = pd.DataFrame(result_2_list, columns=['business_id', 'user_id','score']).drop_duplicates()
check_test = train_done_ranked.sort_values('score',ascending=False)

### Comparing the list with the Test Set to find common entries!

In [25]:
final_prediction = pd.merge(test_review[['business_id','user_id']],check_test, how="inner", on=["business_id",'user_id'])
final_prediction

Unnamed: 0,business_id,user_id,score
0,7m1Oa1VYV98UUuo_6i0EZg,O_GWZZfQx7qv-n-CN7hsIA,1592.1
1,7m1Oa1VYV98UUuo_6i0EZg,8-elsQ1aqXXoOBBB6hzFsw,750.2
2,Wc9UpJhOcdSj7olZkz7SJA,MMf0LhEk5tGa1LvN7zcDnA,1020.1
3,jIzygnVmajEXYmfsBNY_Gw,ZcwLnR_VGfgLmuJ_7tnAjQ,618.0
4,eS29S_06lvsDW04wVrIVxg,PxzBS6FXePo9p__FRVLMSg,756.8
5,zJGtD3y-pAIGNId4codEEg,gKccjX4MhqW7hNHmmXWhxg,335.9
6,89uU51kOiQXbJHVA3C6XMQ,sTcYq6goD1Fa2WS9MSkSvQ,734.9
7,Ns20WGWn6s6niKAGIQB4UQ,8SgjU2fID9VqKQooFt1DIQ,870.8
8,Ns20WGWn6s6niKAGIQB4UQ,0Bm2BA66nUSJtgpOI9XBPQ,378.3
9,3C5Z9homtzkWHouH2BHXYQ,YF3KmwYMeONIfX02OTxwFw,798.2


In [26]:
user_business_lat_long_
location_plot = pd.merge(user_business_lat_long_,final_prediction, how="inner", on=["business_id",'user_id'])
location_plot 

Unnamed: 0,user_id,u_lat,u_long,business_id,latitude,longitude,score
0,8SgjU2fID9VqKQooFt1DIQ,33.481994,-112.064377,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835,870.8
1,0Bm2BA66nUSJtgpOI9XBPQ,33.46722,-112.070909,Ns20WGWn6s6niKAGIQB4UQ,33.451778,-112.076835,378.3
2,PxzBS6FXePo9p__FRVLMSg,33.458087,-112.070339,eS29S_06lvsDW04wVrIVxg,33.479016,-112.047963,756.8
3,sTcYq6goD1Fa2WS9MSkSvQ,33.48645,-112.070966,89uU51kOiQXbJHVA3C6XMQ,33.431499,-112.056034,734.9
4,sTcYq6goD1Fa2WS9MSkSvQ,33.48645,-112.070966,_WtxQbDK7B-ExGdeG-2j6Q,33.530393,-112.047055,746.9
5,8-elsQ1aqXXoOBBB6hzFsw,33.683323,-112.092428,7m1Oa1VYV98UUuo_6i0EZg,33.639913,-111.995703,750.2
6,O_GWZZfQx7qv-n-CN7hsIA,33.545344,-112.043044,7m1Oa1VYV98UUuo_6i0EZg,33.639913,-111.995703,1592.1
7,MMf0LhEk5tGa1LvN7zcDnA,33.46449,-112.064772,Wc9UpJhOcdSj7olZkz7SJA,33.495399,-112.005249,1020.1
8,e9V-hsaLlQIFvo5GnJIkbg,33.485323,-112.063226,3C5Z9homtzkWHouH2BHXYQ,33.487747,-112.065665,790.6
9,YF3KmwYMeONIfX02OTxwFw,33.500836,-112.007108,3C5Z9homtzkWHouH2BHXYQ,33.487747,-112.065665,798.2


In [27]:
map = folium.Map(location=[33.455036,-112.079285], zoom_start=7)
for index,row in location_plot.iterrows():
    user = (row['u_lat'],row['u_long'])
    rest = (row['latitude'],row['longitude'])
    folium.Marker(user,icon=folium.Icon(color='red')).add_to(map)
    folium.Marker(rest,icon=folium.Icon(color='green')).add_to(map)
    folium.PolyLine(locations=[user, rest], color='blue').add_to(map)
    
# for point in range(0, len(locationlist)):
#     folium.Marker(locationlist[point]).add_to(map)
map

In [35]:
# convert to (n, 2) nd-array format for heatmap
map = folium.Map(location=[33.455036,-112.079285], zoom_start=9)
stationArr = location_plot[['latitude', 'longitude']].as_matrix()
import folium.plugins
# plot heatmap
map.add_children(folium.plugins.HeatMap(stationArr, radius=15))
map

  This is separate from the ipykernel package so we can avoid doing imports until
  
