In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn import datasets
import pandas as pd

from math import sin, cos, sqrt, atan2, radians

#sentiment packages
from textblob import TextBlob

In [3]:
# load business_df dataframe with ALL additional columns
# run instead of cells below
business_df = pd.read_json('business_df.json', lines=False)
business_df.head()

# for saving business_df to json file
# business_df.to_json(r'business_df.json')

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,chain,tip_count,mean_tip_sentiment
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",0.252695,False,1,0.2
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",False,22,0.097434
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",False,37,0.475766
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,"15655 W Roosevelt St, Ste 237",Goodyear,AZ,85338,33.455613,-112.395596,5.0,3,1,0.252695,"Insurance, Financial Services","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",False,0,0.252695
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"4209 Stuart Andrew Blvd, Ste F",Charlotte,NC,28217,35.190012,-80.887223,4.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",False,0,0.252695


In [2]:
#load business data
business_df = pd.read_json('business.json', lines=True)

#load tip data
tip_df = pd.read_json('tip.json', lines=True)

# Add Chain Column

In [3]:
# Create a dictionary where key=business_name, value=count of that business
business_names = {}
for index, tip in business_df.iterrows():
    business_name = tip['name']
    if business_name in business_names:
        business_names[business_name] += 1
    else:
        business_names[business_name] = 1

# Add a Boolean column 'chain' to business_df
# True if there are more than one business by the same name
business_df['chain'] = False
for index, business in business_df.iterrows():
    business_name = business['name']
    if business_names[business_name] > 1:
        business_df.at[index, 'chain'] = True

# Add Tip_Count Column

In [4]:
# create a dictionary of tips matched to business IDs
bzn_tips = {}
for index, tip in tip_df.iterrows():
    business_id = tip['business_id']
    if business_id in bzn_tips:
        bzn_tips[business_id] += 1
    else:
        bzn_tips[business_id] = 1

# Add a 'tip_count' column to businesses_df dataframe
business_df['tip_count'] = 0

for index, business in business_df.iterrows():
    business_id = business['business_id']
    if business_id in bzn_tips:
        business_df.at[index, 'tip_count'] = bzn_tips[business_id]

# Sentiment Analysis of Tips.json

In [5]:
#load tip sentiment data
tips_sentiment_df = pd.read_json('tips_with_sentiment.json', orient='records')

# Add mean_tip_sentiment Column

In [6]:
# aggregate mean sentiments by 'business_id'
mean_tips_sentiment = tips_sentiment_df.groupby('business_id').mean()[['sentiment']]

# Join/Append 'sentiment' column to business_df
business_df = business_df.join(mean_tips_sentiment, on='business_id')

# Fill NaNs with mean_sentiment
mean_sentiment = business_df['sentiment'].mean()
business_df = business_df.fillna(value=mean_sentiment)
business_df = business_df.rename(columns={"sentiment": "mean_tip_sentiment"})
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,chain,tip_count,mean_tip_sentiment
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",0.252695,False,1,0.2
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",False,22,0.097434
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",False,37,0.475766
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,"15655 W Roosevelt St, Ste 237",Goodyear,AZ,85338,33.455613,-112.395596,5.0,3,1,0.252695,"Insurance, Financial Services","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",False,0,0.252695
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"4209 Stuart Andrew Blvd, Ste F",Charlotte,NC,28217,35.190012,-80.887223,4.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",False,0,0.252695


# Add Neighbor Columns
- Begin using illinois_business df instead of business_df

In [6]:
def get_distance(lat1, lon1, lat2, lon2):
    # approximate radius of earth in km
    R = 6373.0
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    return distance


illinois_business = business_df[business_df['state'] == 'IL']

bus_loc = [[] for i in range(illinois_business.shape[0])]
count = 0
for index, row in illinois_business.iterrows():
    bus_loc[count].append(row['business_id'])
    bus_loc[count].append(row['latitude'])
    bus_loc[count].append(row['longitude'])
    count += 1
    
il_neighbors_close = [[] for i in range(len(bus_loc))]
il_neighbors_far = [[] for i in range(len(bus_loc))]

for i, biz1 in enumerate(bus_loc):
    for j, biz2 in enumerate(bus_loc):
        if i == j:
            continue
        distance = get_distance(biz1[1], biz1[2], biz2[1], biz2[2])
        if distance < 0.3:
            il_neighbors_far[i].append([biz2[0], distance])
        if distance < 0.1:
            il_neighbors_close[i].append([biz2[0], distance])
            

illinois_business['.1_km'] = il_neighbors_close
illinois_business['.3_km'] = il_neighbors_far

# number_neighbors_close = [0 for i in range(len(il_neighbors_close))]
# number_neighbors_far = [0 for i in range(len(il_neighbors_far))]


# for i, bzn in enumerate(il_neighbors_close):
#     number_neighbors_close[i] = len(bzn)
    
# for i, bzn in enumerate(il_neighbors_far):
#     number_neighbors_far[i] = len(bzn)

illinois_business['.1_count'] = illinois_business['.1_km'].apply(lambda x: len(x))
illinois_business['.3_count'] = illinois_business['.3_km'].apply(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

# 1) Logistic Regression of Illinois Businesses

In [7]:
il_LogReg_data = illinois_business[['stars', 'review_count', 'chain', 'tip_count', '.1_count', '.3_count', 'mean_tip_sentiment']]
il_LogReg_targets = illinois_business['is_open']
il_LogReg_data.head()

Unnamed: 0,stars,review_count,chain,tip_count,.1_count,.3_count,mean_tip_sentiment
289,3.0,11,False,0,2,5,0.252695
330,2.5,3,False,0,8,30,0.252695
356,2.5,72,False,4,5,23,0.495312
361,2.5,12,True,2,1,50,-0.585
368,4.0,5,False,1,15,30,0.494444


In [8]:
clf = LogisticRegression()
scores = cross_val_score(clf, il_LogReg_data, il_LogReg_targets, cv=5)
print('Logistic Regression Scores', scores)

Logistic Regression Scores [0.79328165 0.79844961 0.80569948 0.79533679 0.79274611]


# Gradient Boosting Classifier of Illinois Businesses

In [9]:
clf = ensemble.GradientBoostingClassifier()
scores = cross_val_score(clf, il_LogReg_data, il_LogReg_targets, cv=5)
print('Illinois Businesses Boosting Classifier Scores', scores)

Illinois Businesses Boosting Classifier Scores [0.80620155 0.79328165 0.77720207 0.78756477 0.79015544]


# 1) Logistic Regression of National Businesses, No Neighbors

In [14]:
national_data = illinois_business[['stars', 'review_count', 'chain', 'tip_count', 'mean_tip_sentiment']]
national_targets = illinois_business['is_open']
il_LogReg_data.head()

Unnamed: 0,stars,review_count,chain,tip_count,.1_count,.3_count,mean_tip_sentiment
289,3.0,11,False,0,2,5,0.252695
330,2.5,3,False,0,8,30,0.252695
356,2.5,72,False,4,5,23,0.495312
361,2.5,12,True,2,1,50,-0.585
368,4.0,5,False,1,15,30,0.494444


In [15]:
clf = LogisticRegression()
scores = cross_val_score(clf, national_data, national_targets, cv=5)
print('Logistic Regression Scores, National', scores)

Logistic Regression Scores, National [0.79844961 0.79844961 0.80051813 0.80051813 0.80051813]


In [17]:
clf = ensemble.GradientBoostingClassifier()
scores = cross_val_score(clf, national_data, national_targets, cv=5)
print('National Business Boosting Classifier Scores', scores)

National Business Boosting Classifier Scores [0.79586563 0.7881137  0.79274611 0.79015544 0.80051813]


# Identify Neighbors for National Set

In [19]:
business_df.head(1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,chain,tip_count,mean_tip_sentiment
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",0.252695,False,1,0.2


In [22]:
state_dict = {}
for index, row in business_df.iterrows():
    state = row['state']
    if row['state'] in state_dict:
        state_dict[state] += 1
    else:
        state_dict[state] = 1

In [41]:
large_states = ["IL", 'PA', 'AZ', 'ON', 'NC', 'AB', 'NV', 'OH', "QC", "WI", "SC"]
large_state_df = business_df[business_df.state.isin(large_states)]
large_state_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,chain,tip_count,mean_tip_sentiment
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",0.252695,False,1,0.2
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",False,22,0.097434
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",False,37,0.475766
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,"15655 W Roosevelt St, Ste 237",Goodyear,AZ,85338,33.455613,-112.395596,5.0,3,1,0.252695,"Insurance, Financial Services","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",False,0,0.252695
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"4209 Stuart Andrew Blvd, Ste F",Charlotte,NC,28217,35.190012,-80.887223,4.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",False,0,0.252695


In [39]:
for state in large_states:
    current_state_df = large_state_df[large_state_df.state == state]
    if current_state_df.shape[1] < 15000:
        bus_loc = [[] for i in range(current_state_df.shape[0])]
        count = 0
        for index, row in current_state_df.iterrows():
            bus_loc[count].append(row['business_id'])
            bus_loc[count].append(row['latitude'])
            bus_loc[count].append(row['longitude'])
            count += 1
    
        current_neighbors_close = [[] for i in range(len(bus_loc))]
        current_neighbors_far = [[] for i in range(len(bus_loc))]

        for i, biz1 in enumerate(bus_loc):
            if i % 1000 == 0:
                print(state, i)
        for j, biz2 in enumerate(bus_loc):
            if i == j:
                continue
            distance = get_distance(biz1[1], biz1[2], biz2[1], biz2[2])
            if distance < 0.3:
                current_neighbors_far[i].append([biz2[0], distance])
            if distance < 0.1:
                current_neighbors_close[i].append([biz2[0], distance])
            

        large_state_df['state' == state]['.1_km'] = current_neighbors_close
        large_state_df['state' == state]['.3_km'] = current_neighbors_far
        


AZ 0
AZ 1000
AZ 2000
AZ 3000
AZ 4000
AZ 5000
AZ 6000
AZ 7000
AZ 8000
AZ 9000
AZ 10000
AZ 11000
AZ 12000
AZ 13000
AZ 14000
AZ 15000
AZ 16000
AZ 17000
AZ 18000
AZ 19000
AZ 20000
AZ 21000
AZ 22000
AZ 23000
AZ 24000
AZ 25000
AZ 26000
AZ 27000
AZ 28000
AZ 29000
AZ 30000
AZ 31000
AZ 32000
AZ 33000
AZ 34000
AZ 35000
AZ 36000
AZ 37000
AZ 38000
AZ 39000
AZ 40000
AZ 41000
AZ 42000
AZ 43000
AZ 44000
AZ 45000
AZ 46000
AZ 47000
AZ 48000
AZ 49000
AZ 50000
AZ 51000
AZ 52000
AZ 53000
AZ 54000
AZ 55000
AZ 56000


KeyError: False

In [36]:
pa_business = large_state_df[large_state_df['state'] == 'PA']

bus_loc = [[] for i in range(pa_business.shape[0])]
count = 0
for index, row in pa_business.iterrows():
    bus_loc[count].append(row['business_id'])
    bus_loc[count].append(row['latitude'])
    bus_loc[count].append(row['longitude'])
    count += 1
    
pa_neighbors_close = [[] for i in range(len(bus_loc))]
pa_neighbors_far = [[] for i in range(len(bus_loc))]

for i, biz1 in enumerate(bus_loc):
    if i % 1000 == 0:
        print(i)
    for j, biz2 in enumerate(bus_loc):
        if i == j:
            continue
        distance = get_distance(biz1[1], biz1[2], biz2[1], biz2[2])
        if distance < 0.3:
            pa_neighbors_far[i].append([biz2[0], distance])
        if distance < 0.1:
            pa_neighbors_close[i].append([biz2[0], distance])
            

pa_business['.1_km'] = pa_neighbors_close
pa_business['.3_km'] = pa_neighbors_far

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:

illinois_business = business_df[business_df['state'] == 'IL']

bus_loc = [[] for i in range(illinois_business.shape[0])]
count = 0
for index, row in illinois_business.iterrows():
    bus_loc[count].append(row['business_id'])
    bus_loc[count].append(row['latitude'])
    bus_loc[count].append(row['longitude'])
    count += 1
    
il_neighbors_close = [[] for i in range(len(bus_loc))]
il_neighbors_far = [[] for i in range(len(bus_loc))]

for i, biz1 in enumerate(bus_loc):
    for j, biz2 in enumerate(bus_loc):
        if i == j:
            continue
        distance = get_distance(biz1[1], biz1[2], biz2[1], biz2[2])
        if distance < 0.3:
            il_neighbors_far[i].append([biz2[0], distance])
        if distance < 0.1:
            il_neighbors_close[i].append([biz2[0], distance])
            

illinois_business['.1_km'] = il_neighbors_close
illinois_business['.3_km'] = il_neighbors_far


# Task 2

In [10]:
# # 2a
# clf = LogisticRegression()
# scores = cross_val_score(clf, data, new_targets, cv=5)
# print('Logistic Regression Scores', scores)

# clf = ensemble.GradientBoostingClassifier()
# scores = cross_val_score(clf, data, new_targets, cv=5)
# print('Boosting Classifier Scores', scores)

In [11]:
# # 2b
# tuned_parameters = [{'max_depth': [3, 5],
#                      'n_estimators': [100, 200],
#                      'learning_rate': [0.1, 0.5]}]
# clf = ensemble.GradientBoostingClassifier()
# clf = GridSearchCV(clf, tuned_parameters)
# clf.fit(data, new_targets)

# print("Scores for parameter grid search:")
# print()
# means = clf.cv_results_['mean_test_score']
# stds = clf.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, clf.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r"
#           % (mean, std * 2, params))

In [12]:
# # 2c
# clf = LogisticRegression()
# scores = cross_val_score(clf, data, new_targets, cv=5, scoring='roc_auc')
# print('Logistic Regression ROC AUC scores', scores)


# clf = ensemble.GradientBoostingClassifier()
# scores = cross_val_score(clf, data, new_targets, cv=5, scoring='roc_auc')
# print('Boosting Classifier ROC AUC scores', scores)