In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn import datasets
import pandas as pd


from math import sin, cos, sqrt, atan2, radians

#sentiment packages
from textblob import TextBlob

In [2]:
# Read the business table (already augmented with national neighbor data)
# from a single JSON document; lines=False means the file is NOT JSON-lines.
business_df = pd.read_json(
    'completed_distance_df.json',
    lines=False,
)


In [156]:
# Peek at one raw `categories` value (index label 22) to see its format.
business_df['categories'][22]

'Restaurants, Asian Fusion, Sushi Bars'

In [267]:
# Add one 0/1 indicator column per business type (is_restaurant, is_shopping,
# is_fitness, is_gas), derived from each row's `categories` value.
#
# `helper` is defined in a cell further down this notebook; that cell has to
# be executed before this one.
#
# Series.map is used instead of DataFrame.applymap: applymap is deprecated in
# recent pandas releases, and mapping the Series directly assigns a Series
# (rather than a one-column DataFrame) to the new column.
for business_type in ('restaurant', 'shopping', 'fitness', 'gas'):
    business_df['is_' + business_type] = business_df['categories'].map(
        # Bind the loop variable as a default so each lambda keeps its own type.
        lambda categories, business_type=business_type: helper(categories, business_type)
    )

In [242]:
# List the distinct raw `categories` strings present in the data.
business_df['categories'].unique()

array(['Movers, Local Services, Self Storage, Home Services',
       'Car Dealers, Auto Repair, Auto Parts & Supplies, Automotive',
       'Sushi Bars, Buffets, Restaurants, Chinese', ...,
       'American (Traditional), Restaurants, Sports Bars, Pubs, Bars, Diners, American (New), Nightlife',
       'Grocery, Automotive, Food, Gas Stations, Farmers Market, Shopping, Convenience Stores',
       'Trainers, Fitness & Instruction, Pilates, Gyms, Active Life'],
      dtype=object)

Unnamed: 0,categories
0,
1,
2,
3,
4,
...,...
69971,
69972,
69973,
69974,


In [263]:
# Keywords whose presence in a category string marks a business as belonging
# to the corresponding type. Matching is case-insensitive (see `helper`).
restaurant_words = ['restaurant', 'bar', 'diner', 'pub', 'food']
shopping_words = ['shopping']
fitness_words = ['Trainer', 'Fitness', 'Pilates', 'Gym']
gas_words = ['gas']

def helper(x, business_type):
    """Return 1 if the category string `x` matches `business_type`, else 0.

    Parameters
    ----------
    x : str or any
        Comma-separated category names, e.g.
        'Restaurants, Asian Fusion, Sushi Bars'. Any non-string value
        (missing data such as None/NaN) yields 0.
    business_type : str
        One of 'restaurant', 'shopping', 'fitness', 'gas'.

    Returns
    -------
    int
        1 when any keyword for `business_type` occurs, case-insensitively, as
        a substring of any comma-separated category in `x`; otherwise 0.

    Raises
    ------
    ValueError
        If `business_type` is not one of the supported types.
    """
    # Built at call time so later edits to the module-level lists are honoured.
    keywords_by_type = {
        'restaurant': restaurant_words,
        'shopping': shopping_words,
        'fitness': fitness_words,
        'gas': gas_words,
    }
    if business_type not in keywords_by_type:
        # Fail loudly instead of hitting an unbound local further down.
        raise ValueError(
            'unknown business_type %r; expected one of %s'
            % (business_type, sorted(keywords_by_type))
        )

    # isinstance (not an exact type check) so str subclasses such as
    # numpy.str_ are still treated as text.
    if not isinstance(x, str):
        return 0

    keywords = [keyword.lower() for keyword in keywords_by_type[business_type]]
    # NOTE(review): this is substring matching, so e.g. 'bar' also matches a
    # category like 'Barbers'. Kept as-is to leave the derived features
    # unchanged; confirm whether whole-word matching is wanted.
    for category in x.lower().split(","):
        if any(keyword in category for keyword in keywords):
            return 1
    return 0

In [268]:
# Build a class-balanced dataset: keep every closed business and an equally
# sized random sample of open businesses.
# train_test_split is imported here because later cells call it.
from sklearn.model_selection import train_test_split

closed_businesses = business_df[business_df['is_open'] == 0]
open_businesses = business_df[business_df['is_open'] == 1]

# Seeded sampling so the balanced set (and every score computed from it) is
# reproducible across kernel restarts. DataFrame.sample also avoids binding
# `_`, which IPython uses for the previous cell's output.
# Raises ValueError if there are fewer open than closed businesses.
random_open = open_businesses.sample(n=closed_businesses.shape[0], random_state=42)

even_split_data = pd.concat([closed_businesses, random_open], ignore_index=True)


In [278]:
# Model inputs: numeric business attributes plus the four category indicators.
feature_columns = [
    'stars',
    'review_count',
    'tip_count',
    'chain',
    '.1_count',
    '.3_count',
    'mean_tip_sentiment',
    'is_restaurant',
    'is_shopping',
    'is_fitness',
    'is_gas',
]
even_split_features = even_split_data[feature_columns]

# Prediction target: the `is_open` column of the balanced dataset.
even_split_targets = even_split_data['is_open']

In [104]:
from sklearn.preprocessing import MinMaxScaler

In [279]:
# Min-max scale every feature column; `scaler` keeps the fitted parameters.
# NOTE(review): the scaler is fit on all rows before the train/test split
# below, so the held-out data influences the scaling. Confirm this is acceptable.
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(even_split_features)

# np.asarray accepts both a Series and an ndarray, so this cell can be re-run
# safely (calling .to_numpy() a second time would fail on the ndarray).
even_split_targets = np.asarray(even_split_targets)

In [280]:
#do not touch
# Shuffle and split the scaled data. An integer test_size=1 holds out exactly
# ONE sample (it is an absolute count, not a fraction); the models below are
# evaluated with cross-validation on the training part.
# random_state pins the shuffle so re-running this cell yields the same split.
training_features, test_features, training_targets, test_targets = train_test_split(
    normalized_features,
    even_split_targets,
    test_size=1,
    random_state=42,
)


In [None]:
#experiment zone
# Variant of the pipeline that drops the is_* category indicator columns.
# All experiment_* objects are built only from other experiment_* objects so
# the experiment neither reads nor depends on the main pipeline's state.
experiment_even_split_features = even_split_data[['stars', 'review_count', 'tip_count', 'chain', '.1_count', '.3_count', 'mean_tip_sentiment']]
experiment_even_split_targets = even_split_data['is_open']

# Fit and apply the experiment's own scaler on the experiment's own features
# (previously the main feature set and main scaler were used here, so the
# reduced column list never reached the model).
experiment_scaler = MinMaxScaler()
experiment_normalized_features = experiment_scaler.fit_transform(experiment_even_split_features)

# Convert the experiment's own target Series (assigned just above, so this
# stays re-runnable) rather than the main pipeline's already-converted array.
experiment_even_split_targets = experiment_even_split_targets.to_numpy()

# test_size=1 holds out a single sample; random_state keeps the split reproducible.
experiment_training_features, experiment_test_features, experiment_training_targets, experiment_test_targets = train_test_split(
    experiment_normalized_features,
    experiment_even_split_targets,
    test_size=1,
    random_state=42,
)


In [272]:
# Baseline: 5-fold cross-validation of a default LogisticRegression using the
# estimator's default scorer.
clf = LogisticRegression()
scores = cross_val_score(
    clf,
    training_features,
    training_targets,
    cv=5,
)
print(scores)

[0.65704918 0.64415301 0.65770492 0.65464481 0.63213115]


In [281]:
# 5-fold cross-validation of a default GradientBoostingClassifier; report the
# per-fold scores followed by their mean.
clf = ensemble.GradientBoostingClassifier()
scores = cross_val_score(
    clf,
    training_features,
    training_targets,
    cv=5,
)
mean_score = scores.mean()
print('Illinois Businesses Boosting Classifier Scores', scores)
print('Illinois Businesses Boosting Classifier Mean Score', mean_score)

Illinois Businesses Boosting Classifier Scores [0.66098361 0.67715847 0.67387978 0.67650273 0.67562842]
Illinois Businesses Boosting Classifier Mean Score 0.6728306010928963


In [137]:
# Compare both models on 5-fold cross-validated ROC AUC. Each entry pairs an
# estimator with the two labels printed for it.
roc_auc_runs = [
    (
        LogisticRegression(),
        'National Neighbor Data Logistic Regression ROC AUC scores',
        'National Neighbor Data Logistic Regression ROC AUC Mean score',
    ),
    (
        ensemble.GradientBoostingClassifier(),
        'National Neighbor Data Boosting Classifier ROC AUC scores',
        'National Neighbor Data Boosting Classifier ROC AUC Mean score',
    ),
]

for run_index, (clf, scores_label, mean_label) in enumerate(roc_auc_runs):
    if run_index > 0:
        # Blank line separating one model's report from the next.
        print()
    scores = cross_val_score(clf, training_features, training_targets, cv=5, scoring='roc_auc')
    print(scores_label, scores)
    print(mean_label, np.mean(scores))

National Neighbor Data Logistic Regression ROC AUC scores [0.6490527  0.63211656 0.63835345 0.64230937 0.63410408]
National Neighbor Data Logistic Regression ROC AUC Mean score 0.6391872311116955

National Neighbor Data Boosting Classifier ROC AUC scores [0.70735397 0.69580649 0.68652956 0.69930529 0.68421973]
National Neighbor Data Boosting Classifier ROC AUC Mean score 0.694643007298779


In [126]:
# Cross-validate a default GradientBoostingClassifier (5 folds) on the current
# training split and print the fold scores and their average.
clf = ensemble.GradientBoostingClassifier()
scores = cross_val_score(
    estimator=clf,
    X=training_features,
    y=training_targets,
    cv=5,
)
print('Illinois Businesses Boosting Classifier Scores', scores)
print('Illinois Businesses Boosting Classifier Mean Score', scores.mean())

Illinois Businesses Boosting Classifier Scores [0.63825137 0.64546448 0.63300546 0.63693989 0.62622951]
Illinois Businesses Boosting Classifier Mean Score 0.6359781420765028
