# Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, roc_curve, auc, make_scorer, roc_auc_score

# Styling 
from IPython.display import HTML, display
import tabulate

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

%matplotlib inline

# Read in data

In [2]:
yelp = pd.read_csv('./../../data/yelp_cleaned.csv')

In [3]:
yelp.head()

Unnamed: 0.1,Unnamed: 0,id,alias,name,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,distance,price1,cat1,cat2,latitude,longitude,zip_code,state,city,nyc,manhattan,brooklyn,staten_island,bronx,queens,borough,merclong,merclat
0,0,C3spvfEd8JQJie-yixRgpQ,com-tam-ninh-kieu-bronx,Com Tam Ninh Kieu,False,https://www.yelp.com/biz/com-tam-ninh-kieu-bro...,379,"[{'alias': 'vietnamese', 'title': 'Vietnamese'...",4.0,"{'latitude': 40.86702, 'longitude': -73.8982}","['delivery', 'pickup']",2,"{'address1': '2641 Jerome Ave', 'address2': ''...",8219.141546,$$,vietnamese,Vietnamese,40.86702,-73.8982,10468,NY,Bronx,True,False,False,False,True,False,bronx,-8226310.0,4992747.0
1,1,b6jOwyX4iaagw8YjXqq1sA,antonios-trattoria-bronx,Antonio's Trattoria,False,https://www.yelp.com/biz/antonios-trattoria-br...,773,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.5,"{'latitude': 40.854131, 'longitude': -73.886601}",[],2,"{'address1': '2370 Belmont Ave', 'address2': N...",1273.311721,$$,italian,Italian,40.854131,-73.886601,10458,NY,Bronx,True,False,False,False,True,False,bronx,-8225019.0,4990850.0
2,2,ciHt5n5rmpby1YcC_6JVrg,bronx-alehouse-bronx-2,Bronx Alehouse,False,https://www.yelp.com/biz/bronx-alehouse-bronx-...,749,"[{'alias': 'tradamerican', 'title': 'American ...",4.0,"{'latitude': 40.8847016520226, 'longitude': -7...",[],2,"{'address1': '216 W 238th St', 'address2': '',...",9528.584185,$$,tradamerican,American (Traditional),40.884702,-73.899498,10463,NY,Bronx,True,False,False,False,True,False,bronx,-8226454.0,4995350.0
3,4,KAxYFGyOQ7ysCmYmh8jKtw,the-bronx-public-bronx,The Bronx Public,False,https://www.yelp.com/biz/the-bronx-public-bron...,443,"[{'alias': 'sportsbars', 'title': 'Sports Bars...",4.0,"{'latitude': 40.87827, 'longitude': -73.90341}","['restaurant_reservation', 'delivery', 'pickup']",2,"{'address1': '170 W 231st St', 'address2': '',...",9315.419234,$$,sportsbars,Sports Bars,40.87827,-73.90341,10463,NY,Bronx,True,False,False,False,True,False,bronx,-8226890.0,4994403.0
4,5,tLZfepIfSf0a80zHQTbTGQ,trattoria-zero-otto-nove-bronx,Trattoria Zero Otto Nove,False,https://www.yelp.com/biz/trattoria-zero-otto-n...,749,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,"{'latitude': 40.8546515, 'longitude': -73.8883...",[],2,"{'address1': '2357 Arthur Ave', 'address2': ''...",1200.986458,$$,italian,Italian,40.854652,-73.888353,10458,NY,Bronx,True,False,False,False,True,False,bronx,-8225214.0,4990926.0


In [4]:
yelp_clusters = pd.read_csv('./../../data/cluster_zip_2.csv')

In [5]:
# Turn dataframe into a dictionary
cluster_dict = dict(zip(yelp_clusters['zipcode'], yelp_clusters['cluster']))

In [6]:
yelp.columns

Index(['Unnamed: 0', 'id', 'alias', 'name', 'is_closed', 'url', 'review_count',
       'categories', 'rating', 'coordinates', 'transactions', 'price',
       'location', 'distance', 'price1', 'cat1', 'cat2', 'latitude',
       'longitude', 'zip_code', 'state', 'city', 'nyc', 'manhattan',
       'brooklyn', 'staten_island', 'bronx', 'queens', 'borough', 'merclong',
       'merclat'],
      dtype='object')

In [7]:
yelp['cluster'] = [cluster_dict[zipcode] for zipcode in yelp['zip_code']]

In [8]:
yelp.head()

Unnamed: 0.1,Unnamed: 0,id,alias,name,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,distance,price1,cat1,cat2,latitude,longitude,zip_code,state,city,nyc,manhattan,brooklyn,staten_island,bronx,queens,borough,merclong,merclat,cluster
0,0,C3spvfEd8JQJie-yixRgpQ,com-tam-ninh-kieu-bronx,Com Tam Ninh Kieu,False,https://www.yelp.com/biz/com-tam-ninh-kieu-bro...,379,"[{'alias': 'vietnamese', 'title': 'Vietnamese'...",4.0,"{'latitude': 40.86702, 'longitude': -73.8982}","['delivery', 'pickup']",2,"{'address1': '2641 Jerome Ave', 'address2': ''...",8219.141546,$$,vietnamese,Vietnamese,40.86702,-73.8982,10468,NY,Bronx,True,False,False,False,True,False,bronx,-8226310.0,4992747.0,23
1,1,b6jOwyX4iaagw8YjXqq1sA,antonios-trattoria-bronx,Antonio's Trattoria,False,https://www.yelp.com/biz/antonios-trattoria-br...,773,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.5,"{'latitude': 40.854131, 'longitude': -73.886601}",[],2,"{'address1': '2370 Belmont Ave', 'address2': N...",1273.311721,$$,italian,Italian,40.854131,-73.886601,10458,NY,Bronx,True,False,False,False,True,False,bronx,-8225019.0,4990850.0,23
2,2,ciHt5n5rmpby1YcC_6JVrg,bronx-alehouse-bronx-2,Bronx Alehouse,False,https://www.yelp.com/biz/bronx-alehouse-bronx-...,749,"[{'alias': 'tradamerican', 'title': 'American ...",4.0,"{'latitude': 40.8847016520226, 'longitude': -7...",[],2,"{'address1': '216 W 238th St', 'address2': '',...",9528.584185,$$,tradamerican,American (Traditional),40.884702,-73.899498,10463,NY,Bronx,True,False,False,False,True,False,bronx,-8226454.0,4995350.0,58
3,4,KAxYFGyOQ7ysCmYmh8jKtw,the-bronx-public-bronx,The Bronx Public,False,https://www.yelp.com/biz/the-bronx-public-bron...,443,"[{'alias': 'sportsbars', 'title': 'Sports Bars...",4.0,"{'latitude': 40.87827, 'longitude': -73.90341}","['restaurant_reservation', 'delivery', 'pickup']",2,"{'address1': '170 W 231st St', 'address2': '',...",9315.419234,$$,sportsbars,Sports Bars,40.87827,-73.90341,10463,NY,Bronx,True,False,False,False,True,False,bronx,-8226890.0,4994403.0,58
4,5,tLZfepIfSf0a80zHQTbTGQ,trattoria-zero-otto-nove-bronx,Trattoria Zero Otto Nove,False,https://www.yelp.com/biz/trattoria-zero-otto-n...,749,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,"{'latitude': 40.8546515, 'longitude': -73.8883...",[],2,"{'address1': '2357 Arthur Ave', 'address2': ''...",1200.986458,$$,italian,Italian,40.854652,-73.888353,10458,NY,Bronx,True,False,False,False,True,False,bronx,-8225214.0,4990926.0,23


In [9]:
cluster_is_affluent = pd.read_csv('./../../data/cluster_is_affluent.csv')

In [10]:
cluster_is_affluent.head()

Unnamed: 0,cluster,is_affluent
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0


In [11]:
cluster_is_affluent_dict = dict(zip(cluster_is_affluent['cluster'], cluster_is_affluent['is_affluent']))

In [12]:
yelp['is_affluent'] = [cluster_is_affluent_dict[cluster] for cluster in yelp['cluster']]

In [13]:
yelp.head()

Unnamed: 0.1,Unnamed: 0,id,alias,name,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,distance,price1,cat1,cat2,latitude,longitude,zip_code,state,city,nyc,manhattan,brooklyn,staten_island,bronx,queens,borough,merclong,merclat,cluster,is_affluent
0,0,C3spvfEd8JQJie-yixRgpQ,com-tam-ninh-kieu-bronx,Com Tam Ninh Kieu,False,https://www.yelp.com/biz/com-tam-ninh-kieu-bro...,379,"[{'alias': 'vietnamese', 'title': 'Vietnamese'...",4.0,"{'latitude': 40.86702, 'longitude': -73.8982}","['delivery', 'pickup']",2,"{'address1': '2641 Jerome Ave', 'address2': ''...",8219.141546,$$,vietnamese,Vietnamese,40.86702,-73.8982,10468,NY,Bronx,True,False,False,False,True,False,bronx,-8226310.0,4992747.0,23,0
1,1,b6jOwyX4iaagw8YjXqq1sA,antonios-trattoria-bronx,Antonio's Trattoria,False,https://www.yelp.com/biz/antonios-trattoria-br...,773,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.5,"{'latitude': 40.854131, 'longitude': -73.886601}",[],2,"{'address1': '2370 Belmont Ave', 'address2': N...",1273.311721,$$,italian,Italian,40.854131,-73.886601,10458,NY,Bronx,True,False,False,False,True,False,bronx,-8225019.0,4990850.0,23,0
2,2,ciHt5n5rmpby1YcC_6JVrg,bronx-alehouse-bronx-2,Bronx Alehouse,False,https://www.yelp.com/biz/bronx-alehouse-bronx-...,749,"[{'alias': 'tradamerican', 'title': 'American ...",4.0,"{'latitude': 40.8847016520226, 'longitude': -7...",[],2,"{'address1': '216 W 238th St', 'address2': '',...",9528.584185,$$,tradamerican,American (Traditional),40.884702,-73.899498,10463,NY,Bronx,True,False,False,False,True,False,bronx,-8226454.0,4995350.0,58,0
3,4,KAxYFGyOQ7ysCmYmh8jKtw,the-bronx-public-bronx,The Bronx Public,False,https://www.yelp.com/biz/the-bronx-public-bron...,443,"[{'alias': 'sportsbars', 'title': 'Sports Bars...",4.0,"{'latitude': 40.87827, 'longitude': -73.90341}","['restaurant_reservation', 'delivery', 'pickup']",2,"{'address1': '170 W 231st St', 'address2': '',...",9315.419234,$$,sportsbars,Sports Bars,40.87827,-73.90341,10463,NY,Bronx,True,False,False,False,True,False,bronx,-8226890.0,4994403.0,58,0
4,5,tLZfepIfSf0a80zHQTbTGQ,trattoria-zero-otto-nove-bronx,Trattoria Zero Otto Nove,False,https://www.yelp.com/biz/trattoria-zero-otto-n...,749,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,"{'latitude': 40.8546515, 'longitude': -73.8883...",[],2,"{'address1': '2357 Arthur Ave', 'address2': ''...",1200.986458,$$,italian,Italian,40.854652,-73.888353,10458,NY,Bronx,True,False,False,False,True,False,bronx,-8225214.0,4990926.0,23,0


In [14]:
yelp.columns

Index(['Unnamed: 0', 'id', 'alias', 'name', 'is_closed', 'url', 'review_count',
       'categories', 'rating', 'coordinates', 'transactions', 'price',
       'location', 'distance', 'price1', 'cat1', 'cat2', 'latitude',
       'longitude', 'zip_code', 'state', 'city', 'nyc', 'manhattan',
       'brooklyn', 'staten_island', 'bronx', 'queens', 'borough', 'merclong',
       'merclat', 'cluster', 'is_affluent'],
      dtype='object')

In [15]:
borough_cols = ['manhattan', 'brooklyn', 'staten_island', 'bronx', 'queens']
for i in borough_cols:
    yelp[i] = [1 if i == True else 0 for i in yelp[i]]

In [16]:
columns_to_keep = ['review_count', 'rating', 'price', 'cat1', 'cat2', 'manhattan',
                   'brooklyn', 'staten_island', 'bronx', 'queens', 'is_affluent']
yelp = yelp[columns_to_keep]

In [17]:
def dummify_categorical_columns(df):
    '''
    Dummify all categorical columns
    '''
    categorical_columns = df.select_dtypes(include="object").columns
    return pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [18]:
yelp = dummify_categorical_columns(yelp)

In [19]:
yelp.head()

Unnamed: 0,review_count,rating,price,manhattan,brooklyn,staten_island,bronx,queens,is_affluent,cat1_afghani,cat1_african,cat1_airportlounges,cat1_arcades,cat1_argentine,cat1_armenian,cat1_artmuseums,cat1_asianfusion,cat1_australian,cat1_austrian,cat1_bagels,cat1_bakeries,cat1_bangladeshi,cat1_barbers,cat1_bars,cat1_basque,cat1_bbq,cat1_beer_and_wine,cat1_beerbar,cat1_beergardens,cat1_belgian,cat1_boating,cat1_bookstores,cat1_bowling,cat1_boxing,cat1_brasseries,cat1_brazilian,cat1_breakfast_brunch,cat1_breweries,cat1_british,cat1_bubbletea,cat1_buffets,cat1_burgers,cat1_burmese,cat1_butcher,cat1_cafes,cat1_cafeteria,cat1_cajun,cat1_cakeshop,cat1_cambodian,cat1_candy,cat1_cantonese,cat1_caribbean,cat1_catering,cat1_champagne_bars,cat1_cheese,cat1_cheesesteaks,cat1_chicken_wings,cat1_chickenshop,cat1_chinese,cat1_chocolate,cat1_cigarbars,cat1_cocktailbars,cat1_coffee,cat1_coffeeroasteries,cat1_colombian,cat1_comedyclubs,cat1_comfortfood,cat1_convenience,cat1_cookingclasses,cat1_cookingschools,cat1_creperies,cat1_cuban,cat1_culturalcenter,cat1_cupcakes,cat1_customcakes,cat1_czech,cat1_danceclubs,cat1_delis,cat1_desserts,cat1_dimsum,cat1_diners,cat1_distilleries,cat1_divebars,cat1_diyfood,cat1_dominican,cat1_donuts,cat1_drugstores,cat1_eatertainment,cat1_egyptian,cat1_empanadas,cat1_ethiopian,cat1_eventplanning,cat1_falafel,cat1_farmersmarket,cat1_festivals,cat1_filipino,cat1_fishnchips,cat1_florists,cat1_fondue,cat1_food,cat1_food_court,cat1_fooddeliveryservices,cat1_foodstands,cat1_foodtrucks,cat1_french,cat1_galleries,cat1_gastropubs,cat1_gaybars,cat1_gelato,cat1_georgian,cat1_german,cat1_giftshops,cat1_gluten_free,cat1_golf,cat1_golflessons,cat1_gourmet,cat1_greek,cat1_grocery,cat1_hair,cat1_haitian,cat1_halal,cat1_hawaiian,cat1_healthmarkets,cat1_herbsandspices,cat1_himalayan,cat1_hkcafe,cat1_hobbyshops,cat1_hookah_bars,cat1_hotdog,cat1_hotdogs,cat1_hotels,cat1_hotpot,cat1_hungarian,cat1_icecream,cat1_indonesian,cat1_indpak,cat1_intlgrocery,cat1_irish,cat1_irish_pubs,cat1_italian,cat1_izakaya,cat1_japacurry,cat1_japanese,cat1_jazzandblues,cat1_juicebars,cat1_karaoke,cat1_kebab,cat1_kids_activities,cat1_kitchenandbath,cat1_korean,cat1_kosher,cat1_landmarks,cat1_laotian,cat1_latin,cat1_laundromat,cat1_lebanese,cat1_localservices,cat1_lounges,cat1_macarons,cat1_magicians,cat1_mags,cat1_malaysian,cat1_markets,cat1_meats,cat1_mediterranean,cat1_mexican,cat1_mideastern,cat1_modern_european,cat1_mongolian,cat1_moroccan,cat1_movietheaters,cat1_museums,cat1_musicvenues,cat1_musicvideo,cat1_newamerican,cat1_newmexican,cat1_nightlife,cat1_nonprofit,cat1_noodles,cat1_organic_stores,cat1_pakistani,cat1_panasian,cat1_pastashops,cat1_persian,cat1_peruvian,cat1_petstore,cat1_pianobars,cat1_pizza,cat1_poke,cat1_polish,cat1_poolhalls,cat1_popuprestaurants,cat1_portuguese,cat1_pretzels,cat1_publicmarkets,cat1_pubs,cat1_puertorican,cat1_ramen,cat1_restaurants,cat1_russian,cat1_salad,cat1_salvadoran,cat1_sandwiches,cat1_scandinavian,cat1_seafood,cat1_seafoodmarkets,cat1_senegalese,cat1_servicestations,cat1_shanghainese,cat1_sharedofficespaces,cat1_shavedice,cat1_shopping,cat1_shoppingcenters,cat1_sicilian,cat1_singaporean,cat1_skate_parks,cat1_soulfood,cat1_soup,cat1_southafrican,cat1_southern,cat1_spanish,cat1_spas,cat1_speakeasies,cat1_sportgoods,cat1_sports_clubs,cat1_sportsbars,cat1_sportswear,cat1_srilankan,cat1_steak,cat1_streetvendors,cat1_supperclubs,cat1_sushi,cat1_szechuan,cat1_tacos,cat1_taiwanese,cat1_tapas,cat1_tapasmallplates,cat1_tattoo,cat1_tea,cat1_teppanyaki,cat1_tex-mex,cat1_thai,cat1_theater,cat1_themedcafes,cat1_tikibars,cat1_tobaccoshops,cat1_tours,cat1_tradamerican,cat1_trinidadian,cat1_turkish,...,cat2_Arcades,cat2_Argentine,cat2_Armenian,cat2_Art Galleries,cat2_Art Museums,cat2_Asian Fusion,cat2_Australian,cat2_Austrian,cat2_Bagels,cat2_Bakeries,cat2_Bangladeshi,cat2_Barbeque,cat2_Barbers,cat2_Bars,cat2_Basque,cat2_Beer Bar,cat2_Beer Gardens,"cat2_Beer, Wine & Spirits",cat2_Belgian,cat2_Boating,cat2_Bookstores,cat2_Bowling,cat2_Boxing,cat2_Brasseries,cat2_Brazilian,cat2_Breakfast & Brunch,cat2_Breweries,cat2_British,cat2_Bubble Tea,cat2_Buffets,cat2_Burgers,cat2_Burmese,cat2_Butcher,cat2_Cafes,cat2_Cafeteria,cat2_Cajun/Creole,cat2_Cambodian,cat2_Candy Stores,cat2_Cantonese,cat2_Caribbean,cat2_Caterers,cat2_Champagne Bars,cat2_Cheese Shops,cat2_Cheesesteaks,cat2_Chicken Shop,cat2_Chicken Wings,cat2_Chinese,cat2_Chocolatiers & Shops,cat2_Cigar Bars,cat2_Cinema,cat2_Cocktail Bars,cat2_Coffee & Tea,cat2_Coffee Roasteries,cat2_Colombian,cat2_Comedy Clubs,cat2_Comfort Food,cat2_Community Service/Non-Profit,cat2_Convenience Stores,cat2_Cooking Classes,cat2_Cooking Schools,cat2_Creperies,cat2_Cuban,cat2_Cultural Center,cat2_Cupcakes,cat2_Custom Cakes,cat2_Czech,cat2_Dance Clubs,cat2_Day Spas,cat2_Delis,cat2_Desserts,cat2_Dim Sum,cat2_Diners,cat2_Distilleries,cat2_Dive Bars,cat2_Do-It-Yourself Food,cat2_Dominican,cat2_Donuts,cat2_Drugstores,cat2_Eatertainment,cat2_Egyptian,cat2_Empanadas,cat2_Ethiopian,cat2_Falafel,cat2_Farmers Market,cat2_Fast Food,cat2_Festivals,cat2_Filipino,cat2_Fish & Chips,cat2_Florists,cat2_Fondue,cat2_Food,cat2_Food Court,cat2_Food Delivery Services,cat2_Food Stands,cat2_Food Trucks,cat2_French,cat2_Fruits & Veggies,cat2_Gas Stations,cat2_Gastropubs,cat2_Gay Bars,cat2_Gelato,cat2_Georgian,cat2_German,cat2_Gift Shops,cat2_Gluten-Free,cat2_Golf,cat2_Golf Lessons,cat2_Greek,cat2_Grocery,cat2_Hair Salons,cat2_Haitian,cat2_Halal,cat2_Hawaiian,cat2_Health Markets,cat2_Herbs & Spices,cat2_Himalayan/Nepalese,cat2_Hobby Shops,cat2_Hong Kong Style Cafe,cat2_Hookah Bars,cat2_Hot Dogs,cat2_Hot Pot,cat2_Hotels,cat2_Hungarian,cat2_Ice Cream & Frozen Yogurt,cat2_Indian,cat2_Indonesian,cat2_International Grocery,cat2_Irish,cat2_Irish Pub,cat2_Italian,cat2_Izakaya,cat2_Japanese,cat2_Japanese Curry,cat2_Jazz & Blues,cat2_Juice Bars & Smoothies,cat2_Karaoke,cat2_Kebab,cat2_Kids Activities,cat2_Kitchen & Bath,cat2_Korean,cat2_Kosher,cat2_Landmarks & Historical Buildings,cat2_Laotian,cat2_Latin American,cat2_Laundromat,cat2_Lebanese,cat2_Local Services,cat2_Lounges,cat2_Macarons,cat2_Magicians,cat2_Malaysian,cat2_Meat Shops,cat2_Mediterranean,cat2_Mexican,cat2_Middle Eastern,cat2_Modern European,cat2_Mongolian,cat2_Moroccan,cat2_Museums,cat2_Music & DVDs,cat2_Music Venues,cat2_New Mexican Cuisine,cat2_Newspapers & Magazines,cat2_Nightlife,cat2_Noodles,cat2_Organic Stores,cat2_Pakistani,cat2_Pan Asian,cat2_Party & Event Planning,cat2_Pasta Shops,cat2_Patisserie/Cake Shop,cat2_Performing Arts,cat2_Persian/Iranian,cat2_Peruvian,cat2_Pet Stores,cat2_Piano Bars,cat2_Pizza,cat2_Poke,cat2_Polish,cat2_Pool Halls,cat2_Pop-Up Restaurants,cat2_Portuguese,cat2_Pretzels,cat2_Public Markets,cat2_Pubs,cat2_Puerto Rican,cat2_Ramen,cat2_Restaurants,cat2_Russian,cat2_Salad,cat2_Salvadoran,cat2_Sandwiches,cat2_Scandinavian,cat2_Seafood,cat2_Seafood Markets,cat2_Senegalese,cat2_Shanghainese,cat2_Shared Office Spaces,cat2_Shaved Ice,cat2_Shopping,cat2_Shopping Centers,cat2_Sicilian,cat2_Singaporean,cat2_Skate Parks,cat2_Soul Food,cat2_Soup,cat2_South African,cat2_Southern,cat2_Spanish,cat2_Speakeasies,cat2_Specialty Food,cat2_Sporting Goods,cat2_Sports Bars,cat2_Sports Clubs,cat2_Sports Wear,cat2_Sri Lankan,cat2_Steakhouses,cat2_Street Vendors,cat2_Supper Clubs,cat2_Sushi Bars,cat2_Szechuan,cat2_Tacos,cat2_Taiwanese,cat2_Tapas Bars,cat2_Tapas/Small Plates,cat2_Tattoo,cat2_Tea Rooms,cat2_Teppanyaki,cat2_Tex-Mex,cat2_Thai,cat2_Themed Cafes,cat2_Tiki Bars,cat2_Tobacco Shops,cat2_Tours,cat2_Trinidadian,cat2_Turkish,cat2_Tuscan,cat2_Ukrainian,cat2_Uzbek,cat2_Vape Shops,cat2_Vegan,cat2_Vegetarian,cat2_Venezuelan,cat2_Venues & Event Spaces,cat2_Vietnamese,cat2_Vinyl Records,cat2_Waffles,cat2_Whiskey Bars,cat2_Wine Bars,cat2_Wraps
0,379,4.0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,773,4.5,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,749,4.0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,443,4.0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,749,4.0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
yelp.shape

(19830, 519)

# Model Preparation

In [21]:
target = 'is_affluent'
X = yelp.drop(columns=target)
y = yelp[target]

# Train, Test Split
1. Since our dataset is large enough (n=10_000), we will reserve 20% (2k) of the data as test data. 
2. We will use 'stratify = y' to ensure the classes of y are balanced in both train and test.

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

### Check distributions of Y to see if we have balanced classes

In [23]:
# Check distribution of y, to see if y has unblanced classes
y.value_counts(normalize=True)

1    0.687998
0    0.312002
Name: is_affluent, dtype: float64

In [24]:
# Check distribution of y_train
y_train.value_counts(normalize=True)

1    0.687973
0    0.312027
Name: is_affluent, dtype: float64

In [25]:
# Check distribution of y_test
y_test.value_counts(normalize=True)

1    0.688099
0    0.311901
Name: is_affluent, dtype: float64

# Modeling: First Round

- In the modeling process, I will first present my baseline model.  
- I will then develop 4 major categories of models: Naive Bayes Models (Multinomial and Gaussian), Logistic Regression Models, KNN model, and SVM. 
- Based on the grid search's cross validation score, I will then fine tune the hyper-parameters for each model to achieve the optimal result. 

## Evaluation Metric

- Before I go into developing the models, I will briefly discuss the evaluation metric: AUC ROC. 

- "ROC (Receiver Operating Characteristic) is a probability curve and AUC represents degree or measure of separability. It tells how much model is capable of distinguishing between classes"[(reference)](https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5). The higher the ROC, and the better it seperates the two classes r/startrek and r/StarWars. The ROC curve is plotted with True Positive Rate on y-aixs, against the False Positive Rate on the x-axis.

- AUC (Area Under the Curve) measures the entire two-dimensional area underneath the entire ROC curve. The closer AUC to 1, the better the model.

- I choose AUC ROC since I aim to find the optimal model that can separate the two classes r/startrek and r/StarWars really well. 

- I will also consider the amount of time each model take to fit. If two models have similar AUC ROC scores, the one that runs faster will be the model we choose. 

##  Baseline Model

In [26]:
y_test.value_counts(normalize=True)[1]

0.6880988401412002

### Logistic Regression

In [27]:
pipe = Pipeline(steps=[
    ('ss', StandardScaler()), 
    ('lr', LogisticRegression(solver='liblinear'))
])

pipe_params ={
    'lr__C': [0.0001, 1],
    'lr__penalty' : ['l1', 'l2']
    }

gs_lr = GridSearchCV(pipe, 
                 pipe_params, 
                 cv=5,
                 scoring='roc_auc')


In [28]:
gs_lr.fit(X_train, y_train);

In [29]:
# Check the optimal params
gs_lr.best_params_

{'lr__C': 1, 'lr__penalty': 'l1'}

In [30]:
# Train score
gs_lr.score(X_train, y_train)

0.9253984318167936

In [31]:
# Cross val score on the whole dataset 
cross_val_score(gs_lr.best_estimator_, X, y, scoring='roc_auc', cv=5).mean()

0.8973343350158386

In [32]:
gs_lr.score(X_test, y_test)

0.9231005757792363