# Restaurants Feature Selection

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Import the data
df_business = pd.read_csv(r"C:\Users\Utkarsh\Desktop\INSY-662\OneDrive_2022-10-07\Yelp CSV\yelp_business.csv")
df_attributes = pd.read_csv(r"C:\Users\Utkarsh\Desktop\INSY-662\OneDrive_2022-10-07\Yelp CSV\yelp_business_attributes.csv")

In [3]:
# Replace all 'Na' and 'None' with NULL
df_business = df_business.replace({'Na': np.nan}, regex = True)
df_attributes = df_attributes.replace({'Na': np.nan}, regex = True)

In [4]:
df_rating = df_business[['business_id', 'categories', 'stars', 'review_count']]
df_restaurants = df_rating[df_rating['categories'].str.contains('Restaurants|Food') == True]

#### df_other
ByAppointmentOnly, BusinessAcceptsCreditCards, RestaurantsPriceRange2, GoodFodKids, WheelchairAccessible, Alcohol, HasTV, NoiseLevel, RestaurantsAttire, RestaurantsGoodForGroups, WiFi, RestaurantsReservations, RestaurantsTakeOut, RestaurantsTableService, OutdoorSeating, RestaurantsDelivery, Smoking, DriveThru, DogsAllowed, RestaurantsCounterService
<hr>

#### df_dietary
DietaryRestrictions_dairy-free, DietaryRestrictions_gluten-free, DietaryRestrictions_vegan, DietaryRestrictions_kosher, DietaryRestrictions_halal, DietaryRestrictions_soy-free, DietaryRestrictions_vegetarian
<hr>

#### df_ambience
Ambience_romantic, Ambience_intimate, Ambience_classy, Ambience_hipster, Ambience_divey, Ambience_touristy, Ambience_trendy, Ambience_upscale, Ambience_casual
<hr>

#### df_parking
BusinessParking_garage, BusinessParking_street, BusinessParking_validated, BusinessParking_lot, BusinessParking_valet, BikeParking
<hr>

#### df_good
GoodForMeal_dessert, GoodForMeal_latenight, GoodForMeal_lunch, GoodForMeal_dinner, GoodForMeal_breakfast, GoodForMeal_brunch

In [5]:
df_dietary = df_attributes[['business_id', 'DietaryRestrictions_dairy-free', 'DietaryRestrictions_gluten-free',
                            'DietaryRestrictions_vegan', 'DietaryRestrictions_kosher',
                            'DietaryRestrictions_halal', 'DietaryRestrictions_soy-free',
                            'DietaryRestrictions_vegetarian']]
df_ambience = df_attributes[['business_id', 'Ambience_romantic', 'Ambience_intimate', 'Ambience_classy',
                            'Ambience_hipster', 'Ambience_divey', 'Ambience_touristy',
                            'Ambience_trendy', 'Ambience_upscale', 'Ambience_casual']]
df_parking = df_attributes[['business_id', 'BusinessParking_garage', 'BusinessParking_street',
                            'BusinessParking_validated', 'BusinessParking_lot',
                            'BusinessParking_valet', 'BikeParking']]
df_good = df_attributes[['business_id', 'GoodForMeal_dessert', 'GoodForMeal_latenight', 'GoodForMeal_lunch',
                         'GoodForMeal_dinner', 'GoodForMeal_breakfast', 'GoodForMeal_brunch']]
df_other = df_attributes[['business_id', 'ByAppointmentOnly', 'BusinessAcceptsCreditCards', 'RestaurantsPriceRange2',
                          'GoodForKids', 'WheelchairAccessible', 'Alcohol',
                          'HasTV', 'NoiseLevel', 'RestaurantsAttire',
                          'RestaurantsGoodForGroups', 'WiFi', 'RestaurantsReservations',
                          'RestaurantsTakeOut', 'RestaurantsTableService', 'OutdoorSeating',
                          'RestaurantsDelivery', 'Smoking', 'DriveThru',
                          'DogsAllowed', 'RestaurantsCounterService']]

## Recursive Feature Elimination on Other

In [6]:
feat_other = pd.merge(df_restaurants, df_other, on = 'business_id', how = 'inner')
pd.set_option('display.max_columns', None)

# We need to dummify the variables and to determine how, we can use:
# feat_other['GoodForKids'].value_counts()

# The entire RestaurantsPriceRange2 column is null so we can drop it
# We should also drop HasTV because the categories are full_bar and beer_and_wine
# There are no 'True' in RestaurantsGoodForGroups
# There is only one row that is not null in RestaurantsCounterService
feat_other = feat_other.drop(columns = ['RestaurantsPriceRange2', 'HasTV', 'RestaurantsGoodForGroups', 'RestaurantsCounterService', 'RestaurantsAttire', 'RestaurantsReservations'])

# GoodForKids

# ByAppointmentOnly, BusinessAcceptsCreditCards, WheelchairAccessible, Alcohol, NoiseLevel, RestaurantsTakeOut
# RestaurantsTableService, OutdoorSeating, RestaurantsDelivery, Smoking, DogsAllowed
feat_other = feat_other.replace({np.NaN : 0, 'False' : 0, 'True' : 1}, regex = True)

# DriveThru
feat_other['DriveThru'] = feat_other['DriveThru'].replace({np.NaN : 0, 'outdoor' : 1, 'yes' : 1, 'no': 0}, regex = True)

In [7]:
feat_other['stars']

0         3.5
1         4.0
2         5.0
3         4.5
4         4.0
         ... 
290444    2.5
290445    3.0
290446    3.0
290447    4.0
290448    3.0
Name: stars, Length: 290449, dtype: float64

In [8]:
from sklearn import preprocessing
from sklearn import utils

X = feat_other.iloc[:, 4:]
y = feat_other['stars']

lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y)

np.unique(y_transformed)

array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64)

In [9]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 5000)
rfe = RFE(lr, n_features_to_select = 1)
model1 = rfe.fit(X,y_transformed)

In [10]:
pd.DataFrame(list(zip(X.columns,model1.ranking_)), columns = ['predictor','ranking']).sort_values(by = ['ranking'])

Unnamed: 0,predictor,ranking
13,DogsAllowed,1
6,WiFi,2
7,RestaurantsTakeOut,3
11,Smoking,4
5,NoiseLevel,5
10,RestaurantsDelivery,6
12,DriveThru,7
3,WheelchairAccessible,8
1,BusinessAcceptsCreditCards,9
0,ByAppointmentOnly,10


## Recursive Feature Elimination on Dietary

In [11]:
feat_dietary = pd.merge(df_restaurants, df_dietary, on = 'business_id', how = 'inner')
pd.set_option('display.max_columns', None)

# The entire DietaryRestrictions_dairy-free column is null so we can drop it
# There is only three, four, and five rows that are True in DietaryRestrictions_halal, DietaryRestrictions_soy-free, DietaryRestrictions_vegetarian
feat_dietary = feat_dietary.drop(columns = ['DietaryRestrictions_dairy-free', 'DietaryRestrictions_halal', 'DietaryRestrictions_soy-free', 'DietaryRestrictions_vegetarian'])

# DietaryRestrictions_gluten-free, DietaryRestrictions_vegan, DietaryRestrictions_kosher
feat_dietary = feat_dietary.replace({np.NaN : 0, 'False' : 0, 'True' : 1}, regex = True)

In [12]:
from sklearn import preprocessing
from sklearn import utils

X = feat_dietary.iloc[:, 4:]
y = feat_dietary['stars']

lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y)

np.unique(y_transformed)

array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64)

In [13]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 5000)
rfe = RFE(lr, n_features_to_select = 1)
model2 = rfe.fit(X,y_transformed)

In [14]:
pd.DataFrame(list(zip(X.columns,model2.ranking_)), columns = ['predictor','ranking']).sort_values(by = ['ranking'])

Unnamed: 0,predictor,ranking
2,DietaryRestrictions_kosher,1
0,DietaryRestrictions_gluten-free,2
1,DietaryRestrictions_vegan,3


## Recursive Feature Elimination on Ambience

In [15]:
feat_dietary = pd.merge(df_restaurants, df_dietary, on = 'business_id', how = 'inner')
pd.set_option('display.max_columns', None)

# The entire DietaryRestrictions_dairy-free column is null so we can drop it
# There is only six, zero, zero, zero, zero, zero, zero, zero, zero rows that are True in Ambience_romantic, Ambience_intimate, Ambience_classy, Ambience_hipster, Ambience_divey, Ambience_touristy, Ambience_trendy, Ambience_upscale, Ambience_casual 

#Thus, Ambience data is not useful

## Recursive Feature Elimination on Parking

In [16]:
feat_parking = pd.merge(df_restaurants, df_parking, on = 'business_id', how = 'inner')
pd.set_option('display.max_columns', None)


feat_parking = feat_parking.replace({np.NaN : 0, 'False' : 0, 'True' : 1}, regex = True)

In [17]:
from sklearn import preprocessing
from sklearn import utils

X = feat_parking.iloc[:, 4:]
y = feat_parking['stars']

lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y)

np.unique(y_transformed)

array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64)

In [18]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 5000)
rfe = RFE(lr, n_features_to_select = 1)
model3 = rfe.fit(X,y_transformed)

In [19]:
pd.DataFrame(list(zip(X.columns,model3.ranking_)), columns = ['predictor','ranking']).sort_values(by = ['ranking'])

Unnamed: 0,predictor,ranking
3,BusinessParking_lot,1
1,BusinessParking_street,2
4,BusinessParking_valet,3
2,BusinessParking_validated,4
5,BikeParking,5
0,BusinessParking_garage,6


## Recursive Feature Elimination on Good

In [22]:
feat_good = pd.merge(df_restaurants, df_good, on = 'business_id', how = 'inner')
pd.set_option('display.max_columns', None)


feat_good = feat_good.replace({np.NaN : 0, 'False' : 0, 'True' : 1}, regex = True)

In [23]:
feat_good

Unnamed: 0,business_id,categories,stars,review_count,GoodForMeal_dessert,GoodForMeal_latenight,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_breakfast,GoodForMeal_brunch
0,PfOCPjBrlQAnz__NXj9h_w,American (New);Nightlife;Bars;Sandwiches;Ameri...,3.5,116,0,0,0,0,0,0
1,o9eMRCWt5PkpLDE0gOPtcQ,Italian;Restaurants,4.0,5,0,0,0,0,0,0
2,EsMcGiZaQuG1OOvL9iUFug,Coffee & Tea;Ice Cream & Frozen Yogurt;Food,5.0,15,0,0,0,0,0,0
3,XOSRcvtaKc_Q5H1SAzN20A,Breakfast & Brunch;Gluten-Free;Coffee & Tea;Fo...,4.5,3,0,0,0,0,0,0
4,xcgFnd-MwkZeO5G2HQ0gAQ,Bakeries;Bagels;Food,4.0,38,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
290444,pUhU5ohYv65g8B47dTXAKA,Coffee & Tea;Food,2.5,35,0,0,0,0,0,0
290445,FxtrmM1a_0jcNnxiEF_Emg,Restaurants;Beer Garden,3.0,21,0,0,0,0,0,0
290446,yJH_gq99aEj8xtyBaAQH8Q,German;Restaurants,3.0,52,0,0,0,0,0,0
290447,UdEmYOnk2iJDY9lpEPAlJQ,Pizza;Event Planning & Services;Italian;Catere...,4.0,374,0,0,0,0,0,0


In [24]:
from sklearn import preprocessing
from sklearn import utils

X = feat_good.iloc[:, 4:]
y = feat_good['stars']

lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y)

np.unique(y_transformed)

array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64)

In [25]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 5000)
rfe = RFE(lr, n_features_to_select = 1)
model4 = rfe.fit(X,y_transformed)

In [26]:
pd.DataFrame(list(zip(X.columns,model4.ranking_)), columns = ['predictor','ranking']).sort_values(by = ['ranking'])

Unnamed: 0,predictor,ranking
5,GoodForMeal_brunch,1
1,GoodForMeal_latenight,2
2,GoodForMeal_lunch,3
3,GoodForMeal_dinner,4
4,GoodForMeal_breakfast,5
0,GoodForMeal_dessert,6
