# Data Cleaning

In this part we clean the data and extract new features from the existing ones.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import re
import os
import ast

In [30]:
# Raise pandas' display limits so long and wide frames are rendered in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [2]:
# Load the raw business table. dtype='unicode' reads every column as text, so
# numeric-looking fields stay strings until they are converted explicitly.
# The deprecated `infer_datetime_format` flag was dropped: it has no effect in
# pandas >= 2.0 and nothing here is parsed as a date (parse_dates=True only
# targets the index, and no index column is requested).
df = pd.read_csv(
    "../data/data.csv",
    encoding='utf_8',
    dtype='unicode',
    parse_dates=True,
    low_memory=False,
)
# NOTE(review): "Unnamed: 0" is presumably a row index written by an earlier
# to_csv call - confirm against how data.csv was produced.
df = df.drop(columns="Unnamed: 0")


In [3]:
# Read the cuisine list and strip trailing whitespace from each name;
# `categ` ends up as a plain Python list of the "Cuisine" column's values.
cuisine_table = pd.read_csv("../data/categories.txt")
categ = cuisine_table['Cuisine'].str.rstrip().tolist()

In [4]:
# Keep only businesses whose category string mentions at least one cuisine.
# Each cuisine name is escaped so regex metacharacters in it (for example the
# parentheses in a name such as "American (New)") are matched literally, and
# na=False treats rows with a missing category string as non-matching instead
# of putting NaN into the boolean mask.
cuisine_pattern = '|'.join(re.escape(cuisine) for cuisine in categ)
df = df[df['categories'].str.contains(cuisine_pattern, na=False)]

In [5]:
def str_to_dict(x):
    """Turn the text form of a Python literal (typically a dict) into an object.

    Parameters
    ----------
    x : str, float, dict or None
        A cell value. Floats (this is how pandas represents a missing value,
        NaN) and None are treated as "no data". A value that is already a dict
        is passed through, so applying the function twice is harmless.

    Returns
    -------
    dict or object
        ``{}`` for missing values, the dict itself for dict input, otherwise
        whatever ``ast.literal_eval`` produces for the string.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is a string that is not a valid Python literal.
    """
    # isinstance (rather than an exact type comparison) also covers float
    # subclasses such as numpy.float64.
    if x is None or isinstance(x, float):
        return {}
    if isinstance(x, dict):
        return x
    return ast.literal_eval(x)

In [10]:
# Parse each "hours" cell into a dict, spread the dict keys into their own
# columns, and swap the original column for the expanded ones.
hours_parsed = df['hours'].apply(str_to_dict)
hours_wide = hours_parsed.apply(pd.Series)
df = pd.concat([df.drop(columns=['hours']), hours_wide], axis=1)

In [11]:
# Build one indicator column per category, indexed by business_id, and keep
# only the categories whose column total reaches the threshold.
MIN_CATEGORY_TOTAL = 500

category_tokens = (
    df.set_index('business_id')['categories']
    .str.split(', ', expand=True)
    .stack()
)
category_counts = pd.get_dummies(category_tokens).groupby(level=0).sum()
temp = category_counts.loc[:, category_counts.sum() >= MIN_CATEGORY_TOTAL]

In [38]:
# Attach the category indicator columns to the business rows (matching
# business_id against the index of `temp`), then drop the raw category string.
result = (
    df.merge(temp, left_on='business_id', right_index=True,
             how='inner', sort=False)
    .drop(columns=['categories'])
)

In [36]:
# Inspect the raw Monday opening hours; values are "open-close" strings.
result['Monday']

0             9:0-0:0
1         17:30-21:30
8             7:0-0:0
9            10:0-1:0
10           10:0-0:0
             ...     
108164     11:30-22:0
108165        0:0-0:0
108166      11:0-22:0
108171      12:0-21:0
108175        0:0-0:0
Name: Monday, Length: 51176, dtype: object

In [39]:
# Parse each "Music" cell into a dict, spread its keys into separate columns,
# and replace the original column with the expanded ones.
music_parsed = result['Music'].apply(str_to_dict)
music_wide = music_parsed.apply(pd.Series)
result = pd.concat([result.drop(columns=['Music']), music_wide], axis=1)

In [84]:
# Preview the first rows.
# NOTE(review): the saved output of this cell already lists the Anymusic and
# Opening_*/Closing_* columns, which are only created by cells further down,
# so this cell was evidently executed out of order.
result.head(n=5)

Unnamed: 0,address,business_id,city,is_open,latitude,longitude,name,postal_code,review_count,stars,state,Alcohol,Ambience,BikeParking,BusinessAcceptsCreditCards,BusinessParking,Caters,GoodForKids,GoodForMeal,HasTV,NoiseLevel,OutdoorSeating,RestaurantsAttire,RestaurantsDelivery,RestaurantsGoodForGroups,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsTableService,RestaurantsTakeOut,WheelchairAccessible,WiFi,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,American (New),American (Traditional),Arts & Entertainment,Asian Fusion,Bakeries,Barbeque,Bars,Beer,Breakfast & Brunch,Buffets,Burgers,Cafes,Canadian (New),Caribbean,Caterers,Chicken Wings,Chinese,Cocktail Bars,Coffee & Tea,Comfort Food,Delis,Desserts,Diners,Ethnic Food,Event Planning & Services,Fast Food,Food,Food Delivery Services,French,Gastropubs,Gluten-Free,Greek,Grocery,Halal,Hot Dogs,Ice Cream & Frozen Yogurt,Indian,Italian,Japanese,Juice Bars & Smoothies,Korean,Latin American,Lounges,Mediterranean,Mexican,Middle Eastern,Nightlife,Pizza,Pubs,Restaurants,Salad,Sandwiches,Seafood,Soup,Specialty Food,Sports Bars,Steakhouses,Sushi Bars,Tex-Mex,Thai,Vegan,Vegetarian,Vietnamese,Wine & Spirits,Wine Bars,Anymusic,Opening_Mon,Closing_Mon,Opening_Tue,Closing_Tue,Opening_Wed,Closing_Wed,Opening_Thu,Closing_Thu,Opening_Fri,Closing_Fri,Opening_Sat,Closing_Sat,Opening_Sun,Closing_Sun
0,30 Eglinton Avenue W,QXAEGFB4oINsVuTFxEYKFQ,Mississauga,1,43.6054989743,-79.652288909,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON,u'full_bar',"{'romantic': False, 'intimate': False, 'classy...",False,,"{'garage': False, 'street': False, 'validated'...",True,True,"{'dessert': False, 'latenight': False, 'lunch'...",False,u'loud',False,u'casual',False,True,2,True,True,True,,u'no',9:0-0:0,9:0-0:0,9:0-0:0,9:0-0:0,9:0-1:0,9:0-1:0,9:0-0:0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,False,9:0,0:0,9:0,0:0,9:0,0:0,9:0,0:0,9:0,1:0,9:0,1:0,9:0,0:0
1,"10110 Johnston Rd, Ste 15",gnKjwL_1w79qoiV3IC_xQQ,Charlotte,1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC,u'beer_and_wine',"{'romantic': False, 'intimate': False, 'touris...",True,True,"{'garage': False, 'street': False, 'validated'...",False,True,"{'dessert': False, 'latenight': False, 'lunch'...",True,u'average',False,'casual',False,True,2,True,True,True,,u'no',17:30-21:30,,17:30-21:30,17:30-21:30,17:30-22:0,17:30-22:0,17:30-21:0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,False,17:30,21:30,,,17:30,21:30,17:30,21:30,17:30,22:0,17:30,22:0,17:30,21:0
8,2450 E Indian School Rd,1Dfx3zM-rW4n-31KeC8sJg,Phoenix,1,33.4951941,-112.0285876,Taco Bell,85016,18,3.0,AZ,u'none',"{'romantic': False, 'intimate': False, 'touris...",,True,"{'garage': False, 'street': False, 'validated'...",,True,,False,,False,u'casual',False,True,1,False,,True,,u'no',7:0-0:0,7:0-0:0,7:0-0:0,7:0-1:0,7:0-1:0,7:0-1:0,7:0-0:0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,False,7:0,0:0,7:0,0:0,7:0,0:0,7:0,1:0,7:0,1:0,7:0,1:0,7:0,0:0
9,"119 Landings Dr, Ste 101",5t3KVdMnFgAYmSl1wYLhmA,Mooresville,1,35.5274098057,-80.8680032061,The Kilted Buffalo Langtree,28117,9,3.5,NC,'beer_and_wine',"{'touristy': False, 'hipster': False, 'romanti...",True,True,"{'garage': False, 'street': False, 'validated'...",,False,,True,'average',True,,,True,1,True,,,,'free',10:0-1:0,10:0-1:0,10:0-1:0,10:0-1:0,10:0-1:0,10:0-1:0,12:0-1:0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,10:0,1:0,10:0,1:0,10:0,1:0,10:0,1:0,10:0,1:0,10:0,1:0,12:0,1:0
10,5981 Andrews Rd,fweCYi8FmbJXHCqLnwuk8w,Mentor-on-the-Lake,1,41.70852,-81.359556,Marco's Pizza,44060,16,4.0,OH,u'none',,True,True,,,True,,,,False,u'casual',True,True,2,False,,True,,,10:0-0:0,10:0-0:0,10:0-0:0,10:0-0:0,10:0-1:0,10:0-1:0,10:0-0:0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,10:0,0:0,10:0,0:0,10:0,0:0,10:0,0:0,10:0,1:0,10:0,1:0,10:0,0:0


In [41]:
# Columns (taken from the expanded "Music" dicts) that each flag one kind of music.
MUSIC_FLAG_COLUMNS = ('dj', 'background_music', 'jukebox', 'live', 'video', 'karaoke')


def musicfunc(row):
    """Report whether a row has at least one music flag set to True.

    Parameters
    ----------
    row : pandas.Series or dict
        A mapping that may contain any of ``MUSIC_FLAG_COLUMNS``. A flag that
        is absent is treated as not set instead of raising ``KeyError``; the
        columns come from dicts with varying keys, so any of them may be
        missing.

    Returns
    -------
    bool
        True if any flag equals True, otherwise False (missing, NaN, None and
        False all count as not set).
    """
    # `== True` (not `is True`) so numpy booleans coming out of a DataFrame
    # row are recognised as well.
    return any(row.get(column) == True for column in MUSIC_FLAG_COLUMNS)
# Flag every business that offers any kind of music (evaluated row by row).
result['Anymusic'] = result.apply(musicfunc, axis=1)
   

In [28]:
# Count how many businesses do and do not offer any music.
anymusic_flags = result['Anymusic']
anymusic_flags.value_counts()

False    49689
True      1487
Name: Anymusic, dtype: int64

In [44]:
# Drop every column that has more missing values than the allowed maximum.
MAX_MISSING_PER_COLUMN = 47000

missing_per_column = result.isnull().sum(axis=0)
result = result.loc[:, missing_per_column <= MAX_MISSING_PER_COLUMN]

In [133]:
import datetime  # not used by the statements in this cell

# Split each day's "open-close" string on '-' into an Opening_<day> and a
# Closing_<day> column.
DAY_COLUMN_SUFFIXES = (
    ('Monday', 'Mon'),
    ('Tuesday', 'Tue'),
    ('Wednesday', 'Wed'),
    ('Thursday', 'Thu'),
    ('Friday', 'Fri'),
    ('Saturday', 'Sat'),
    ('Sunday', 'Sun'),
)
for day_column, suffix in DAY_COLUMN_SUFFIXES:
    open_close = result[day_column].str.split('-', expand=True)
    result[['Opening_' + suffix, 'Closing_' + suffix]] = open_close


In [134]:
# Convert every Opening_*/Closing_* column from "H:M" text to datetimes.
time_columns = [
    column for column in result.columns
    if column.startswith(('Opening', 'Closing'))
]
for column in time_columns:
    result[column] = pd.to_datetime(result[column], format='%H:%M')

In [146]:
def working_hours(name, clhrs, ophrs, frame=None):
    """Add a column holding the hours between an opening and a closing time.

    The difference ``closing - opening`` is reduced to its seconds component,
    which is always within one day, so a closing time that falls after
    midnight (closing earlier on the clock than opening) wraps around
    correctly: 09:00 -> 00:00 gives 15.0. Rows where either time is missing
    get NaN. Identical opening and closing times give 0.0.

    The computation is vectorised; the previous row-by-row implementation
    relied on ``DataFrame.set_value``, which no longer exists in pandas >= 1.0.

    Parameters
    ----------
    name : str
        Name of the column to create (or overwrite).
    clhrs : str
        Name of the datetime column with the closing time.
    ophrs : str
        Name of the datetime column with the opening time.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``result``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if frame is None:
        frame = result
    elapsed = frame[clhrs] - frame[ophrs]
    frame[name] = elapsed.dt.seconds / 3600

In [150]:
# Create one WD_<day> column per weekday from that day's closing/opening pair.
for day in ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'):
    working_hours('WD_' + day, 'Closing_' + day, 'Opening_' + day)

  


In [152]:
# Reduce every Opening_*/Closing_* datetime column to its time-of-day part.
time_columns = [
    column for column in result.columns
    if column.startswith(('Opening', 'Closing'))
]
for column in time_columns:
    result[column] = result[column].dt.time

In [153]:
# Preview the first rows, now including the WD_* working-hours columns.
result.head(n=5)

Unnamed: 0,address,business_id,city,is_open,latitude,longitude,name,postal_code,review_count,stars,state,Alcohol,Ambience,BikeParking,BusinessAcceptsCreditCards,BusinessParking,Caters,GoodForKids,GoodForMeal,HasTV,NoiseLevel,OutdoorSeating,RestaurantsAttire,RestaurantsDelivery,RestaurantsGoodForGroups,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsTableService,RestaurantsTakeOut,WheelchairAccessible,WiFi,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,American (New),American (Traditional),Arts & Entertainment,Asian Fusion,Bakeries,Barbeque,Bars,Beer,Breakfast & Brunch,Buffets,Burgers,Cafes,Canadian (New),Caribbean,Caterers,Chicken Wings,Chinese,Cocktail Bars,Coffee & Tea,Comfort Food,Delis,Desserts,Diners,Ethnic Food,Event Planning & Services,Fast Food,Food,Food Delivery Services,French,Gastropubs,Gluten-Free,Greek,Grocery,Halal,Hot Dogs,Ice Cream & Frozen Yogurt,Indian,Italian,Japanese,Juice Bars & Smoothies,Korean,Latin American,Lounges,Mediterranean,Mexican,Middle Eastern,Nightlife,Pizza,Pubs,Restaurants,Salad,Sandwiches,Seafood,Soup,Specialty Food,Sports Bars,Steakhouses,Sushi Bars,Tex-Mex,Thai,Vegan,Vegetarian,Vietnamese,Wine & Spirits,Wine Bars,Anymusic,Opening_Mon,Closing_Mon,Opening_Tue,Closing_Tue,Opening_Wed,Closing_Wed,Opening_Thu,Closing_Thu,Opening_Fri,Closing_Fri,Opening_Sat,Closing_Sat,Opening_Sun,Closing_Sun,WD_Wed,WD_Mon,WD_Tue,WD_Thu,WD_Fri,WD_Sat,WD_Sun
0,30 Eglinton Avenue W,QXAEGFB4oINsVuTFxEYKFQ,Mississauga,1,43.6054989743,-79.652288909,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON,u'full_bar',"{'romantic': False, 'intimate': False, 'classy...",False,,"{'garage': False, 'street': False, 'validated'...",True,True,"{'dessert': False, 'latenight': False, 'lunch'...",False,u'loud',False,u'casual',False,True,2,True,True,True,,u'no',9:0-0:0,9:0-0:0,9:0-0:0,9:0-0:0,9:0-1:0,9:0-1:0,9:0-0:0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,False,09:00:00,00:00:00,09:00:00,00:00:00,09:00:00,00:00:00,09:00:00,00:00:00,09:00:00,01:00:00,09:00:00,01:00:00,09:00:00,00:00:00,15.0,15.0,15.0,15.0,16.0,16.0,15.0
1,"10110 Johnston Rd, Ste 15",gnKjwL_1w79qoiV3IC_xQQ,Charlotte,1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC,u'beer_and_wine',"{'romantic': False, 'intimate': False, 'touris...",True,True,"{'garage': False, 'street': False, 'validated'...",False,True,"{'dessert': False, 'latenight': False, 'lunch'...",True,u'average',False,'casual',False,True,2,True,True,True,,u'no',17:30-21:30,,17:30-21:30,17:30-21:30,17:30-22:0,17:30-22:0,17:30-21:0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,False,17:30:00,21:30:00,NaT,NaT,17:30:00,21:30:00,17:30:00,21:30:00,17:30:00,22:00:00,17:30:00,22:00:00,17:30:00,21:00:00,4.0,4.0,,4.0,4.5,4.5,3.5
8,2450 E Indian School Rd,1Dfx3zM-rW4n-31KeC8sJg,Phoenix,1,33.4951941,-112.0285876,Taco Bell,85016,18,3.0,AZ,u'none',"{'romantic': False, 'intimate': False, 'touris...",,True,"{'garage': False, 'street': False, 'validated'...",,True,,False,,False,u'casual',False,True,1,False,,True,,u'no',7:0-0:0,7:0-0:0,7:0-0:0,7:0-1:0,7:0-1:0,7:0-1:0,7:0-0:0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,False,07:00:00,00:00:00,07:00:00,00:00:00,07:00:00,00:00:00,07:00:00,01:00:00,07:00:00,01:00:00,07:00:00,01:00:00,07:00:00,00:00:00,17.0,17.0,17.0,18.0,18.0,18.0,17.0
9,"119 Landings Dr, Ste 101",5t3KVdMnFgAYmSl1wYLhmA,Mooresville,1,35.5274098057,-80.8680032061,The Kilted Buffalo Langtree,28117,9,3.5,NC,'beer_and_wine',"{'touristy': False, 'hipster': False, 'romanti...",True,True,"{'garage': False, 'street': False, 'validated'...",,False,,True,'average',True,,,True,1,True,,,,'free',10:0-1:0,10:0-1:0,10:0-1:0,10:0-1:0,10:0-1:0,10:0-1:0,12:0-1:0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,10:00:00,01:00:00,10:00:00,01:00:00,10:00:00,01:00:00,10:00:00,01:00:00,10:00:00,01:00:00,10:00:00,01:00:00,12:00:00,01:00:00,15.0,15.0,15.0,15.0,15.0,15.0,13.0
10,5981 Andrews Rd,fweCYi8FmbJXHCqLnwuk8w,Mentor-on-the-Lake,1,41.70852,-81.359556,Marco's Pizza,44060,16,4.0,OH,u'none',,True,True,,,True,,,,False,u'casual',True,True,2,False,,True,,,10:0-0:0,10:0-0:0,10:0-0:0,10:0-0:0,10:0-1:0,10:0-1:0,10:0-0:0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,10:00:00,00:00:00,10:00:00,00:00:00,10:00:00,00:00:00,10:00:00,00:00:00,10:00:00,01:00:00,10:00:00,01:00:00,10:00:00,00:00:00,14.0,14.0,14.0,14.0,15.0,15.0,14.0


In [14]:
# Number of businesses per city, most frequent first.
result['city'].value_counts()

Toronto                             6879
Las Vegas                           5849
Phoenix                             3594
MontrГ©al                           3084
Calgary                             2465
Charlotte                           2400
Pittsburgh                          2118
Scottsdale                          1441
Cleveland                           1249
Mississauga                         1239
Mesa                                1061
Madison                             1008
Tempe                                944
Henderson                            810
Chandler                             788
Markham                              730
Glendale                             640
Gilbert                              527
Scarborough                          444
Richmond Hill                        432
Brampton                             425
North York                           411
Vaughan                              384
Champaign                            379
Peoria          

## New features

Add the length of the business name as a feature.

In [22]:
# Number of characters in each business name (missing names stay missing).
name_lengths = result['name'].str.len()
result = result.assign(name_length=name_lengths)

## Save the clean data

In [23]:
# Write the cleaned table to disk. The row index is written as well (pandas'
# default), so it appears as an extra unnamed first column when read back.
CLEAN_DATA_PATH = '../data/data_clean.csv'
result.to_csv(CLEAN_DATA_PATH)