# Data Cleaning

In this part we clean the data and extract new features from the existing ones.

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import re
import os

In [8]:
# Load the raw export with every column read as text (dtype=str), so no value
# is coerced to a number on the way in.
# `parse_dates=True` and `infer_datetime_format=True` were dropped on purpose:
# without an `index_col` there is no index to parse, so they never had any
# effect, and `infer_datetime_format` is deprecated in pandas 2.x.
df = pd.read_csv(
    "../data/data.csv",
    encoding="utf_8",
    dtype=str,
    low_memory=False,
)
# "Unnamed: 0" is presumably the index column left behind by an earlier
# `to_csv` call (TODO confirm); it is not used anywhere below.
df = df.drop(columns="Unnamed: 0")


In [9]:
# Read the "Cuisine" column of the categories file, strip trailing whitespace
# from every entry and keep the result as a plain Python list.
categ = (
    pd.read_csv("../data/categories.txt")["Cuisine"]
      .str.rstrip()
      .tolist()
)

In [10]:
# Keep only the rows whose `categories` string mentions at least one cuisine.
# - Each cuisine name is escaped, so regex metacharacters inside a name
#   (parentheses, "+", ...) are matched literally instead of being read as
#   regex syntax.
# - `na=False` makes rows with a missing `categories` value count as
#   "no match"; otherwise the mask would contain NaN and the boolean
#   selection would fail.
df = df[df.categories.str.contains('|'.join(map(re.escape, categ)), na=False)]

In [11]:
# Build a per-business indicator table of categories:
#   1. split the comma-separated `categories` string into one label per row,
#   2. one-hot encode the labels and add them up per `business_id`,
#   3. keep only the categories whose column total is at least 500.
temp = (
    pd.get_dummies(
        df.set_index('business_id')['categories']
          .str.split(', ', expand=True)
          .stack()
    )
    .groupby(level=0)
    .sum()
    .loc[:, lambda counts: counts.sum() >= 500]
)

In [12]:
# Attach the category indicator columns to each business row (inner join of
# `business_id` against the index of `temp`), then drop the raw `categories`
# text column that the indicators replace.
result = (
    df.merge(temp, how='inner', left_on='business_id', right_index=True, sort=False)
      .drop(columns=['categories'])
)

In [13]:
# Inspect the distinct values of the `state` column.
result.loc[:, "state"].unique()

array(['ON', 'NC', 'AZ', 'OH', 'NV', 'PA', 'AB', 'QC', 'WI', 'IL', 'NY',
       'SC', 'NM', 'WA', 'TX', 'BC', 'XWY', 'AR', 'FL', 'XGL'],
      dtype=object)

In [14]:
# Count how many rows fall in each city, most frequent first.
result["city"].value_counts()

Toronto                             6879
Las Vegas                           5849
Phoenix                             3594
Montréal                            3084
Calgary                             2465
Charlotte                           2400
Pittsburgh                          2118
Scottsdale                          1441
Cleveland                           1249
Mississauga                         1239
Mesa                                1061
Madison                             1008
Tempe                                944
Henderson                            810
Chandler                             788
Markham                              730
Glendale                             640
Gilbert                              527
Scarborough                          444
Richmond Hill                        432
Brampton                             425
North York                           411
Vaughan                              384
Champaign                            379
Peoria          

## New features

Add the length of the `name` column as a new feature.

In [22]:
# Number of characters in each `name` value; missing names stay missing.
result['name_length'] = result.loc[:, 'name'].str.len()

## Save the clean data

In [23]:
# Persist the cleaned frame. `to_csv` writes the index as the first column by
# default, so readers of this file have to drop or ignore that column.
result.to_csv(path_or_buf='../data/data_clean.csv')