# Initial Data Cleaning: Google Data Set

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Initial-Data-Cleaning:-Google-Data-Set" data-toc-modified-id="Initial-Data-Cleaning:-Google-Data-Set-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Initial Data Cleaning: Google Data Set</a></span><ul class="toc-item"><li><span><a href="#Data-Dictionary" data-toc-modified-id="Data-Dictionary-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Data Dictionary</a></span></li><li><span><a href="#Import-libraries" data-toc-modified-id="Import-libraries-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Import libraries</a></span></li><li><span><a href="#Read-in-data" data-toc-modified-id="Read-in-data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Read in data</a></span></li><li><span><a href="#Initial-Data-Cleaning" data-toc-modified-id="Initial-Data-Cleaning-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Initial Data Cleaning</a></span><ul class="toc-item"><li><span><a href="#Drop-columns:-'Unnamed:-0',-'icon',-and-'photos'" data-toc-modified-id="Drop-columns:-'Unnamed:-0',-'icon',-and-'photos'-1.4.1"><span class="toc-item-num">1.4.1&nbsp;&nbsp;</span>Drop columns: 'Unnamed: 0', 'icon', and 'photos'</a></span></li><li><span><a href="#Get-zip-code-(and-city-or-state)-from-'formatted_address',-and-create-new-columns" data-toc-modified-id="Get-zip-code-(and-city-or-state)-from-'formatted_address',-and-create-new-columns-1.4.2"><span class="toc-item-num">1.4.2&nbsp;&nbsp;</span>Get zip code (and city or state) from 'formatted_address', and create new columns</a></span></li><li><span><a href="#Get-lat,-lng-from-'geometry',-and-create-new-columns" data-toc-modified-id="Get-lat,-lng-from-'geometry',-and-create-new-columns-1.4.3"><span class="toc-item-num">1.4.3&nbsp;&nbsp;</span>Get lat, lng from 'geometry', and create new columns</a></span></li><li><span><a href="#Get-'compound_code',-'global_code'-from-'plus_code',-and-create-new-columns" data-toc-modified-id="Get-'compound_code',-'global_code'-from-'plus_code',-and-create-new-columns-1.4.4"><span class="toc-item-num">1.4.4&nbsp;&nbsp;</span>Get 'compound_code', 'global_code' from 'plus_code', and create new columns</a></span></li><li><span><a href="#Use-CountVectorizer-to-process-'types'" data-toc-modified-id="Use-CountVectorizer-to-process-'types'-1.4.5"><span class="toc-item-num">1.4.5&nbsp;&nbsp;</span>Use CountVectorizer to process 'types'</a></span></li><li><span><a href="#Drop-duplicates" data-toc-modified-id="Drop-duplicates-1.4.6"><span class="toc-item-num">1.4.6&nbsp;&nbsp;</span>Drop duplicates</a></span></li><li><span><a href="#Shuffle-the-dataset" data-toc-modified-id="Shuffle-the-dataset-1.4.7"><span class="toc-item-num">1.4.7&nbsp;&nbsp;</span>Shuffle the dataset</a></span></li></ul></li><li><span><a href="#Export-clean-dataset-as-csv" data-toc-modified-id="Export-clean-dataset-as-csv-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Export clean dataset as csv</a></span></li></ul></li></ul></div>

## Data Dictionary

[Data Dictionary Link](https://developers.google.com/places/web-service/search#PlaceSearchResults)

[Differecne between id and place_id](https://stackoverflow.com/questions/27198283/google-places-api-are-place-id-or-id-unique-to-any-city-in-the-world)

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import ast 
import re
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle

# Display Preference
pd.set_option('display.max_columns', None)

## Read in data

In [2]:
df = pd.read_csv('../data/raw_google_data_nyc.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,formatted_address,geometry,icon,id,name,opening_hours,photos,place_id,plus_code,price_level,rating,reference,types,user_ratings_total,searched_keyword,searched_zipcode
0,0,"138 W 34th St, New York, NY 10001, United States","{'location': {'lat': 40.750269, 'lng': -73.989...",https://maps.gstatic.com/mapfiles/place_api/ic...,cab18c9c7ea5cf2330fdf146a7cbfe9a3ab03d6d,Sprint Store,{'open_now': True},"[{'height': 2592, 'html_attributions': ['<a hr...",ChIJCV0vTalZwokR61PIDIe0gI0,"{'compound_code': 'Q226+44 New York', 'global_...",2.0,3.4,ChIJCV0vTalZwokR61PIDIe0gI0,"['point_of_interest', 'store', 'establishment']",276,stores,10001
1,1,"460 8th Ave, New York, NY 10001, United States","{'location': {'lat': 40.751744, 'lng': -73.993...",https://maps.gstatic.com/mapfiles/place_api/ic...,e386d17b32833d39a246d0f8ed3df43ed5f27252,Duane Reade,{'open_now': True},"[{'height': 4896, 'html_attributions': ['<a hr...",ChIJJwQ3561ZwokRuYknT0uxER8,"{'compound_code': 'Q224+MJ New York', 'global_...",2.0,3.9,ChIJJwQ3561ZwokRuYknT0uxER8,"['convenience_store', 'food', 'point_of_intere...",50,stores,10001
2,2,"151 W 34th St, New York, NY 10001, United States","{'location': {'lat': 40.7508025, 'lng': -73.98...",https://maps.gstatic.com/mapfiles/place_api/ic...,e04114820206890ff0155d2f7a6f7efc0903fb9b,Macy's,{'open_now': True},"[{'height': 2610, 'html_attributions': ['<a hr...",ChIJ3xjWra5ZwokRrwJ0KZ4yKNs,"{'compound_code': 'Q226+86 New York', 'global_...",2.0,4.4,ChIJ3xjWra5ZwokRrwJ0KZ4yKNs,"['department_store', 'shoe_store', 'jewelry_st...",51032,stores,10001
3,3,"31 W 34th St, New York, NY 10001, United States","{'location': {'lat': 40.7494884, 'lng': -73.98...",https://maps.gstatic.com/mapfiles/place_api/ic...,5d4f6b7eff3d8c65cf9cee09eddc7cb81e77e742,Uniqlo,{'open_now': True},"[{'height': 4032, 'html_attributions': ['<a hr...",ChIJX6mEEqlZwokRtDQOWIDVc3I,"{'compound_code': 'P2X7+Q8 New York', 'global_...",1.0,4.4,ChIJX6mEEqlZwokRtDQOWIDVc3I,"['clothing_store', 'point_of_interest', 'store...",3962,stores,10001
4,4,"420 9th Ave, New York, NY 10001, United States","{'location': {'lat': 40.7529454, 'lng': -73.99...",https://maps.gstatic.com/mapfiles/place_api/ic...,3b2cbe32c41a5633864a49f9730d1c1388cbc37a,B&H Photo Video - Electronics and Camera Store,{'open_now': False},"[{'height': 806, 'html_attributions': ['<a hre...",ChIJI93dPbJZwokRIoOEoivEDQs,"{'compound_code': 'Q233+5F New York', 'global_...",,4.6,ChIJI93dPbJZwokRIoOEoivEDQs,"['electronics_store', 'home_goods_store', 'poi...",22772,stores,10001


In [4]:
# Check the shape of the data
df.shape

(10626, 17)

In [5]:
# Check data types
df.dtypes

Unnamed: 0              int64
formatted_address      object
geometry               object
icon                   object
id                     object
name                   object
opening_hours          object
photos                 object
place_id               object
plus_code              object
price_level           float64
rating                float64
reference              object
types                  object
user_ratings_total      int64
searched_keyword       object
searched_zipcode        int64
dtype: object

In [6]:
# Check nulls
df.isnull().sum()

Unnamed: 0               0
formatted_address        0
geometry                 0
icon                     0
id                       0
name                     0
opening_hours          317
photos                 649
place_id                 0
plus_code                4
price_level           3656
rating                   0
reference                0
types                    0
user_ratings_total       0
searched_keyword         0
searched_zipcode         0
dtype: int64

## Initial Data Cleaning

### Drop columns: 'Unnamed: 0', 'icon', and 'photos'

In [7]:
df.drop(columns=['Unnamed: 0', 'icon', 'id', 'photos', 'reference'], inplace=True)

In [8]:
# Change 'opening_hours' from str to bool
df['opening_hours'] = [ast.literal_eval(df['opening_hours'][i]).get('open_now') 
                       if pd.isnull(df['opening_hours'][i]) is False else
                       df['opening_hours'][i]
                       for i in df.index ]
df.rename(columns={'opening_hours':'open_now'}, inplace=True )

### Get zip code (and city or state) from 'formatted_address', and create new columns

In [9]:
# # Regular expression reference: https://regex101.com/
# ADDRESS_RE = re.compile(r'^(.*, +)?(?P<city>.*),( +(?P<state>[A-Z]{2}))? +(?P<zipcode>[0-9\-]*), +United States$')

In [10]:
# # Define a funciton to match the regular expression constant above
# def parse_address(string):
#     match = re.match(ADDRESS_RE, string)
    
#     # If match fails, raise error showing the failed match string
#     if match is None:
#         raise Exception(string)
        
#     #  Return a dictionary object
#     address_dict = match.groupdict()
    
#     # Return 'None' if the address is missing 'state'. 
#     if 'state' not in address_dict:
#         address_dict['state'] = None
        
#     return address_dict

In [11]:
ZIPCODE_RE = re.compile(r'\b\d{5}(-\d{4})?\b')                                   

In [12]:
# Define a function to match the regular expression constant above
def parse_zipcode_from_address(string):
    match = re.search(ZIPCODE_RE, string)
    
    # If match fails, raise error showing the failed match string
    if match is None:
        #raise Exception(string)
        zipcode = None
        print(f'no zipcode {string}')
    #  Return a dictionary object
    else:
        zipcode = match.group(0)
        
    return {'zipcode': zipcode}

In [13]:
# Apply the parse_address function to column 'formatted_address'
df = pd.concat([pd.DataFrame(
    list(df['formatted_address'].apply(parse_zipcode_from_address).values)), df], 
    axis=1, copy=True)

In [14]:
# All zipcodes appear in the formatted address from which they're taken
assert all([x in y for (x, y) in df[['zipcode', 'formatted_address']].values])

In [15]:
df.tail()

Unnamed: 0,zipcode,formatted_address,geometry,name,open_now,place_id,plus_code,price_level,rating,types,user_ratings_total,searched_keyword,searched_zipcode
10621,11223,"267 Avenue X, Brooklyn, NY 11223, United States","{'location': {'lat': 40.5904302, 'lng': -73.97...",Dunkin',True,ChIJ93JjNVdEwokRoJ5EXFbJbGU,"{'compound_code': 'H2RG+5V Brooklyn, New York'...",1.0,3.9,"['bakery', 'cafe', 'restaurant', 'food', 'poin...",250,coffee shops,11697
10622,11234,"2344 Flatbush Ave, Brooklyn, NY 11234, United ...","{'location': {'lat': 40.6128004, 'lng': -73.92...",Dunkin',True,ChIJsRdEALNbwokR9AA1NobdeOo,"{'compound_code': 'J37F+4F Brooklyn, New York'...",1.0,3.8,"['bakery', 'cafe', 'restaurant', 'food', 'poin...",232,coffee shops,11697
10623,11234,"2317 Ralph Ave, Brooklyn, NY 11234, United States","{'location': {'lat': 40.6213564, 'lng': -73.91...",Dunkin',True,ChIJ0RIGkohEwokRywOZz3ipDzw,"{'compound_code': 'J3CM+G6 Brooklyn, New York'...",1.0,4.2,"['bakery', 'cafe', 'restaurant', 'food', 'poin...",149,coffee shops,11697
10624,11234,"4926 Avenue K, Brooklyn, NY 11234, United States","{'location': {'lat': 40.6256151, 'lng': -73.92...",Dunkin',True,ChIJ77AulDdDwokRshZ8HOc_jsc,"{'compound_code': 'J3GC+6V Brooklyn, New York'...",1.0,3.8,"['bakery', 'cafe', 'restaurant', 'food', 'poin...",177,coffee shops,11697
10625,11235,"1745 Sheepshead Bay Rd, Brooklyn, NY 11235, Un...","{'location': {'lat': 40.5850653, 'lng': -73.95...",Orso Coffee,False,ChIJN5qMU2REwokRFMIbh5wdUhg,"{'compound_code': 'H2PX+2H Brooklyn, New York'...",2.0,4.6,"['cafe', 'food', 'point_of_interest', 'store',...",196,coffee shops,11697


In [16]:
# Drop the 'formatted_address' column
df.drop(columns='formatted_address', inplace=True)

### Get lat, lng from 'geometry', and create new columns

In [17]:
def parse_geometry(df):
    # Catch observations where geometry contains NaNs
    lat_list = []
    long_list = []
    for i in df.index:
        geometry = df['geometry'][i]
        try:
            lat = ast.literal_eval(geometry).get('location').get('lat')
            long = ast.literal_eval(geometry).get('location').get('lng')
        except ValueError:
            print(f'Error evaling {geometry} at {i} on {df.iloc[i]}')
            continue
            
        lat_list.append(lat)
        long_list.append(long)
        
    df['location_lat'] = lat_list
    df['location_lng'] = long_list
    return df

In [18]:
df = parse_geometry(df)

In [19]:
# Drop the 'geometry' column
df.drop(columns='geometry', inplace=True)

### Get 'compound_code', 'global_code' from 'plus_code', and create new columns

'plus_code' is is an encoded location reference, derived from latitude and longitude coordinates, that represents an area: 1/8000th of a degree by 1/8000th of a degree (about 14m x 14m at the equator) or smaller. Plus codes can be used as a replacement for street addresses in places where they do not exist (where buildings are not numbered or streets are not named). [Reference](https://developers.google.com/places/web-service/search#PlaceSearchResults)

In [20]:
df['compound_code'] = [ast.literal_eval(df['plus_code'][i]).get('compound_code')
                       if pd.isnull(df['plus_code'][i]) is False else
                       df['plus_code'][i]
                       for i in df.index]
df['global_code'] = [ast.literal_eval(df['plus_code'][i]).get('global_code')
                     if pd.isnull(df['plus_code'][i]) is False else
                     df['plus_code'][i]
                     for i in df.index]

In [21]:
# Drop the 'plus_code' column
df.drop(columns='plus_code', inplace=True)

### Use CountVectorizer to process 'types'

The column 'types' is a column of lists, each contains a list of business types such as 'bakery', 'bar' etc. Each business on Google has multiple types. I will get all the types in the dataset, and turn each type into a column. I will use CountVectorizer() to achieve this task.

In [22]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['types'])

In [23]:
vectorizerized_types = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [24]:
df = pd.concat([df, vectorizerized_types], axis=1, copy=True)

In [25]:
df.drop(columns='types', inplace=True)

### Drop duplicates
The 'place_id' is a unique identifier. I will use it to drop duplicated values from the dataset.

In [26]:
df.drop_duplicates(subset=['place_id'], keep='first', inplace=True)

### Shuffle the dataset
The current dataset is ordered by zipcodes. I will shuffle the dataset, otherwise it would be problematic during the cross validation process, since sklearn's cross validation does not shuffle the dataset. [(Reference)](https://stackoverflow.com/a/55538590)

In [27]:
index = df.index
df = shuffle(df)
df.index = index

## Export clean dataset as csv

In [28]:
df.shape

(8709, 68)

In [29]:
df.head()

Unnamed: 0,zipcode,name,open_now,place_id,price_level,rating,user_ratings_total,searched_keyword,searched_zipcode,location_lat,location_lng,compound_code,global_code,art_gallery,atm,bakery,bar,beauty_salon,bicycle_store,book_store,cafe,car_dealer,car_repair,car_wash,clothing_store,convenience_store,department_store,drugstore,electronics_store,establishment,finance,florist,food,funeral_home,furniture_store,gas_station,general_contractor,grocery_or_supermarket,hair_care,hardware_store,health,home_goods_store,jewelry_store,laundry,liquor_store,local_government_office,locksmith,lodging,meal_delivery,meal_takeaway,movie_rental,night_club,park,parking,pet_store,pharmacy,point_of_interest,premise,restaurant,school,shoe_store,shopping_mall,spa,storage,store,supermarket,tourist_attraction,travel_agency
0,10006,Trinity Place,True,ChIJ4_AAQhdawokRzbYlVLyzKlk,2.0,4.4,1135,restaurant,10006,40.708889,-74.011389,PX5Q+HC New York,87G7PX5Q+HC,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
1,10312,Villa Monte Pizza,True,ChIJpyGJkxhLwokRYSM5WLPeZWw,2.0,4.0,269,restaurant,10312,40.553698,-74.193031,HR34+FQ New York,87G7HR34+FQ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
2,10016,Starbucks,True,ChIJSX154wdZwokRTMfheONZLkc,2.0,4.2,124,coffee shops,10016,40.745394,-73.982142,P2W9+54 New York,87G8P2W9+54,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
3,10452,Bonanza Electronics,True,ChIJiUjsyz70wokRdKb7AdoayhE,,3.4,22,stores,10452,40.839704,-73.91658,R3QM+V9 New York,87G8R3QM+V9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
4,21539,215 Street Liquor Corporation,True,ChIJ06WiqdNjwokR_k7rdFW9UvM,,4.0,52,stores,11428,40.717626,-73.739371,P796+37 New York,87G8P796+37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [30]:
#df.to_csv('../data/clean_google_data_nyc.csv', index=False)