# Inspect data

This notebook performs:
- Data missingness checking
- Data imbalance investigation
- Data scaling 

In [25]:
import random
random.seed(109)

from pprint import pprint

import os
import sys
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

## Business data

Contains business data including location data, attributes, and categories.

In [10]:
busi_df = pd.read_json('data/yelp_academic_dataset_business.json', 
                       orient = "records", lines = True)
print(f"business data shape: {busi_df.shape}")

business data shape: (150346, 14)


In [11]:
busi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB


In [12]:
busi_df.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."


## Review data

Contains full review text data including the user_id that wrote the review and the business_id the review is written for.

In [13]:
review_df = pd.read_feather('data/yelp_review.feather')
print(f"review data shape: {review_df.shape}")

review data shape: (6990280, 9)


In [14]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 480.0+ MB


In [15]:
review_df.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30


## Data missingness

In [18]:
# Count the number of missing values for each column
busi_df.isnull().sum()

business_id         0
name                0
address             0
city                0
state               0
postal_code         0
latitude            0
longitude           0
stars               0
review_count        0
is_open             0
attributes      13744
categories        103
hours           23223
dtype: int64

Only 3 columns have missing values: `attributes`, `categories`, and `hours`. 



The `attributes` feature contains an object (business attributes to values). Examples of `attributes` values:

In [29]:
pprint(list(busi_df.sample(3)['attributes']))

[{'BikeParking': 'True',
  'BusinessAcceptsCreditCards': 'True',
  'BusinessParking': "{'garage': False, 'street': True, 'validated': False, "
                     "'lot': True, 'valet': False}",
  'ByAppointmentOnly': 'False',
  'DogsAllowed': 'False',
  'GoodForKids': 'True',
  'RestaurantsPriceRange2': '2',
  'WheelchairAccessible': 'True'},
 {'BusinessAcceptsBitcoin': 'False',
  'BusinessAcceptsCreditCards': 'True',
  'ByAppointmentOnly': 'True'},
 {'Alcohol': "u'beer_and_wine'",
  'BikeParking': 'True',
  'BusinessAcceptsBitcoin': 'False',
  'BusinessAcceptsCreditCards': 'True',
  'BusinessParking': "{'garage': False, 'street': True, 'validated': False, "
                     "'lot': True, 'valet': False}",
  'Caters': 'True',
  'DogsAllowed': 'True',
  'GoodForMeal': "{'dessert': False, 'latenight': False, 'lunch': False, "
                 "'dinner': False, 'brunch': False, 'breakfast': False}",
  'HappyHour': 'True',
  'HasTV': 'False',
  'OutdoorSeating': 'True',
  'Restaurant

Expanding the `attributes` feature into a flat table using `pd.json_normalize()` yields a table of 39 columns. 

In [33]:
busi_attr_df = pd.json_normalize(busi_df['attributes'])
print(f"attributes flat table shape: {busi_attr_df.shape}")
busi_attr_df.head(3)

attributes flat table shape: (150346, 39)


Unnamed: 0,ByAppointmentOnly,BusinessAcceptsCreditCards,BikeParking,RestaurantsPriceRange2,CoatCheck,RestaurantsTakeOut,RestaurantsDelivery,Caters,WiFi,BusinessParking,...,AcceptsInsurance,BestNights,BYOB,Corkage,BYOBCorkage,HairSpecializesIn,Open24Hours,RestaurantsCounterService,AgesAllowed,DietaryRestrictions
0,True,,,,,,,,,,...,,,,,,,,,,
1,,True,,,,,,,,,...,,,,,,,,,,
2,False,True,True,2.0,False,False,False,False,u'no',"{'garage': False, 'street': False, 'validated'...",...,,,,,,,,,,


In [27]:
busi_df['categories']

0         Doctors, Traditional Chinese Medicine, Naturop...
1         Shipping Centers, Local Services, Notaries, Ma...
2         Department Stores, Shopping, Fashion, Home & G...
3         Restaurants, Food, Bubble Tea, Coffee & Tea, B...
4                                 Brewpubs, Breweries, Food
                                ...                        
150341                           Nail Salons, Beauty & Spas
150342    Pets, Nurseries & Gardening, Pet Stores, Hobby...
150343    Shopping, Jewelry, Piercing, Toy Stores, Beaut...
150344    Fitness/Exercise Equipment, Eyewear & Optician...
150345    Beauty & Spas, Permanent Makeup, Piercing, Tattoo
Name: categories, Length: 150346, dtype: object

## Downsize the Data (selecting Eastern Time Zone States)

Since the datasets are extremely large, we scope our study to a smaller sample. The data we will use for the rest of the project is selected based on locations (i.e. states). 

In [16]:
busi_df['state'].value_counts()

PA     34039
FL     26330
TN     12056
IN     11247
MO     10913
LA      9924
AZ      9912
NJ      8536
NV      7715
AB      5573
CA      5203
ID      4467
DE      2265
IL      2145
TX         4
CO         3
WA         2
HI         2
MA         2
NC         1
UT         1
MT         1
MI         1
SD         1
XMS        1
VI         1
VT         1
Name: state, dtype: int64

In [17]:
# Only US datasets
df = df.loc[(df['state'] == 'MI') | (df['state'] == 'IN') | 
            (df['state'] == 'TN') | (df['state'] == 'FL') | 
            (df['state'] == 'NC') | (df['state'] == 'DE') | 
            (df['state'] == 'NJ') | (df['state'] == 'PA') | 
            (df['state'] == 'MA')]

NameError: name 'df' is not defined

In [None]:
# Drop column hours because we will be investigating on NLP
df = df.drop(columns = 'hours')
df.head(2)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food"


## Dealing with Missing values

In [36]:
# Count the number of Data missing values
df.isnull().sum()

business_id        0
name               0
address            0
city               0
state              0
postal_code        0
latitude           0
longitude          0
stars              0
review_count       0
is_open            0
attributes      8512
categories        68
dtype: int64

In [40]:
# Replace non-numeric variables with a string None
df.attributes = df.attributes.fillna('None')
df.categories = df.categories.fillna('None')

In [41]:
# Check on the final missing values
df.isnull().sum()

business_id     0
name            0
address         0
city            0
state           0
postal_code     0
latitude        0
longitude       0
stars           0
review_count    0
is_open         0
attributes      0
categories      0
dtype: int64

### Add new column: num_categories 
#### This is to numerize the column categories

In [42]:
# Make new column: num_categories
def num_categories (row):
    categories = row['categories']
    num_categories = len(categories.split(', '))
    return num_categories

In [43]:
df['num_categories'] = df.apply(num_categories, axis=1)
df.head(2)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,num_categories
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",5
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food",3


## Create Separate Attribute Column 
    This is because attributes is written in dictionary form