# Foot traffic data cleaning process

In [1]:
import pandas as pd
import numpy as np
import os
import data_cleaning_methods

## V. COVID-19 Foot Traffic Data (Free)

Provided By: Foursquare 
This free data set contains indexed foot traffic to 19 categories of venues. The indexed data is broken out geographically, with included data for National, SF, NYC, LA, and Seattle. The data is normalized against U.S. Census data to remove age, gender and geographical bias. Data is provided daily from 02/19/2020.

In [2]:
# Relative path of the folder that holds the raw Foursquare workbooks.
# The trailing slash matters: file names are appended by plain string concatenation.
path = (
    '../../data/raw/mobility/'
    'Foursquare_COVID-19_National_Regional/'
)

In [3]:
# Open both workbooks once; individual sheets are parsed from these handles later on.
reg_file = 'Foursquare_COVID-19_Data_National_Regional_November_20_2020.xlsx'
dma_file = 'Foursquare_COVID-19_Data_National_DMA_November_20_2020.xlsx'

xls_reg = pd.ExcelFile(path + reg_file)
xls_dma = pd.ExcelFile(path + dma_file)

In [4]:
# List the worksheets of the DMA workbook: one 'National Summary' sheet plus one sheet per venue category.
xls_dma.sheet_names

['National Summary',
 'Airports',
 'Auto Dealerships',
 'Auto Shops',
 'Banks',
 'Bars',
 'Big Box Stores',
 'Casual Dining Chains',
 'Clothing Stores',
 'Convenience Store',
 'Drugstores & Pharmacies',
 'Discount Stores',
 'Fast Food',
 'Furniture Stores',
 'Gas Stations',
 'Grocery Stores',
 'Gyms',
 'Hardware Stores',
 'Hotels',
 'Movie Theaters',
 'Nail Salons',
 'Liquor Stores',
 'Offices',
 'Pet Stores',
 'Parks',
 'SalonsBarbershops',
 'Shopping Malls',
 'Trails',
 'Warehouse Stores',
 'Wireless Carriers']

### DMA level

In [5]:
# Header prefixes used to recognise the location columns of each sheet.
# Raw column names are matched against these with str.startswith and
# shortened to the matching prefix.
list_categories = [
    'National',
    'New',
    'Los',
    'Seattle',
    'San',
    '*Insufficient',
]

In [6]:
# Build two frames from the DMA workbook:
#   * df_cities  - every venue-category sheet stacked vertically, with the sheet
#                  name stored in a `class` column;
#   * df_summary - the 'National Summary' sheet, also tagged with `class`.
# Sheets are collected in lists and concatenated once at the end rather than
# growing a DataFrame inside the loop, which re-copies all earlier rows on every
# iteration and relies on concatenating with an empty frame.
city_frames = []
summary_frames = []

for sheet_name in xls_dma.sheet_names:
    temp = pd.read_excel(xls_dma, sheet_name=sheet_name, index_col=0)
    if sheet_name != 'National Summary':
        # Shorten each raw header to the first matching prefix in list_categories
        # so the same location lines up across sheets. str() guards against
        # non-text headers (e.g. numeric cells), which have no .startswith.
        short_names = {}
        for col in temp.columns:
            for cat in list_categories:
                if str(col).startswith(cat):
                    short_names[col] = cat
                    break  # first matching prefix wins
        temp = temp.rename(columns=short_names)
        temp['class'] = sheet_name
        city_frames.append(temp)
    else:
        temp['class'] = sheet_name
        summary_frames.append(temp)

# sort=True keeps the columns of the result in alphabetical order.
# Fall back to an empty frame when no sheet was collected (pd.concat rejects an empty list).
df_cities = pd.concat(city_frames, sort=True) if city_frames else pd.DataFrame()
df_summary = pd.concat(summary_frames, sort=True) if summary_frames else pd.DataFrame()

# Expand the short prefixes back to readable location names.
list_name = ['National', 'New York', 'Los Angeles', 'Seattle-Tacoma', 'SanFrancisco-Oakland-SanJose', '*Insufficient']
full_names = {}
for col in df_cities.columns:
    for name in list_name:
        if name.startswith(str(col)):
            full_names[col] = name
            break  # first matching name wins
df_cities = df_cities.rename(columns=full_names)

In [7]:
# Preview the first two rows of the stacked DMA frame.
df_cities.head(2)

Unnamed: 0,*Insufficient,Los Angeles,National,New York,SanFrancisco-Oakland-SanJose,Seattle-Tacoma,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,class
,,,,,,,,,,,,
2020-02-19,,100.0,100.0,100.0,100.0,,,,,,,Airports
2020-02-20,,99.08446,99.149526,99.094944,97.9394,,,,,,,Airports


In [8]:
# Column dtypes and non-null counts; used to spot columns that are entirely empty.
df_cities.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8004 entries, 2020-02-19 to 2020-11-20
Data columns (total 12 columns):
*Insufficient                   0 non-null float64
Los Angeles                     5773 non-null float64
National                        8004 non-null float64
New York                        6169 non-null float64
SanFrancisco-Oakland-SanJose    3336 non-null float64
Seattle-Tacoma                  2759 non-null float64
Unnamed: 2                      0 non-null float64
Unnamed: 3                      0 non-null float64
Unnamed: 4                      0 non-null float64
Unnamed: 5                      0 non-null float64
Unnamed: 6                      0 non-null float64
class                           8004 non-null object
dtypes: float64(11), object(1)
memory usage: 812.9+ KB


Delete columns with all missing values:

In [9]:
# Keep only the columns that contain at least one non-missing value.
df_cities = df_cities.dropna(axis='columns', how='all')

In [10]:
# Re-check the schema after dropping the all-missing columns.
df_cities.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8004 entries, 2020-02-19 to 2020-11-20
Data columns (total 6 columns):
Los Angeles                     5773 non-null float64
National                        8004 non-null float64
New York                        6169 non-null float64
SanFrancisco-Oakland-SanJose    3336 non-null float64
Seattle-Tacoma                  2759 non-null float64
class                           8004 non-null object
dtypes: float64(5), object(1)
memory usage: 437.7+ KB


In [11]:
# Percentage of missing values per column, computed by the project-local helper module.
data_cleaning_methods.percent_missing(df_cities)

Los Angeles                     27.873563
National                         0.000000
New York                        22.926037
SanFrancisco-Oakland-SanJose    58.320840
Seattle-Tacoma                  65.529735
class                            0.000000
dtype: float64

The cities with the fewest missing values are Los Angeles and New York. In contrast, Seattle is missing 65.5% of its data. At a more granular level, grouping by class, the information about indexed foot traffic in Fast Food restaurants is complete nationally and across all the cities, but the data for casual dining is incomplete.

In [12]:
# Number of non-null observations per venue class for each location column.
df_cities.groupby('class').count()

Unnamed: 0_level_0,Los Angeles,National,New York,SanFrancisco-Oakland-SanJose,Seattle-Tacoma
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Airports,276,276,276,276,0
Auto Dealerships,0,276,0,0,0
Auto Shops,276,276,276,0,0
Banks,276,276,276,38,38
Bars,276,276,276,276,276
Big Box Stores,276,276,276,38,38
Casual Dining Chains,80,276,59,38,38
Clothing Stores,276,276,276,38,38
Convenience Store,276,276,276,276,276
Discount Stores,276,276,276,185,185


Inspecting the summary dataframe, we conclude that it has no missing values.

In [13]:
# Column dtypes and non-null counts of the 'National Summary' frame.
df_summary.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 276 entries, 2020-02-19 to 2020-11-20
Data columns (total 31 columns):
Airports                     276 non-null float64
Auto Dealerships             276 non-null float64
Automotive Shops             276 non-null float64
Banks                        276 non-null float64
Bars                         276 non-null float64
Big Box Stores               276 non-null float64
Casual Dining                276 non-null float64
Clothing Stores              276 non-null float64
Convenience Stores           276 non-null float64
Discount Stores              276 non-null float64
Fast Food Restaurants        276 non-null float64
Furniture / Home Stores      276 non-null float64
Gas Stations                 276 non-null float64
Grocery Stores               276 non-null float64
Gyms or Fitness Centers      276 non-null float64
Hardware Stores              276 non-null float64
Hotels                       276 non-null float64
Liquor Stores             

### Regional level

In [14]:
# List the worksheets of the regional workbook.
xls_reg.sheet_names

['National Summary',
 'Airports',
 'Auto Dealerships',
 'Auto Shops',
 'Banks',
 'Bars',
 'Big Box Stores',
 'Casual Dining Chains',
 'Clothing Stores',
 'Convenience Stores',
 'Discount Stores',
 'Drugstores & Pharmacies',
 'Fast Food',
 'Furniture Stores',
 'Gas Stations',
 'Grocery Stores',
 'Gyms',
 'Hardware Stores',
 'Hotels',
 'Liquor Stores',
 'Movie Theaters',
 'Nail Salons',
 'Offices',
 'Parks',
 'Pet Stores',
 'SalonsBarbers',
 'Shopping Malls',
 'Trails',
 'Veterinarians',
 'Warehouse Stores',
 'Wireless Carriers']

Since the `National Summary` sheet is the same in both workbooks (`xls_reg` and `xls_dma`), we skip it here:

In [15]:
# Stack every venue-category sheet of the regional workbook into df_region,
# storing the sheet name in a `class` column. The 'National Summary' sheet is skipped.
# Sheets are collected in a list and concatenated once at the end rather than
# growing a DataFrame inside the loop, which re-copies all earlier rows on every
# iteration and relies on concatenating with an empty frame.
region_frames = []

for sheet_name in xls_reg.sheet_names:
    if sheet_name == 'National Summary':
        continue
    temp = pd.read_excel(xls_reg, sheet_name=sheet_name, index_col=0)
    # Shorten each raw header to the first matching prefix in list_categories.
    # str() guards against non-text headers (e.g. numeric cells), which have no
    # .startswith. NOTE(review): judging by the info() output of df_region, only
    # the '*Insufficient' prefix matches a regional header - confirm.
    short_names = {}
    for col in temp.columns:
        for cat in list_categories:
            if str(col).startswith(cat):
                short_names[col] = cat
                break  # first matching prefix wins
    temp = temp.rename(columns=short_names)
    temp['class'] = sheet_name
    region_frames.append(temp)

# sort=True keeps the columns of the result in alphabetical order.
# Fall back to an empty frame when no sheet was collected (pd.concat rejects an empty list).
df_region = pd.concat(region_frames, sort=True) if region_frames else pd.DataFrame()

# Expand any short prefixes back to readable location names.
list_name = ['National', 'New York', 'Los Angeles', 'Seattle-Tacoma', 'SanFrancisco-Oakland-SanJose', '*Insufficient']
full_names = {}
for col in df_region.columns:
    for name in list_name:
        if name.startswith(str(col)):
            full_names[col] = name
            break  # first matching name wins
df_region = df_region.rename(columns=full_names)

In [16]:
# Column dtypes and non-null counts; used to spot columns that are entirely empty.
df_region.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8280 entries, 2020-02-19 to 2020-11-20
Data columns (total 7 columns):
*Insufficient    0 non-null float64
Midwest          8280 non-null float64
Northeast        8004 non-null float64
South            8280 non-null float64
Unnamed: 5       0 non-null float64
West             8077 non-null float64
class            8280 non-null object
dtypes: float64(6), object(1)
memory usage: 517.5+ KB


In [17]:
# Keep only the columns that contain at least one non-missing value.
df_region = df_region.dropna(axis='columns', how='all')

In [18]:
# Re-check the schema after dropping the all-missing columns.
df_region.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8280 entries, 2020-02-19 to 2020-11-20
Data columns (total 5 columns):
Midwest      8280 non-null float64
Northeast    8004 non-null float64
South        8280 non-null float64
West         8077 non-null float64
class        8280 non-null object
dtypes: float64(4), object(1)
memory usage: 388.1+ KB


In [19]:
# Percentage of missing values per column, computed by the project-local helper module.
data_cleaning_methods.percent_missing(df_region)

Midwest      0.000000
Northeast    3.333333
South        0.000000
West         2.451691
class        0.000000
dtype: float64

At the regional level, only two columns have missing data: Northeast (about 3.3%) and West (about 2.5%).

Finally, save the dataframes as CSV files in the interim folder:

In [20]:
# Persist the three cleaned frames as CSV files in the interim data folder.
interim_dir = '../../data/interim/mobility/'

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(interim_dir, exist_ok=True)

df_cities.to_csv(os.path.join(interim_dir, 'foursquare_dma.csv'))
df_summary.to_csv(os.path.join(interim_dir, 'foursquare_national.csv'))
df_region.to_csv(os.path.join(interim_dir, 'foursquare_regional.csv'))