# Foot traffic data cleaning process

In [1]:
import pandas as pd
import numpy as np
import os
import data_cleaning_methods
import re

## III. Apple Mobility Reports

Data Summary

The relative amount of route requests for each country/region, subregion, or city compared to the baseline volume on January 13, 2020. The dataset record ranges from Jan 2020 to October 2020.

In [2]:
pattern ='State_US_'
df_list=[]

for f in os.listdir('../../data/foot_traffic/source/Apple_Mobility_Reports/'):
    if pattern in f:
        data=pd.read_csv(os.path.join('../../data/foot_traffic/source/Apple_Mobility_Reports/',f), index_col=0)
        #data['category'] = f.split(pattern)[1][:-4]
        df_list.append(data)

In [3]:
df_apple = pd.concat(df_list, ignore_index=True)

In [4]:
df_apple.head(2)

Unnamed: 0,STATE,NAME,geo_type,transportation_type,alternative_name,sub-region,country,2020-01-13,2020-01-14,2020-01-15,...,2020-10-15,2020-10-16,2020-10-17,2020-10-18,2020-10-19,2020-10-20,2020-10-21,2020-10-22,2020-10-23,2020-10-24
0,1,Alabama,sub-region,transit,,,United States,100.0,105.66,105.66,...,114.57,134.99,133.45,108.24,109.38,101.51,112.59,117.37,128.92,115.23
1,2,Alaska,sub-region,transit,,,United States,100.0,87.49,84.6,...,65.16,67.66,63.91,51.68,71.61,61.89,61.5,59.48,57.27,58.04


In [5]:
df_apple.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Columns: 293 entries, STATE to 2020-10-24
dtypes: float64(288), int64(1), object(4)
memory usage: 350.3+ KB


In [6]:
df_apple.groupby('transportation_type').count()

Unnamed: 0_level_0,STATE,NAME,geo_type,alternative_name,sub-region,country,2020-01-13,2020-01-14,2020-01-15,2020-01-16,...,2020-10-15,2020-10-16,2020-10-17,2020-10-18,2020-10-19,2020-10-20,2020-10-21,2020-10-22,2020-10-23,2020-10-24
transportation_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
driving,50,50,50,0,0,50,50,50,50,50,...,50,50,50,50,50,50,50,50,50,50
transit,43,43,43,0,0,43,43,43,43,43,...,43,43,43,43,43,43,43,43,43,43
walking,50,50,50,0,0,50,50,50,50,50,...,50,50,50,50,50,50,50,50,50,50


In [7]:
print("Missing data? {}".format(data_cleaning_methods.missing_bool(df_apple)))
abs_missing = data_cleaning_methods.frequency_missing(df_apple, 'relative')
print("Absolute number of missing values: {}".format(abs_missing))
col_min, col_max = data_cleaning_methods.missing(df_apple)
print("Column with lowest amount of missings contains {} % missings.".format(col_min))
print("Column with highest amount of missings contains {} % missings.\n\n".format(col_max))

Missing data? True
Absolute number of missing values: STATE                    0
NAME                     0
geo_type                10
transportation_type     10
alternative_name       153
sub-region             153
country                 10
2020-01-13              10
2020-01-14              10
2020-01-15              10
2020-01-16              10
2020-01-17              10
2020-01-18              10
2020-01-19              10
2020-01-20              10
2020-01-21              10
2020-01-22              10
2020-01-23              10
2020-01-24              10
2020-01-25              10
2020-01-26              10
2020-01-27              10
2020-01-28              10
2020-01-29              10
2020-01-30              10
2020-01-31              10
2020-02-01              10
2020-02-02              10
2020-02-03              10
2020-02-04              10
                      ... 
2020-09-25              10
2020-09-26              10
2020-09-27              10
2020-09-28              10
2

Drop nan columns `alternative_name` and `sub_region`:

In [8]:
df_apple.dropna(axis='columns', how='all', inplace=True)

In [9]:
df_apple.groupby(['STATE']).count()

Unnamed: 0_level_0,NAME,geo_type,transportation_type,country,2020-01-13,2020-01-14,2020-01-15,2020-01-16,2020-01-17,2020-01-18,...,2020-10-15,2020-10-16,2020-10-17,2020-10-18,2020-10-19,2020-10-20,2020-10-21,2020-10-22,2020-10-23,2020-10-24
STATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
2,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
4,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
5,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
6,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
8,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
9,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
10,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
11,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3


In [10]:
df_apple[df_apple['STATE'] == 11].NAME

8      District of Columbia
59     District of Columbia
110    District of Columbia
Name: NAME, dtype: object

In [11]:
df_apple[df_apple['STATE'] == 46].NAME

41     South Dakota
92     South Dakota
143    South Dakota
Name: NAME, dtype: object

In [12]:
df_apple[df_apple['STATE'] == 54].NAME

48     West Virginia
99     West Virginia
150    West Virginia
Name: NAME, dtype: object

In [13]:
df_apple[df_apple['STATE'] == 56].NAME

50     Wyoming
101    Wyoming
152    Wyoming
Name: NAME, dtype: object

The four states without or incomplete data are Columbia, South Dakota, West Virginia and Wyoming. In this case, we are not going to delete the states without information. Instead, we'll replace using statistics in the next section (EDA)

#### Finally, we save this file integrating the three datasets by states without the missing rows

In [14]:
df_apple.to_csv('../../data/foot_traffic/interim/apple_states.csv', index = False)