# Foot traffic data cleaning process

In [1]:
import pandas as pd
import numpy as np
import os
import data_cleaning_methods
import re

## II. Foursquare Mobility Reports

Data Summary

The data source is from Foursquare. Based on 13 million always-on opted-in users, the dataset includes foot traffic indexes from different human groups in 25 particular places, from February 2020. The name could be formulated as A_B_C, corresponding to the file type of csv, while the shp type is B_C. The A represents granularity, including the scale of state and county. The B represents visits (visit number), avgDuration (average duration) and p50Duration (Median Visit Length, Minutes), which there of them are key indexes ones should collect. The C represents 25 places which enrol in the collection.

In [2]:
pattern ='Food'
df_foursquare=[]

for f in os.listdir('../../data/foot_traffic/source/Foursquare_Community_Mobility_Data/'):
    if pattern in f:
        data=pd.read_csv(os.path.join('../../data/foot_traffic/source/Foursquare_Community_Mobility_Data/',f), index_col=0)
        data['category'] = f[6:-4]
        df_foursquare.append(data)

In [3]:
df_fsquare = pd.concat(df_foursquare, ignore_index=True)

In [4]:
df_fsquare.head(2)

Unnamed: 0,NAME,2020-01-01,2020-01-02,2020-01-03,2020-01-04,2020-01-05,2020-01-06,2020-01-07,2020-01-08,2020-01-09,...,2020-10-21,2020-10-22,2020-10-23,2020-10-24,2020-10-25,2020-10-26,2020-10-27,2020-10-28,2020-10-29,category
0,Alabama,1300672.0,1441176.0,1729554.0,1871494.0,1734159.0,1517037.0,1566957.0,1572101.0,1616301.0,...,1671856.0,1813200.0,1978832.0,1956772.0,1858730.0,1611078.0,1635177.0,1640566.0,1759420.0,visits_Food
1,Alaska,175497.0,151499.0,184125.0,191059.0,188762.0,162335.0,178630.0,170998.0,174510.0,...,196608.0,183126.0,201748.0,235417.0,183356.0,160133.0,174813.0,177277.0,173179.0,visits_Food


In [5]:
df_fsquare.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Columns: 305 entries, NAME to category
dtypes: float64(303), object(2)
memory usage: 743.5+ KB


In [6]:
print("Missing data? {}".format(data_cleaning_methods.missing_bool(df_fsquare)))
abs_missing = data_cleaning_methods.frequency_missing(df_fsquare, 'relative')
print("Absolute number of missing values: {}".format(abs_missing))
col_min, col_max = data_cleaning_methods.missing(df_fsquare)
print("Column with lowest amount of missings contains {} % missings.".format(col_min))
print("Column with highest amount of missings contains {} % missings.\n\n".format(col_max))

Missing data? True
Absolute number of missing values: NAME           0
2020-01-01     6
2020-01-02     6
2020-01-03     6
2020-01-04     6
2020-01-05     6
2020-01-06     6
2020-01-07     6
2020-01-08     6
2020-01-09     6
2020-01-10     6
2020-01-11     6
2020-01-12     6
2020-01-13     6
2020-01-14     6
2020-01-15     6
2020-01-16     6
2020-01-17     6
2020-01-18     6
2020-01-19     6
2020-01-20     6
2020-01-21     6
2020-01-22     6
2020-01-23     6
2020-01-24     6
2020-01-25     6
2020-01-26     6
2020-01-27     6
2020-01-28     6
2020-01-29     6
              ..
2020-10-01    12
2020-10-02    12
2020-10-03    12
2020-10-04    12
2020-10-05    12
2020-10-06    12
2020-10-07    12
2020-10-08    12
2020-10-09    12
2020-10-10    12
2020-10-11    12
2020-10-12    12
2020-10-13    12
2020-10-14    12
2020-10-15    12
2020-10-16    12
2020-10-17    12
2020-10-18    12
2020-10-19    12
2020-10-20    12
2020-10-21    12
2020-10-22    12
2020-10-23    12
2020-10-24    12
2020-10-25 

In [7]:
df_fsquare = df_fsquare[df_fsquare.NAME != 'No State']

In [8]:
df_fsquare[df_fsquare.NAME == 'California'].iloc[:, 250:300]

Unnamed: 0,2020-09-06,2020-09-07,2020-09-08,2020-09-09,2020-09-10,2020-09-11,2020-09-12,2020-09-13,2020-09-14,2020-09-15,...,2020-10-16,2020-10-17,2020-10-18,2020-10-19,2020-10-20,2020-10-21,2020-10-22,2020-10-23,2020-10-24,2020-10-25
4,12508911.0,11689652.0,10697534.0,10730487.0,11004127.0,11670573.0,13391028.0,,,,...,,,,,,,,,,
56,10.0,10.0,10.0,10.0,10.0,10.0,11.0,,,,...,,,,,,,,,,
108,9.0,7.0,8.0,8.0,9.0,9.0,7.0,,,,...,,,,,,,,,,
160,2628846.0,2711562.0,2616416.0,2615460.0,2623346.0,2772674.0,2889489.0,,,,...,,,,,,,,,,
212,8.0,8.0,7.0,7.0,7.0,7.0,8.0,,,,...,,,,,,,,,,
264,6.0,7.0,4.0,4.0,3.0,5.0,3.0,,,,...,,,,,,,,,,


#### Warning! The rest of missing values belongs to California since September 13th

#### Finally, we save this file integrating the six datasets by states without the missing rows

In [9]:
df_fsquare.to_csv('../../data/foot_traffic/interim/foursquare_states.csv', index = False)