# 1. Social Distance Data (Mobility Data)
* Provided by the Maryland Transportation Institute and the Center for Advanced Transportation Technology Laboratory at the University of Maryland
* Available at: https://data.bts.gov/Research-and-Statistics/Trips-by-Distance/w96p-f2qv
* Downloaded data is stored in the ```data/distance_data/``` directory

In [1]:
!pip3 install sodapy
!pip3 install glob3
!pip install tqdm
!pip install --upgrade numpy

Collecting numpy
  Using cached https://files.pythonhosted.org/packages/46/09/1bae812d4afa67e365d3d1dbdc0e9071ba7678611f52b49353d6104ae8ff/numpy-1.19.4-cp37-cp37m-macosx_10_9_x86_64.whl
[31mERROR: senta 2.0.0 has requirement numpy==1.14.5, but you'll have numpy 1.19.4 which is incompatible.[0m
Installing collected packages: numpy
  Found existing installation: numpy 1.14.5
    Uninstalling numpy-1.14.5:
      Successfully uninstalled numpy-1.14.5
Successfully installed numpy-1.19.4


In [2]:
# Package for downloading the data from the specific API
from sodapy import Socrata 
from glob import glob

In [3]:
import pandas as pd
import pandas as pd
import numpy as np
from tqdm import tqdm

import datetime

import matplotlib.pyplot as plt
from matplotlib import dates

In [4]:
# Folder that receives the downloaded CSV pages (relative to the notebook's
# working directory). Running this cell starts the full download further down,
# which takes a long time.
data_folder_path = 'data/distance_data/'

# Socrata client for the data.bts.gov portal; the second positional argument is
# passed as the app token (per sodapy's Socrata(domain, app_token) signature).
# NOTE(review): the app token is hard-coded in the notebook — consider reading it
# from an environment variable instead of committing it.
client = Socrata("data.bts.gov", 'CgejIICiuJS7QETgHZiYeE04C')

def get_all_data(start = 0, end = 2139980, step = 50000):
    """Download the Trips-by-Distance dataset page by page and save each page as a CSV.

    Rows are requested from dataset "w96p-f2qv" through the module-level
    ``client`` in pages of ``step`` rows. Each page is written to
    ``data_folder_path`` as ``data_<offset>.csv`` with the offset zero-padded
    to 7 digits, so the files sort in download order.

    Parameters
    ----------
    start : int
        Row offset of the first page.
    end : int
        Approximate last row offset to fetch; ``(end - start)//step + 1`` pages
        are requested (43 with the defaults).
    step : int
        Number of rows per request / per output file.

    Returns
    -------
    None
        The function only prints progress and writes files.
    """
    import os  # local import: the notebook's import cells do not provide os

    # to_csv does not create missing directories.
    os.makedirs(data_folder_path, exist_ok=True)

    span = max(end - start, 1)  # avoid dividing by zero when end == start
    print('Percentage downloaded:')
    for i in range((end - start)//step+1):
        # Honour `start`: pages begin at the requested offset, not at row 0.
        offset = start + i*step
        print(round((offset - start)/span,3),end=',')
        # Socrata does not guarantee a stable row order between requests unless
        # an order is given; ":id" keeps pages from overlapping or skipping rows.
        records = client.get("w96p-f2qv", limit = step, offset = offset, order = ":id")
        if not records:
            # Past the end of the dataset: stop instead of writing an empty CSV
            # (pd.read_csv cannot parse an empty file later on).
            break
        one_part_df = pd.DataFrame.from_records(records)
        one_part_df.to_csv(data_folder_path+'data_'+str(offset).zfill(7)+'.csv',index=False)

# Run the download with the default range and page size (slow, network-bound).
get_all_data()


Percentage downloaded:
0.0,0.023,0.047,0.07,0.093,0.117,0.14,0.164,0.187,0.21,0.234,0.257,0.28,0.304,0.327,0.35,0.374,0.397,0.421,0.444,0.467,0.491,0.514,0.537,0.561,0.584,0.607,0.631,0.654,0.678,0.701,0.724,0.748,0.771,0.794,0.818,0.841,0.864,0.888,0.911,0.935,0.958,0.981,

In [5]:

data_folder_path = 'data/distance_data/'

def aggregate_data_files(data_folder_path = data_folder_path):
    """Combine every downloaded ``data_*`` page file into one dataframe.

    Parameters
    ----------
    data_folder_path : str
        Folder containing the ``data_*`` CSV files; works with or without a
        trailing path separator.

    Returns
    -------
    pandas.DataFrame
        All pages stacked in filename order with a fresh 0..n-1 index. Columns
        are sorted alphabetically so the layout does not depend on the column
        order of individual pages. An empty dataframe is returned when no file
        matches.
    """
    import os  # local import: the notebook's import cells do not provide os

    paths = sorted(glob(os.path.join(data_folder_path, 'data_*')))
    frames = []
    for i, path in enumerate(paths):
        if i%5 == 0:
            # Fraction of files read so far, reported every fifth file.
            print(round(i/len(paths),3),end=', ')
        frames.append(pd.read_csv(path))

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # One concat instead of DataFrame.append in a loop: append re-copied the
    # accumulated frame on every iteration and was removed in pandas 2.0.
    return pd.concat(frames, ignore_index=True, sort=True)

# Load all downloaded pages from the default folder into one dataframe.
df = aggregate_data_files()

0.0, 

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


0.116, 0.233, 0.349, 0.465, 0.581, 0.698, 0.814, 0.93, 

In [6]:
# Sanity check on the combined frame: column labels, then the (rows, columns) shape.
print(df.columns)
print(df.shape)

Index(['county', 'county_fips', 'date', 'level', 'pop_not_stay_at_home',
       'pop_stay_at_home', 'state_code', 'state_fips', 'trips', 'trips_1',
       'trips_100_250', 'trips_10_25', 'trips_1_3', 'trips_250_500',
       'trips_25_50', 'trips_3_5', 'trips_500', 'trips_50_100', 'trips_5_10'],
      dtype='object')
(2150000, 19)


In [7]:
# Display the first row to see what a raw record looks like.
df.head(1)

Unnamed: 0,county,county_fips,date,level,pop_not_stay_at_home,pop_stay_at_home,state_code,state_fips,trips,trips_1,trips_100_250,trips_10_25,trips_1_3,trips_250_500,trips_25_50,trips_3_5,trips_500,trips_50_100,trips_5_10
0,Putnam County,29171.0,2019-01-01T00:00:00.000,County,3587.0,1155.0,MO,29.0,12429.0,2807.0,101.0,1953.0,3642.0,54.0,1058.0,1272.0,19.0,283.0,1240.0


In [8]:
# Parse the timestamps once and derive both helper columns with vectorized
# .dt accessors instead of a per-row strptime call. Besides being much faster
# on millions of rows, this also keeps the cell re-runnable after "date" has
# been converted to datetime (strptime would fail on Timestamp values).
parsed_dates = pd.to_datetime(df["date"])
df["only_date"] = parsed_dates.dt.strftime('%m/%d')  # month/day label, e.g. "01/31"
df["weekday"] = parsed_dates.dt.weekday              # Monday=0 ... Sunday=6

In [9]:
# Replace the raw timestamp strings in "date" with parsed datetimes so the
# later cells can compare the column against datetime.datetime bounds.
df["date"] = pd.to_datetime(df["date"])
# Summary statistics of the numeric columns (rendered as the cell output).
df.describe()

Unnamed: 0,county_fips,pop_not_stay_at_home,pop_stay_at_home,state_fips,trips,trips_1,trips_100_250,trips_10_25,trips_1_3,trips_250_500,trips_25_50,trips_3_5,trips_500,trips_50_100,trips_5_10,weekday
count,2115160.0,2126992.0,2126992.0,2149330.0,2126992.0,2126992.0,2126992.0,2126992.0,2126992.0,2126992.0,2126992.0,2126992.0,2126992.0,2126992.0,2126992.0,2150000.0
mean,30377.65,242601.1,66549.04,30.2532,1133885.0,277186.5,7253.096,173343.2,284648.0,1669.321,55866.95,138960.4,1526.471,17730.42,175700.5,2.996802
std,15162.42,4655739.0,1291326.0,15.15376,22082980.0,5389963.0,142343.5,3402088.0,5553474.0,32927.14,1085626.0,2712088.0,34143.06,342085.1,3437073.0,1.99973
min,1001.0,-38.0,8.0,1.0,220.0,0.0,0.0,0.0,0.0,0.0,0.0,-108.0,0.0,0.0,0.0,0.0
25%,18175.0,9293.0,2138.0,18.0,42601.0,9053.0,371.0,6777.0,9590.0,56.0,3352.0,4049.0,18.0,1168.0,5549.0,1.0
50%,29175.0,21848.0,5056.0,29.0,102420.0,22075.0,875.0,16203.0,25433.0,167.0,7233.0,11647.0,67.0,2574.0,14339.0,3.0
75%,45081.0,59565.0,14861.0,45.0,282189.2,62283.0,2231.0,42986.0,73280.0,472.0,17142.0,35156.0,258.0,5958.0,41887.0,5.0
max,56045.0,273740000.0,110211800.0,56.0,1569053000.0,422700200.0,14476980.0,256509600.0,405130500.0,3651375.0,76367320.0,198018400.0,5003062.0,25539740.0,252611800.0,6.0


In [10]:
# 2019 window: 1 January to 31 October 2019, both ends inclusive.
start_date1, end_date1 = datetime.datetime(2019, 1, 1), datetime.datetime(2019, 10, 31)
df_2019 = df.loc[(df['date'] >= start_date1) & (df['date'] <= end_date1)]

# 2020 window: everything from 1 January 2020 onwards (no upper bound).
start_date2 = datetime.datetime(2020, 1, 1)
df_2020 = df.loc[df['date'] >= start_date2]

In [11]:
# Split each period into its three geographic levels. Every subset gets a
# fresh 0..n-1 index so later positional/index-aligned operations are safe.
national_2019, state_2019, county_2019 = (
    df_2019.loc[df_2019["level"] == level_name].reset_index(drop=True)
    for level_name in ("National", "State", "County")
)

national_2020, state_2020, county_2020 = (
    df_2020.loc[df_2020["level"] == level_name].reset_index(drop=True)
    for level_name in ("National", "State", "County")
)

In [12]:
# Distinct state codes present in the 2020 state-level rows, and how many there are.
state_names = state_2020["state_code"].unique()
print(len(state_names))

51


In [13]:
# Nested list of the six subsets: outer list = period (2019, then 2020),
# inner list = geographic level (national, state, county).
data_all = [[national_2019, state_2019, county_2019],
            [national_2020, state_2020, county_2020]]

In [14]:
# save cleaned data
data_folder_path = 'data/distance_data/'
# Output file stems, in the same flattened order as data_all (2019 then 2020,
# each as national, state, county).
data_names = ["national_2019", "state_2019", "county_2019",
             "national_2020", "state_2020", "county_2020"]
# Grouping column for the moving average at each geographic level, in the same
# order as the frames inside every year of data_all. None means the frame is a
# single series (national level) and needs no grouping.
level_group_keys = [None, "state_code", "county_fips"]


def add_stay_at_home_features(data, group_key=None):
    """Add ``pop``, ``avg_stay_at_home_ratio`` and ``MA_7`` columns to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``pop_stay_at_home``, ``pop_not_stay_at_home`` and ``date``
        columns. Its index must be unique so the moving average can be aligned
        back onto the rows (the level subsets built earlier in this notebook
        are reset to a 0..n-1 index).
    group_key : str or None
        Column identifying each region's series (e.g. ``"state_code"``), or
        None when the whole frame is one series.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenience.
    """
    data['pop'] = data['pop_stay_at_home'] + data['pop_not_stay_at_home']
    data['avg_stay_at_home_ratio'] = data['pop_stay_at_home']/data['pop']

    # The rolling window runs over consecutive rows, so it is only a 7-day
    # average if each series is in date order. Sort a view by date first
    # (stable sort, so ties keep their current order).
    by_date = data.sort_values('date', kind='mergesort')
    if group_key is None:
        ma_7 = by_date['avg_stay_at_home_ratio'].rolling(7, 1).mean()
    else:
        ma_7 = by_date.groupby(group_key)['avg_stay_at_home_ratio'].transform(
            lambda x: x.rolling(7, 1).mean())

    # Assignment aligns on the index, so `data` keeps its original row order.
    data['MA_7'] = ma_7
    return data


i = 0
for year in data_all:
    for data, group_key in zip(year, level_group_keys):
        add_stay_at_home_features(data, group_key)
        data.to_csv(data_folder_path+"%s.csv"%(data_names[i]), index = False)
        i += 1

# 2. Covid Cases

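We use the aggregated dataset by JHU as our primary source of COVID-19 case data.
**TODO:** the text credits JHU, but the source link below points to USAFacts — confirm which provider the downloaded files came from.

- Source link: https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/
- Manually downloaded data is stored in the ```data/covid_cases/``` directory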


# 3. County-level Social and Economics Data

We retrieved social, economic, housing, and demographic features of each county from the US Census Bureau (data.census.gov).

- Source links:
    - Economic data: https://data.census.gov/api/access/table/download?download_id=YvldrnUBoWMSCEcxoaag
    - Social data: https://data.census.gov/api/access/table/download?download_id=rI0EHXUB38sLX1nVBjAC
- Manually downloaded data is stored in the ```data/social_economics/``` directory


# 4. Policy Data

We collected information about control policies from KFF and the Berkeley group. 

- Source link: 
    - KFF: https://github.com/KFFData/COVID-19-Data/tree/kff_master/State%20Policy%20Actions/State%20Social%20Distancing%20Actions
    - Berkeley group: https://github.com/covidvis/covid19-vis/tree/master/data
    
- Manually downloaded data is stored in ```data/policy_data/``` directory

