# Social Distance Data
* Provided by the Maryland Transportation Institute and the Center for Advanced Transportation Technology Laboratory at the University of Maryland
* Access through this link: https://data.bts.gov/Research-and-Statistics/Trips-by-Distance/w96p-f2qv

In [4]:
!pip3 install sodapy
!pip3 install glob3
!pip install tqdm



In [1]:
from sodapy import Socrata# 
from glob import glob

In [2]:
import datetime
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib import dates
import plotly
import plotly.figure_factory as ff

In [3]:
# Folder that stores the downloaded chunk files. Keep the trailing slash:
# file paths are built from this value by plain string concatenation.
data_folder_path = '../data/distance_data/'

# The Socrata app token is read from the SOCRATA_APP_TOKEN environment variable
# instead of being committed with the notebook. When the variable is unset the
# client is created without a token (None), which still works for public
# datasets but is subject to stricter request throttling.
# NOTE(review): the token that used to be hard-coded here has been published
# with the notebook and should be treated as exposed.
client = Socrata("data.bts.gov", os.environ.get("SOCRATA_APP_TOKEN"))

def get_all_data(start = 0, end = 2139980, step = 50000):
    """Download rows [start, end) of dataset "w96p-f2qv" in chunks of `step` rows.

    Each chunk is written to `data_folder_path` as ``data_<offset>.csv``, where
    ``<offset>`` is the zero-padded row offset of the chunk's first row. With
    the default arguments this produces 43 files.

    Args:
        start: row offset of the first row to download.
        end: row offset one past the last row to download.
        step: maximum number of rows per request / per file.

    Raises:
        ValueError: if `step` is not a positive integer.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    total_rows = end - start
    print('Percentage downloaded:')
    # range(start, end, step) honours `start` and never issues a trailing
    # request that would only produce an empty file.
    for offset in range(start, end, step):
        print(round((offset - start)/total_rows,3),end=',')
        records = client.get(
            "w96p-f2qv",
            limit = min(step, end - offset),  # do not read past `end` in the last chunk
            offset = offset,
            order = ":id",  # a fixed ordering keeps pages from overlapping or skipping rows
        )
        one_part_df = pd.DataFrame.from_records(records)
        one_part_df.to_csv(data_folder_path+'data_'+str(offset).zfill(7)+'.csv',index=False)

get_all_data()


Percentage downloaded:
0.0,

KeyboardInterrupt: 

In [4]:

data_folder_path = '../data/distance_data/'

def aggregate_data_files(data_folder_path = data_folder_path):
    """Combine every ``data_*`` chunk file in a folder into a single dataframe.

    Files are read in sorted path order and stacked with a fresh 0..n-1 index.
    Columns are the alphabetically sorted union of the columns of all chunks,
    so chunks whose columns differ in order or presence still line up.

    Args:
        data_folder_path: folder holding the chunk files (with or without a
            trailing path separator).

    Returns:
        The combined dataframe, or an empty dataframe when no chunk file is found.
    """
    paths = sorted(glob(os.path.join(data_folder_path, 'data_*')))
    frames = []
    for i, path in enumerate(paths):
        if i%5 == 0:
            # coarse progress indicator: fraction of files read so far
            print(round(i/len(paths),3),end=', ')
        frames.append(pd.read_csv(path))
    if not frames:
        # pd.concat raises on an empty list; keep the "no files" case harmless
        return pd.DataFrame()
    # One concat at the end instead of growing the frame file by file, which
    # re-copies all previously read rows on every iteration.
    return pd.concat(frames, ignore_index=True, sort=True)

df = aggregate_data_files()

0.0, 


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





0.116, 0.233, 0.349, 0.465, 0.581, 0.698, 0.814, 0.93, 

In [5]:
# Quick sanity check of the combined frame: column names, then (rows, columns).
print(df.columns, df.shape, sep="\n")

Index(['county', 'county_fips', 'date', 'level', 'pop_not_stay_at_home',
       'pop_stay_at_home', 'state_code', 'state_fips', 'trips', 'trips_1',
       'trips_100_250', 'trips_10_25', 'trips_1_3', 'trips_250_500',
       'trips_25_50', 'trips_3_5', 'trips_500', 'trips_50_100', 'trips_5_10'],
      dtype='object')
(2139980, 19)


In [6]:
# Show the first row to see what a single record looks like.
df.iloc[:1]

Unnamed: 0,county,county_fips,date,level,pop_not_stay_at_home,pop_stay_at_home,state_code,state_fips,trips,trips_1,trips_100_250,trips_10_25,trips_1_3,trips_250_500,trips_25_50,trips_3_5,trips_500,trips_50_100,trips_5_10
0,Putnam County,29171.0,2019-01-01T00:00:00.000,County,3587.0,1155.0,MO,29.0,12429.0,2807.0,101.0,1953.0,3642.0,54.0,1058.0,1272.0,19.0,283.0,1240.0


In [7]:
# Parse the timestamp strings once, with the explicit format of the raw data
# (e.g. 2019-01-01T00:00:00.000), and derive both helper columns from that
# result using vectorised datetime accessors instead of a per-row strptime.
# This also keeps the cell re-runnable after `date` has been converted to a
# datetime column.
parsed_dates = pd.to_datetime(df["date"], format='%Y-%m-%dT%H:%M:%S.%f')
df["only_date"] = parsed_dates.dt.strftime('%m/%d')  # month/day label without the year
df["weekday"] = parsed_dates.dt.weekday  # 0 = Monday ... 6 = Sunday

In [8]:
# Turn the raw timestamp strings into datetimes so they can be compared with
# datetime objects, then show summary statistics of the frame.
df["date"] = df["date"].pipe(pd.to_datetime)
df.describe()

Unnamed: 0,county_fips,pop_not_stay_at_home,pop_stay_at_home,state_fips,trips,trips_1,trips_100_250,trips_10_25,trips_1_3,trips_250_500,trips_25_50,trips_3_5,trips_500,trips_50_100,trips_5_10,weekday
count,2105140.0,2116999.0,2116999.0,2139310.0,2116999.0,2116999.0,2116999.0,2116999.0,2116999.0,2116999.0,2116999.0,2116999.0,2116999.0,2116999.0,2116999.0,2139980.0
mean,30383.65,243371.3,66726.73,30.259,1137762.0,278145.0,7276.928,173950.0,285597.7,1675.331,56054.88,139433.9,1532.984,17785.18,176309.9,3.0
std,15160.1,4666669.0,1294350.0,15.15153,22134850.0,5402619.0,142678.3,3410082.0,5566518.0,33004.56,1088177.0,2718459.0,34223.39,342889.2,3445148.0,1.996266
min,1001.0,-38.0,8.0,1.0,220.0,0.0,0.0,0.0,0.0,0.0,0.0,-108.0,0.0,0.0,0.0,0.0
25%,18177.0,9297.0,2137.0,18.0,42655.5,9070.0,371.0,6785.0,9600.0,56.0,3355.0,4054.0,18.0,1168.0,5557.0,1.0
50%,29176.0,21857.0,5052.0,29.0,102567.0,22113.0,875.0,16226.0,25459.0,167.0,7240.0,11662.0,68.0,2574.0,14360.0,3.0
75%,45081.0,59596.0,14850.0,45.0,282579.5,62386.5,2232.0,43051.0,73359.5,472.0,17162.0,35204.0,259.0,5959.0,41951.5,5.0
max,56045.0,273740000.0,110211800.0,56.0,1569053000.0,422700200.0,14476980.0,256509600.0,405130500.0,3651375.0,76367320.0,198018400.0,5003062.0,25539740.0,252611800.0,6.0


In [9]:
# 2019 window: 1 January through 31 October, both endpoints included.
start_date1 = datetime.datetime(2019,1,1)
end_date1 = datetime.datetime(2019,10,31)
df_2019 = df[df["date"].between(start_date1, end_date1)]

# 2020 window: everything dated 1 January 2020 or later.
start_date2 = datetime.datetime(2020,1,1)
df_2020 = df.loc[df["date"].ge(start_date2)]

In [10]:
# Split each year's rows by the value of the `level` column and renumber the
# rows of every subset from 0.
level_names = ("National", "State", "County")

national_2019, state_2019, county_2019 = [
    df_2019.loc[df_2019["level"] == level_name].reset_index(drop = True)
    for level_name in level_names
]

national_2020, state_2020, county_2020 = [
    df_2020.loc[df_2020["level"] == level_name].reset_index(drop = True)
    for level_name in level_names
]

In [11]:
# Distinct state codes present in the 2020 state-level rows, and how many there are.
state_names = pd.unique(state_2020["state_code"])
print(len(state_names))

51


In [12]:
# Nested list of the six subsets: outer index = year, inner index = level
# in the order national, state, county.
frames_2019 = [national_2019, state_2019, county_2019]
frames_2020 = [national_2020, state_2020, county_2020]
data_all = [frames_2019, frames_2020]

In [13]:
# save cleaned data
data_folder_path = '../data/distance_data/'
data_names = ["national_2019", "state_2019", "county_2019",
             "national_2020", "state_2020", "county_2020"]
# Column that identifies one time series at each level, in the same
# [national, state, county] order as every inner list of data_all.
# None means the whole frame is a single series.
level_group_keys = [None, "state_code", "county_fips"]


def add_stay_at_home_features(data, group_key = None):
    """Add `pop`, `avg_stay_at_home_ratio` and `MA_7` columns to `data` in place.

    `pop` is the sum of the stay-at-home and not-stay-at-home populations,
    `avg_stay_at_home_ratio` is the stay-at-home share of `pop`, and `MA_7` is
    the mean of that ratio over the current and up to six preceding rows of the
    same series, taken in date order.

    Args:
        data: dataframe with `date`, `pop_stay_at_home` and
            `pop_not_stay_at_home` columns and a unique index.
        group_key: column whose values separate the individual series, or None
            when all rows belong to one series.

    Returns:
        The same dataframe object, for convenience.
    """
    data['pop'] = data['pop_stay_at_home'] + data['pop_not_stay_at_home']
    data['avg_stay_at_home_ratio'] = data['pop_stay_at_home']/data['pop']

    # The rolling window is positional, so it must run over rows in date
    # order; nothing earlier in the notebook sorts the rows. Sorting a
    # temporary view and assigning the result back by index keeps the row
    # order of `data` (and therefore of the saved file) unchanged.
    ordered = data.sort_values("date", kind = "mergesort")
    if group_key is None:
        moving_average = ordered['avg_stay_at_home_ratio'].rolling(7, min_periods = 1).mean()
    else:
        moving_average = ordered.groupby(group_key)['avg_stay_at_home_ratio'].transform(
            lambda x: x.rolling(7, min_periods = 1).mean())
    data['MA_7'] = moving_average
    return data


i = 0
for year in data_all:
    for data, group_key in zip(year, level_group_keys):
        add_stay_at_home_features(data, group_key)
        data.to_csv(data_folder_path+"%s.csv"%(data_names[i]), index = False)
        i += 1

ModuleNotFoundError: No module named 'numpy.core._multiarray_umath'