<a href="https://colab.research.google.com/github/AnjaDeric/MDA-TeamCroatia/blob/main/Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from datetime import date, timedelta
pd.options.mode.chained_assignment = None

# 1 Geographical County/State Data

Load the original info dataset from the Johns Hopkins GitHub page. This data includes all counties (including the US and all US territories). It will thus be filtered to extract only the necessary US state and county data.

In [5]:
url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
geo_info = pd.read_csv(url)
geo_info.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population
0,4,AF,AFG,4.0,,,,Afghanistan,33.93911,67.709953,Afghanistan,38928341.0
1,8,AL,ALB,8.0,,,,Albania,41.1533,20.1683,Albania,2877800.0
2,10,AQ,ATA,10.0,,,,Antarctica,-71.9499,23.347,Antarctica,
3,12,DZ,DZA,12.0,,,,Algeria,28.0339,1.6596,Algeria,43851043.0
4,20,AD,AND,20.0,,,,Andorra,42.5063,1.5218,Andorra,77265.0


## 1.1 Prepare State-Level Data

From the full data set, extract only US state data (excluding all US territories and similar). Only keep relevant columns.

In [6]:
state_info = geo_info.iloc[999:1050]
state_info = state_info[['FIPS','Province_State','Lat','Long_','Population']]
state_info.rename(columns = {'FIPS':'fips', 'Province_State':'state',
                      'Lat':'lat', 'Long_':'long', 'Population':'population'}, inplace = True)
state_info.head()

Unnamed: 0,fips,state,lat,long,population
999,1.0,Alabama,32.3182,-86.9023,4903185.0
1000,2.0,Alaska,61.3707,-152.4044,731545.0
1001,4.0,Arizona,33.7298,-111.4312,7278717.0
1002,5.0,Arkansas,34.9697,-92.3731,3017804.0
1003,6.0,California,36.1162,-119.6816,39512223.0


Reformat FIPS to be a 2 character string.

In [7]:
state_info.fips = (state_info.fips.astype(int)).astype(object)
state_info['fips'] = state_info['fips'].apply('{:0>2}'.format)
state_info.head()

Unnamed: 0,fips,state,lat,long,population
999,1,Alabama,32.3182,-86.9023,4903185.0
1000,2,Alaska,61.3707,-152.4044,731545.0
1001,4,Arizona,33.7298,-111.4312,7278717.0
1002,5,Arkansas,34.9697,-92.3731,3017804.0
1003,6,California,36.1162,-119.6816,39512223.0


Keep data only for continental US -- that is, remove Hawaii and Alaska from the data set.

In [8]:
state_info = state_info[(state_info.state != "Alaska") & (state_info.state != "Hawaii")]
state_info.reset_index(inplace=True, drop = True)
state_info.head()

Unnamed: 0,fips,state,lat,long,population
0,1,Alabama,32.3182,-86.9023,4903185.0
1,4,Arizona,33.7298,-111.4312,7278717.0
2,5,Arkansas,34.9697,-92.3731,3017804.0
3,6,California,36.1162,-119.6816,39512223.0
4,8,Colorado,39.0598,-105.3111,5758736.0


## 1.2 Prepare County-Level Data

From the full data set, extract only data for US counties.

In [9]:
county_info = geo_info.iloc[1173:4318]
county_info.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population
1173,84001001,US,USA,840.0,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",55869.0
1174,84001003,US,USA,840.0,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",223234.0
1175,84001005,US,USA,840.0,1005.0,Barbour,Alabama,US,31.868263,-85.387129,"Barbour, Alabama, US",24686.0
1176,84001007,US,USA,840.0,1007.0,Bibb,Alabama,US,32.996421,-87.125115,"Bibb, Alabama, US",22394.0
1177,84001009,US,USA,840.0,1009.0,Blount,Alabama,US,33.982109,-86.567906,"Blount, Alabama, US",57826.0


In [10]:
county_info = county_info[['FIPS','Admin2','Province_State','Lat','Long_','Population']]
county_info.rename(columns = {'FIPS':'fips', 'Province_State':'state', 'Admin2':'county',
                      'Lat':'lat', 'Long_':'long', 'Population':'population'}, inplace = True)
county_info.head()

Unnamed: 0,fips,county,state,lat,long,population
1173,1001.0,Autauga,Alabama,32.539527,-86.644082,55869.0
1174,1003.0,Baldwin,Alabama,30.72775,-87.722071,223234.0
1175,1005.0,Barbour,Alabama,31.868263,-85.387129,24686.0
1176,1007.0,Bibb,Alabama,32.996421,-87.125115,22394.0
1177,1009.0,Blount,Alabama,33.982109,-86.567906,57826.0


In [11]:
county_info.fips = (county_info.fips.astype(int)).astype(object)
county_info['fips'] = county_info['fips'].apply('{:0>5}'.format)
county_info.head()

Unnamed: 0,fips,county,state,lat,long,population
1173,1001,Autauga,Alabama,32.539527,-86.644082,55869.0
1174,1003,Baldwin,Alabama,30.72775,-87.722071,223234.0
1175,1005,Barbour,Alabama,31.868263,-85.387129,24686.0
1176,1007,Bibb,Alabama,32.996421,-87.125115,22394.0
1177,1009,Blount,Alabama,33.982109,-86.567906,57826.0


In [12]:
county_info = county_info[(county_info.state != "Alaska") & (county_info.state != "Hawaii")]
county_info.reset_index(inplace=True, drop = True)
county_info.head()

Unnamed: 0,fips,county,state,lat,long,population
0,1001,Autauga,Alabama,32.539527,-86.644082,55869.0
1,1003,Baldwin,Alabama,30.72775,-87.722071,223234.0
2,1005,Barbour,Alabama,31.868263,-85.387129,24686.0
3,1007,Bibb,Alabama,32.996421,-87.125115,22394.0
4,1009,Blount,Alabama,33.982109,-86.567906,57826.0


Quick check to make sure all states in state_info appear in county_info and vice-versa (no extra states/counties).

In [13]:
state_info['state'].unique() == county_info['state'].unique()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

## 1.3 Export State and County General Data


In [14]:
state_info.to_csv('state_info.csv', index=False)
county_info.to_csv('county_info.csv', index=False)

# 2 COVID Data

As mentioned, we will be getting our COVID data from the Johns Hopkins GitHub database. In this database, each day has its own dedicated csv file named in the mm-dd-yyy format.In order to access the data, we will first generate a list of dates that we want to collect data for and that can later be incorporated into the URL we will be reading from.

Note: A week of data from 2020 will be included in the data set as we will later on be converting cumulative cases into active cases.

In [28]:
dates = pd.date_range(date(2020,12,25),date(2022,1,1)-timedelta(days=1),freq='d').strftime('%m-%d-%Y')
dates

Index(['12-25-2020', '12-26-2020', '12-27-2020', '12-28-2020', '12-29-2020',
       '12-30-2020', '12-31-2020', '01-01-2021', '01-02-2021', '01-03-2021',
       ...
       '12-22-2021', '12-23-2021', '12-24-2021', '12-25-2021', '12-26-2021',
       '12-27-2021', '12-28-2021', '12-29-2021', '12-30-2021', '12-31-2021'],
      dtype='object', length=372)

## 2.1 Get COVID data

In this section, we get data on confirmed cases from the Johns Hopkings GitHub repository by accessing and storing data for each day of 2021.

Note: To skip running this code (and avoid waiting), run the first cell in Section 2.2, which will get the data gathered in Section 2.1 from our own GitHub repository.

### 2.1.1 DataFrame Set Up

To store the data, we will start with the base county_info dataframe. Then, we will individually load the data file for each date, extract the data that we need, and add it to the county_info data frame. In the end, county_info will contain all the county-specific demographic/geo data, as well as all the covid data.

Note: Johns Hopkins updates their COVID data base daily, sometimes including the FIPS file used in the first section. For that reason, we will be loading the county_info data from the final file generated in the previous section and stored in our own GitHub repository.

In [30]:
url = 'https://raw.githubusercontent.com/AnjaDeric/MDA-TeamCroatia/main/county_info.csv?token=GHSAT0AAAAAABTD7Q46E5QXWT4E4MXKVW7WYT5FAGQ'
county_info = pd.read_csv(url)
county_info['fips'] = county_info['fips'].apply('{:0>5}'.format)

In [31]:
covid_data = county_info.copy()
covid_data.head()

Unnamed: 0,fips,county,state,lat,long,population
0,1001,Autauga,Alabama,32.539527,-86.644082,55869
1,1003,Baldwin,Alabama,30.72775,-87.722071,223234
2,1005,Barbour,Alabama,31.868263,-85.387129,24686
3,1007,Bibb,Alabama,32.996421,-87.125115,22394
4,1009,Blount,Alabama,33.982109,-86.567906,57826


In [32]:
county_fips = covid_data['fips'].unique()

### 2.1.2 Get Data from Johns Hopkins GitHub

In [33]:
base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/"

for date in dates:
  url = base_url + date + ".csv"
  c2021 = pd.read_csv(url)

  # fill in missing FIPS with 0s and format to 5 digits
  c2021['FIPS'] = c2021['FIPS'].fillna(0)
  c2021.FIPS = (c2021.FIPS.astype(int)).astype(object)
  c2021['FIPS'] = c2021['FIPS'].apply('{:0>5}'.format)
  c2021.rename(columns = {'FIPS':'fips'}, inplace = True)

  # only keep rows were FIPS matches one of the US county FIPS
  c2021_counties = c2021[c2021['fips'].isin(county_fips)]

  covid_data = pd.merge(covid_data, c2021_counties[['fips','Confirmed']], on="fips", how="left")
  date_col_name = 'd' + date.split('-')[0] + date.split('-')[1] + date.split('-')[2];
  covid_data.rename(columns = {'Confirmed':date_col_name}, inplace = True)


In [34]:
covid_data.head()

Unnamed: 0,fips,county,state,lat,long,population,d12252020,d12262020,d12272020,d12282020,...,d12222021,d12232021,d12242021,d12252021,d12262021,d12272021,d12282021,d12292021,d12302021,d12312021
0,1001,Autauga,Alabama,32.539527,-86.644082,55869,3990.0,3999.0,4029.0,4065.0,...,10711.0,10746.0,10758.0,10785.0,10789.0,10800.0,10828.0,10886.0,10953.0,11018.0
1,1003,Baldwin,Alabama,30.72775,-87.722071,223234,12666.0,12708.0,12825.0,12962.0,...,38628.0,38690.0,38771.0,38864.0,38878.0,38942.0,39084.0,39298.0,39583.0,39911.0
2,1005,Barbour,Alabama,31.868263,-85.387129,24686,1396.0,1398.0,1406.0,1417.0,...,3726.0,3730.0,3735.0,3740.0,3742.0,3742.0,3750.0,3777.0,3819.0,3860.0
3,1007,Bibb,Alabama,32.996421,-87.125115,22394,1725.0,1739.0,1746.0,1762.0,...,4409.0,4410.0,4410.0,4414.0,4420.0,4426.0,4452.0,4472.0,4514.0,4533.0
4,1009,Blount,Alabama,33.982109,-86.567906,57826,4441.0,4446.0,4465.0,4483.0,...,11002.0,11031.0,11041.0,11053.0,11064.0,11072.0,11096.0,11128.0,11193.0,11256.0


In [35]:
covid_data.to_csv('covid_data_raw.csv', index=False)

## 2.2 Cleaning COVID Data

In [38]:
# run to load saved data so you don't have to wait
# get data for other Utah counties
covid_data = pd.read_csv('https://raw.githubusercontent.com/AnjaDeric/MDA-TeamCroatia/main/covid_data_raw.csv')
covid_data['fips'] = covid_data['fips'].apply('{:0>5}'.format)
covid_data.head()

Unnamed: 0,fips,county,state,lat,long,population,d12252020,d12262020,d12272020,d12282020,...,d12222021,d12232021,d12242021,d12252021,d12262021,d12272021,d12282021,d12292021,d12302021,d12312021
0,1001,Autauga,Alabama,32.539527,-86.644082,55869,3990.0,3999.0,4029.0,4065.0,...,10711.0,10746.0,10758.0,10785.0,10789.0,10800.0,10828.0,10886.0,10953.0,11018.0
1,1003,Baldwin,Alabama,30.72775,-87.722071,223234,12666.0,12708.0,12825.0,12962.0,...,38628.0,38690.0,38771.0,38864.0,38878.0,38942.0,39084.0,39298.0,39583.0,39911.0
2,1005,Barbour,Alabama,31.868263,-85.387129,24686,1396.0,1398.0,1406.0,1417.0,...,3726.0,3730.0,3735.0,3740.0,3742.0,3742.0,3750.0,3777.0,3819.0,3860.0
3,1007,Bibb,Alabama,32.996421,-87.125115,22394,1725.0,1739.0,1746.0,1762.0,...,4409.0,4410.0,4410.0,4414.0,4420.0,4426.0,4452.0,4472.0,4514.0,4533.0
4,1009,Blount,Alabama,33.982109,-86.567906,57826,4441.0,4446.0,4465.0,4483.0,...,11002.0,11031.0,11041.0,11053.0,11064.0,11072.0,11096.0,11128.0,11193.0,11256.0


### 2.2.1 Missing Values

After gathering all the confirmed cases, we take a look at any rows with missing values. The only columns with missing values are the covid case count columns. 24 rows in total are missing all case count data.

In [39]:
covid_data[covid_data.isnull().any(axis=1)]

Unnamed: 0,fips,county,state,lat,long,population,d12252020,d12262020,d12272020,d12282020,...,d12222021,d12232021,d12242021,d12252021,d12262021,d12272021,d12282021,d12292021,d12302021,d12312021
1186,25007,Dukes,Massachusetts,41.406747,-70.687635,17332,,,,,...,,,,,,,,,,
1192,25019,Nantucket,Massachusetts,41.294202,-70.087747,11399,,,,,...,,,,,,,,,,
2743,49001,Beaver,Utah,38.356571,-113.234223,6710,,,,,...,,,,,,,,,,
2744,49003,Box Elder,Utah,41.521068,-113.083282,56046,,,,,...,,,,,,,,,,
2745,49005,Cache,Utah,41.723306,-111.744367,128289,,,,,...,,,,,,,,,,
2746,49007,Carbon,Utah,39.648348,-110.587251,20463,,,,,...,,,,,,,,,,
2747,49009,Daggett,Utah,40.887983,-109.512109,950,,,,,...,,,,,,,,,,
2749,49013,Duchesne,Utah,40.297723,-110.425237,19938,,,,,...,,,,,,,,,,
2750,49015,Emery,Utah,38.996171,-110.701396,10012,,,,,...,,,,,,,,,,
2751,49017,Garfield,Utah,37.854472,-111.441876,5051,,,,,...,,,,,,,,,,


These rows can be split into 2 categories:



1.   Cases from Dukes and Nantucket counties from Massachussetts. In the COVID case data sets, these 2 counties are combined into one. However, because they are both islands and do not connect to continental US by road, we will exclude these from our final data set.
2.   Cases from Utah counties. Only around 7 counties for Utah actually have COVID data reported in this data set. To get data for the rest, we will be looking at a data set from another source (NY Times).



We first drop Nantucket and Dukes from the COVID data set. We additionally drop them from the county_info file.

In [40]:
# drop nantucket and dukes from COVID dataset
covid_data.drop(covid_data.loc[covid_data['fips']=='25007'].index, inplace=True)
covid_data.drop(covid_data.loc[covid_data['fips']=='25019'].index, inplace=True)

# drop nantucket and dukes from county_info dataset
county_info.drop(county_info.loc[county_info['fips']=='25007'].index, inplace=True)
county_info.drop(county_info.loc[county_info['fips']=='25019'].index, inplace=True)

county_fips = covid_data['fips'].unique()

For Utah counties, we get COVID data from NY times. For the time being, we drop the rows containing these Utah counties from the master covid_data file and work on it in a separate data frame. We first get a list of all Utah county FIPS that are missing data.

In [41]:
missing_data = covid_data[covid_data.isnull().any(axis=1)]
missing_fips = missing_data['fips'].unique()
missing_fips

array(['49001', '49003', '49005', '49007', '49009', '49013', '49015',
       '49017', '49019', '49021', '49023', '49025', '49027', '49029',
       '49031', '49033', '49039', '49041', '49047', '49053', '49055',
       '49057'], dtype=object)

In [42]:
# drop these rows from master file as we will be working on them separately
covid_data.drop(covid_data.loc[covid_data.isnull().any(axis=1)].index, inplace=True)

Now, we load the data from NY Times.

In [46]:
# get data for other Utah counties
nytimes_21 = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties-2021.csv")
nytimes_21.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2021-01-01,Autauga,Alabama,1001.0,4239,50.0
1,2021-01-01,Baldwin,Alabama,1003.0,13823,169.0
2,2021-01-01,Barbour,Alabama,1005.0,1517,33.0
3,2021-01-01,Bibb,Alabama,1007.0,1854,46.0
4,2021-01-01,Blount,Alabama,1009.0,4693,63.0


In [47]:
# get data for other Utah counties
nytimes_20 = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties-2020.csv")
nytimes_20.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0


Reformat the FIPS code to match the 5-digit format so correct Utah counties can be extracted.



In [54]:
def extract_times_data(nytimes):
  # reformat FIPS to match 5-digit format
  nytimes['fips'] = nytimes['fips'].fillna(0)
  nytimes.fips = (nytimes.fips.astype(int)).astype(object)
  nytimes['fips'] = nytimes['fips'].apply('{:0>5}'.format)

  # get only rows with missing Utah counties
  nytimes = nytimes[nytimes['fips'].isin(missing_fips)]

  # change date to format used for covid_data columns
  nytimes[['yy','mm','dd']]=nytimes['date'].str.split('-', expand=True)
  nytimes["date"] = "d" + nytimes["mm"] + nytimes["dd"] + nytimes["yy"]

  # transform into wide table and add FIPS code
  nytimes_wide = nytimes.pivot(index='county',columns='date',values='cases')
  nytimes_wide['fips'] = nytimes['fips'].unique()

  return nytimes_wide

In [55]:
nytimes_wide_21 = extract_times_data(nytimes_21)
nytimes_wide_21.head()

date,d01012021,d01022021,d01032021,d01042021,d01052021,d01062021,d01072021,d01082021,d01092021,d01102021,...,d12232021,d12242021,d12252021,d12262021,d12272021,d12282021,d12292021,d12302021,d12312021,fips
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Beaver,405,410,410,414,424,427,436,439,445,445,...,1231,1231,1231,1231,1232,1237,1240,1240,1243,49001
Box Elder,3571,3658,3704,3733,3778,3872,3945,3997,4026,4053,...,10283,10283,10283,10283,10329,10339,10364,10364,10393,49003
Cache,11167,11351,11417,11500,11620,11807,11941,12065,12144,12211,...,24702,24702,24702,24702,24896,24943,25077,25077,25185,49005
Carbon,1072,1075,1079,1096,1101,1113,1127,1138,1147,1147,...,3712,3713,3713,3713,3713,3719,3728,3735,3749,49007
Daggett,18,18,18,18,20,21,22,22,22,22,...,120,120,120,120,119,122,123,123,123,49009


In [56]:
nytimes_wide_20 = extract_times_data(nytimes_20)
nytimes_wide_20.drop(nytimes_wide_20.columns[0:len(nytimes_wide_20.columns)-8], axis=1, inplace=True)
nytimes_wide_20.head()


date,d12252020,d12262020,d12272020,d12282020,d12292020,d12302020,d12312020,fips
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Beaver,341.0,355.0,355.0,362.0,375.0,395.0,405.0,49053
Box Elder,3340.0,3381.0,3402.0,3428.0,3464.0,3519.0,3571.0,49057
Cache,10539.0,10700.0,10723.0,10790.0,10887.0,10967.0,11167.0,49003
Carbon,1021.0,1028.0,1031.0,1048.0,1056.0,1064.0,1072.0,49005
Daggett,15.0,15.0,15.0,17.0,17.0,17.0,18.0,49021


In [58]:
missing_data_base = missing_data[['fips','county','state','lat','long','population']]
missing_data_all = pd.merge(missing_data_base, nytimes_wide_20, on='fips', how="left")
missing_data_all = pd.merge(missing_data_all, nytimes_wide_21, on='fips', how="left")
missing_data_all

Unnamed: 0,fips,county,state,lat,long,population,d12252020,d12262020,d12272020,d12282020,...,d12222021,d12232021,d12242021,d12252021,d12262021,d12272021,d12282021,d12292021,d12302021,d12312021
0,49001,Beaver,Utah,38.356571,-113.234223,6710,13797.0,13933.0,13933.0,14126.0,...,1231,1231,1231,1231,1231,1232,1237,1240,1240,1243
1,49003,Box Elder,Utah,41.521068,-113.083282,56046,10539.0,10700.0,10723.0,10790.0,...,10271,10283,10283,10283,10283,10329,10339,10364,10364,10393
2,49005,Cache,Utah,41.723306,-111.744367,128289,1021.0,1028.0,1031.0,1048.0,...,24658,24702,24702,24702,24702,24896,24943,25077,25077,25185
3,49007,Carbon,Utah,39.648348,-110.587251,20463,2964.0,3001.0,3001.0,3062.0,...,3708,3712,3713,3713,3713,3713,3719,3728,3735,3749
4,49009,Daggett,Utah,40.887983,-109.512109,950,18124.0,18124.0,18451.0,18606.0,...,118,120,120,120,120,119,122,123,123,123
5,49013,Duchesne,Utah,40.297723,-110.425237,19938,696.0,696.0,696.0,726.0,...,3198,3207,3207,3207,3207,3217,3223,3228,3228,3237
6,49015,Emery,Utah,38.996171,-110.701396,10012,320.0,323.0,323.0,327.0,...,1738,1740,1740,1740,1740,1740,1744,1747,1750,1761
7,49017,Garfield,Utah,37.854472,-111.441876,5051,282.0,283.0,283.0,294.0,...,734,734,734,734,734,736,737,739,739,742
8,49019,Grand,Utah,38.981038,-109.570449,9754,841.0,841.0,841.0,902.0,...,1671,1672,1672,1672,1672,1672,1681,1695,1697,1707
9,49021,Iron,Utah,37.859036,-113.289738,54839,15.0,15.0,15.0,17.0,...,10313,10326,10326,10326,10326,10371,10395,10439,10439,10496


Finally, add the Utah county data to the master data frame for covid cases.

In [59]:
covid_data_final = pd.concat([covid_data,missing_data_all],ignore_index=True)
covid_data_final

Unnamed: 0,fips,county,state,lat,long,population,d12252020,d12262020,d12272020,d12282020,...,d12222021,d12232021,d12242021,d12252021,d12262021,d12272021,d12282021,d12292021,d12302021,d12312021
0,01001,Autauga,Alabama,32.539527,-86.644082,55869,3990.0,3999.0,4029.0,4065.0,...,10711.0,10746.0,10758.0,10785.0,10789.0,10800.0,10828.0,10886.0,10953.0,11018.0
1,01003,Baldwin,Alabama,30.727750,-87.722071,223234,12666.0,12708.0,12825.0,12962.0,...,38628.0,38690.0,38771.0,38864.0,38878.0,38942.0,39084.0,39298.0,39583.0,39911.0
2,01005,Barbour,Alabama,31.868263,-85.387129,24686,1396.0,1398.0,1406.0,1417.0,...,3726.0,3730.0,3735.0,3740.0,3742.0,3742.0,3750.0,3777.0,3819.0,3860.0
3,01007,Bibb,Alabama,32.996421,-87.125115,22394,1725.0,1739.0,1746.0,1762.0,...,4409.0,4410.0,4410.0,4414.0,4420.0,4426.0,4452.0,4472.0,4514.0,4533.0
4,01009,Blount,Alabama,33.982109,-86.567906,57826,4441.0,4446.0,4465.0,4483.0,...,11002.0,11031.0,11041.0,11053.0,11064.0,11072.0,11096.0,11128.0,11193.0,11256.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3101,49041,Sevier,Utah,38.748371,-111.805027,21620,63.0,63.0,63.0,63.0,...,4086.0,4094.0,4094.0,4094.0,4094.0,4128.0,4142.0,4160.0,4160.0,4172.0
3102,49047,Uintah,Utah,40.124915,-109.517442,35734,527.0,529.0,529.0,536.0,...,5663.0,5678.0,5678.0,5678.0,5678.0,5693.0,5697.0,5710.0,5710.0,5721.0
3103,49053,Washington,Utah,37.280035,-113.504698,177556,341.0,355.0,355.0,362.0,...,32240.0,32303.0,32303.0,32303.0,32303.0,32560.0,32639.0,32767.0,32767.0,32888.0
3104,49055,Wayne,Utah,38.323358,-110.909680,2711,66.0,66.0,66.0,66.0,...,298.0,301.0,301.0,301.0,301.0,302.0,302.0,303.0,303.0,304.0


We check one final time to see if there are any rows with any missing data.

In [60]:
covid_data_final[covid_data_final.isnull().any(axis=1)]

Unnamed: 0,fips,county,state,lat,long,population,d12252020,d12262020,d12272020,d12282020,...,d12222021,d12232021,d12242021,d12252021,d12262021,d12272021,d12282021,d12292021,d12302021,d12312021


Since there are no more missing cells, we save the final csv file.

In [61]:
covid_data_final.to_csv('covid_data_clean.csv', index=False)

### 2.2.2 Formatting

In [None]:
covid_data_merged = covid_data.copy()
covid_data_merged.head()

Unnamed: 0,fips,county,state,lat,long,population,d0101,d0102,d0103,d0104,...,d1221,d1222,d1223,d1224,d1225,d1226,d1227,d1228,d1229,d1230
0,1001,Autauga,Alabama,32.539527,-86.644082,55869,4239.0,4268.0,4305.0,4336.0,...,10692.0,10711.0,10746.0,10758.0,10785.0,10789.0,10800.0,10828.0,10886.0,10953.0
1,1003,Baldwin,Alabama,30.72775,-87.722071,223234,13823.0,13955.0,14064.0,14187.0,...,38569.0,38628.0,38690.0,38771.0,38864.0,38878.0,38942.0,39084.0,39298.0,39583.0
2,1005,Barbour,Alabama,31.868263,-85.387129,24686,1517.0,1528.0,1530.0,1533.0,...,3725.0,3726.0,3730.0,3735.0,3740.0,3742.0,3742.0,3750.0,3777.0,3819.0
3,1007,Bibb,Alabama,32.996421,-87.125115,22394,1854.0,1863.0,1882.0,1885.0,...,4406.0,4409.0,4410.0,4410.0,4414.0,4420.0,4426.0,4452.0,4472.0,4514.0
4,1009,Blount,Alabama,33.982109,-86.567906,57826,4693.0,4729.0,4746.0,4771.0,...,10990.0,11002.0,11031.0,11041.0,11053.0,11064.0,11072.0,11096.0,11128.0,11193.0


In [None]:
covid_data_merged['cases'] = covid_data_merged[covid_data_merged.columns[7:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),axis=1)
covid_data_merged.head()

Unnamed: 0,fips,county,state,lat,long,population,d0101,d0102,d0103,d0104,...,d1222,d1223,d1224,d1225,d1226,d1227,d1228,d1229,d1230,cases
0,1001,Autauga,Alabama,32.539527,-86.644082,55869,4239.0,4268.0,4305.0,4336.0,...,10711.0,10746.0,10758.0,10785.0,10789.0,10800.0,10828.0,10886.0,10953.0,"4268.0,4305.0,4336.0,4546.0,4645.0,4705.0,4770..."
1,1003,Baldwin,Alabama,30.72775,-87.722071,223234,13823.0,13955.0,14064.0,14187.0,...,38628.0,38690.0,38771.0,38864.0,38878.0,38942.0,39084.0,39298.0,39583.0,"13955.0,14064.0,14187.0,14440.0,14656.0,14845...."
2,1005,Barbour,Alabama,31.868263,-85.387129,24686,1517.0,1528.0,1530.0,1533.0,...,3726.0,3730.0,3735.0,3740.0,3742.0,3742.0,3750.0,3777.0,3819.0,"1528.0,1530.0,1533.0,1575.0,1597.0,1614.0,1634..."
3,1007,Bibb,Alabama,32.996421,-87.125115,22394,1854.0,1863.0,1882.0,1885.0,...,4409.0,4410.0,4410.0,4414.0,4420.0,4426.0,4452.0,4472.0,4514.0,"1863.0,1882.0,1885.0,1923.0,1944.0,1981.0,2015..."
4,1009,Blount,Alabama,33.982109,-86.567906,57826,4693.0,4729.0,4746.0,4771.0,...,11002.0,11031.0,11041.0,11053.0,11064.0,11072.0,11096.0,11128.0,11193.0,"4729.0,4746.0,4771.0,4849.0,4898.0,4957.0,5018..."


In [None]:
covid_data_merged.drop(covid_data_merged.columns[6:370], axis=1, inplace=True)
covid_data_merged.head()

Unnamed: 0,fips,county,state,lat,long,population,cases
0,1001,Autauga,Alabama,32.539527,-86.644082,55869,"4268.0,4305.0,4336.0,4546.0,4645.0,4705.0,4770..."
1,1003,Baldwin,Alabama,30.72775,-87.722071,223234,"13955.0,14064.0,14187.0,14440.0,14656.0,14845...."
2,1005,Barbour,Alabama,31.868263,-85.387129,24686,"1528.0,1530.0,1533.0,1575.0,1597.0,1614.0,1634..."
3,1007,Bibb,Alabama,32.996421,-87.125115,22394,"1863.0,1882.0,1885.0,1923.0,1944.0,1981.0,2015..."
4,1009,Blount,Alabama,33.982109,-86.567906,57826,"4729.0,4746.0,4771.0,4849.0,4898.0,4957.0,5018..."


## 2.3 Getting Active Case Counts

In the current data set (created in Section 2.2), the case counts listed are cumulative. For instance, in Autauga, AL, there are 4268 cumulative cases on January 1st, and 4305 cases on January 2nd. This means that on January second, 38 new cases were detected, on top of the already active cases. A case of COVID can be considered active 7 days after its detection. 

In order to give the best travel recommendations, we want to work with active cases, as opposed to cumulative or newly detected cases. The following section converts the cumulative case counts into active case counts. 

In [64]:
# run to load saved data so you don't have to wait for previous steps
covid_data = pd.read_csv('https://raw.githubusercontent.com/AnjaDeric/MDA-TeamCroatia/main/covid_data_clean.csv')
covid_data['fips'] = covid_data['fips'].apply('{:0>5}'.format)
covid_data.head()

Unnamed: 0,fips,county,state,lat,long,population,d12252020,d12262020,d12272020,d12282020,...,d12222021,d12232021,d12242021,d12252021,d12262021,d12272021,d12282021,d12292021,d12302021,d12312021
0,1001,Autauga,Alabama,32.539527,-86.644082,55869,3990.0,3999.0,4029.0,4065.0,...,10711.0,10746.0,10758.0,10785.0,10789.0,10800.0,10828.0,10886.0,10953.0,11018.0
1,1003,Baldwin,Alabama,30.72775,-87.722071,223234,12666.0,12708.0,12825.0,12962.0,...,38628.0,38690.0,38771.0,38864.0,38878.0,38942.0,39084.0,39298.0,39583.0,39911.0
2,1005,Barbour,Alabama,31.868263,-85.387129,24686,1396.0,1398.0,1406.0,1417.0,...,3726.0,3730.0,3735.0,3740.0,3742.0,3742.0,3750.0,3777.0,3819.0,3860.0
3,1007,Bibb,Alabama,32.996421,-87.125115,22394,1725.0,1739.0,1746.0,1762.0,...,4409.0,4410.0,4410.0,4414.0,4420.0,4426.0,4452.0,4472.0,4514.0,4533.0
4,1009,Blount,Alabama,33.982109,-86.567906,57826,4441.0,4446.0,4465.0,4483.0,...,11002.0,11031.0,11041.0,11053.0,11064.0,11072.0,11096.0,11128.0,11193.0,11256.0


### 2.3.1 Getting Newly Detected Cases

To get newly detected cases, we have to subtract the cumulative case count for each day from the previous cumulative case count. 

In [74]:
new_cases = covid_data.copy()
new_cases.loc[:,'d12252020':'d12312021'] = new_cases.loc[:,'d12252020':'d12312021'].diff(axis=1)
new_cases.drop(['d12252020'], axis=1, inplace=True)
new_cases.head()

Unnamed: 0,fips,county,state,lat,long,population,d12262020,d12272020,d12282020,d12292020,...,d12222021,d12232021,d12242021,d12252021,d12262021,d12272021,d12282021,d12292021,d12302021,d12312021
0,1001,Autauga,Alabama,32.539527,-86.644082,55869,9.0,30.0,36.0,40.0,...,19.0,35.0,12.0,27.0,4.0,11.0,28.0,58.0,67.0,65.0
1,1003,Baldwin,Alabama,30.72775,-87.722071,223234,42.0,117.0,137.0,210.0,...,59.0,62.0,81.0,93.0,14.0,64.0,142.0,214.0,285.0,328.0
2,1005,Barbour,Alabama,31.868263,-85.387129,24686,2.0,8.0,11.0,45.0,...,1.0,4.0,5.0,5.0,2.0,0.0,8.0,27.0,42.0,41.0
3,1007,Bibb,Alabama,32.996421,-87.125115,22394,14.0,7.0,16.0,30.0,...,3.0,1.0,0.0,4.0,6.0,6.0,26.0,20.0,42.0,19.0
4,1009,Blount,Alabama,33.982109,-86.567906,57826,5.0,19.0,18.0,52.0,...,12.0,29.0,10.0,12.0,11.0,8.0,24.0,32.0,65.0,63.0


### 2.3.2 Getting Active Case Counts

In [79]:
active_cases = new_cases.copy()
active_cases.loc[:,'d12262020':'d12312021'] = active_cases.loc[:,'d12262020':'d12312021'].rolling(7,axis=1).sum()
active_cases.drop(['d12262020','d12272020','d12282020','d12292020','d12302020','d12312020'], axis=1, inplace=True)
active_cases.head()

Unnamed: 0,fips,county,state,lat,long,population,d01012021,d01022021,d01032021,d01042021,...,d12222021,d12232021,d12242021,d12252021,d12262021,d12272021,d12282021,d12292021,d12302021,d12312021
0,1001,Autauga,Alabama,32.539527,-86.644082,55869,249.0,269.0,276.0,271.0,...,70.0,100.0,97.0,122.0,120.0,121.0,136.0,175.0,207.0,260.0
1,1003,Baldwin,Alabama,30.72775,-87.722071,223234,1157.0,1247.0,1239.0,1225.0,...,215.0,249.0,314.0,380.0,372.0,421.0,515.0,670.0,893.0,1140.0
2,1005,Barbour,Alabama,31.868263,-85.387129,24686,121.0,130.0,124.0,116.0,...,7.0,10.0,13.0,18.0,19.0,17.0,25.0,51.0,89.0,125.0
3,1007,Bibb,Alabama,32.996421,-87.125115,22394,129.0,124.0,136.0,123.0,...,21.0,19.0,17.0,18.0,23.0,26.0,46.0,63.0,104.0,123.0
4,1009,Blount,Alabama,33.982109,-86.567906,57826,252.0,283.0,281.0,288.0,...,98.0,112.0,100.0,99.0,105.0,103.0,106.0,126.0,162.0,215.0


In [77]:
active_cases.to_csv('active_cases_2021.csv', index=False)

# 3 County Adjacency

## 3.1 Load County Adjacency Data

Start by loading in original data from the US Census Bureau ([link](https://www.census.gov/geographies/reference-files/2010/geo/county-adjacency.html)). Note: the text data file from the website was downloaded directly and saved as a CSV file (with custom delimiter option); no additional data processing was done in Excel.

In [None]:
url = "https://raw.githubusercontent.com/AnjaDeric/MDA_Personal/main/adjacent_counties.csv"
adj = pd.read_csv(url,encoding= 'unicode_escape')
adj.head()

Unnamed: 0,Column1,Column2,Column3,Column4
0,"Autauga County, AL",01001,"Autauga County, AL",1001.0
1,,"Chilton County, AL",01021,
2,,"Dallas County, AL",01047,
3,,"Elmore County, AL",01051,
4,,"Lowndes County, AL",01085,


## 3.2 Clean County Adjacency Data

Move columns 2 and 3 over to the right in rows following each new county.

In [None]:
adj.loc[adj['Column1'].isnull(),'Column4'] = adj['Column3']
adj.loc[adj['Column1'].isnull(),'Column3'] = adj['Column2']
adj.loc[adj['Column1'].isnull(),'Column2'] = np.nan
adj.head()

Unnamed: 0,Column1,Column2,Column3,Column4
0,"Autauga County, AL",1001.0,"Autauga County, AL",1001.0
1,,,"Chilton County, AL",1021.0
2,,,"Dallas County, AL",1047.0
3,,,"Elmore County, AL",1051.0
4,,,"Lowndes County, AL",1085.0


After manually checking the new dataframe, there is an error in one row, where Column4 contains a bordering county name and Column3 contains a fips code. Correct this mistake.

In [None]:
adj.loc[adj['Column4']=="Blue Earth County, MN",'Column3'] = "Blue Earth County, MN"
adj.loc[adj['Column4']=="Blue Earth County, MN",'Column4'] = "27013"

Convert fips code columns to object data type and add leading zeros for counties that are missing them. Aditionally, adjust column names to be accurate.

In [None]:
adj.Column4 = (adj.Column4.astype(int)).astype(object)
adj['Column4'] = adj['Column4'].apply('{:0>5}'.format)
adj.rename(columns = {'Column1':'county', 'Column2':'county_fips',
                      'Column3':'bcounty', 'Column4':'bcounty_fips'}, inplace = True)
adj.head()

Unnamed: 0,county,county_fips,bcounty,bcounty_fips
0,"Autauga County, AL",1001.0,"Autauga County, AL",1001
1,,,"Chilton County, AL",1021
2,,,"Dallas County, AL",1047
3,,,"Elmore County, AL",1051
4,,,"Lowndes County, AL",1085


For the first 2 columns, fill the missing values (NaN) with the countty and code in the row above.

In [None]:
adj['county'] = adj['county'].mask(adj['county'].eq('')).ffill()
adj['county_fips'] = adj['county_fips'].mask(adj['county_fips'].eq('')).ffill()
adj.head()

Unnamed: 0,county,county_fips,bcounty,bcounty_fips
0,"Autauga County, AL",1001,"Autauga County, AL",1001
1,"Autauga County, AL",1001,"Chilton County, AL",1021
2,"Autauga County, AL",1001,"Dallas County, AL",1047
3,"Autauga County, AL",1001,"Elmore County, AL",1051
4,"Autauga County, AL",1001,"Lowndes County, AL",1085


Remove rows where a county is bordering itself.

In [None]:
adj = adj[adj['county_fips'] != adj['bcounty_fips']]
adj.head()

Unnamed: 0,county,county_fips,bcounty,bcounty_fips
1,"Autauga County, AL",1001,"Chilton County, AL",1021
2,"Autauga County, AL",1001,"Dallas County, AL",1047
3,"Autauga County, AL",1001,"Elmore County, AL",1051
4,"Autauga County, AL",1001,"Lowndes County, AL",1085
5,"Autauga County, AL",1001,"Montgomery County, AL",1101


Split county names and states into separate columns.

In [None]:
adj[['county_name', 'county_state']] = adj['county'].str.split(',', expand=True)
adj[['bcounty_name', 'bcounty_state']] = adj['bcounty'].str.split(',', expand=True)
adj = adj[['county_name','county_state','county_fips','bcounty_name','bcounty_state','bcounty_fips']]
adj.head()

Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips
1,Autauga County,AL,1001,Chilton County,AL,1021
2,Autauga County,AL,1001,Dallas County,AL,1047
3,Autauga County,AL,1001,Elmore County,AL,1051
4,Autauga County,AL,1001,Lowndes County,AL,1085
5,Autauga County,AL,1001,Montgomery County,AL,1101


Drop any rows where either of the counties is not on our list of continental US counties.

In [None]:
# check to see county FIPS list
county_fips

array(['01001', '01003', '01005', ..., '56041', '56043', '56045'],
      dtype=object)

In [None]:
# only keep rows were FIPS matches one of the US county FIPS in the list
adj = adj[adj['county_fips'].isin(county_fips)]
adj = adj[adj['bcounty_fips'].isin(county_fips)]

Drop duplicate rows where the same two bordering counties are being considered. This is done based on their combined FIPS codes sorted by value, so that any 2 rows that have the same 2 counties will have the same combined FIPS.

In [None]:
adj['county_combined'] = [''.join(x) for x in np.sort(adj[['county_fips','bcounty_fips']], axis=1)]
adj_final = adj.drop_duplicates(subset=['county_combined'], keep='first')
#adj_final = adj_final.drop('county_combined', 1)
adj_final = adj_final.reset_index(drop=True)

In [None]:
adj_final.head()

Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips,county_combined
0,Autauga County,AL,1001,Chilton County,AL,1021,100101021
1,Autauga County,AL,1001,Dallas County,AL,1047,100101047
2,Autauga County,AL,1001,Elmore County,AL,1051,100101051
3,Autauga County,AL,1001,Lowndes County,AL,1085,100101085
4,Autauga County,AL,1001,Montgomery County,AL,1101,100101101


## 3.3 Complete and Export Data

The data set is now complete and corrected. The format is as follows:

*   **county_name**: name of the county
*   **county_state**: state the county is in 
*   **county_fips**: FIPS code of the county
*   **bcounty_name**: name of the bordering county
*   **bcounty_state**: state the bordering county is in 
*   **bcounty_fips**: FIPS code of the bordering county
*   **county_combined**: FIPS codes of the two counties combined into one line and sorted by value

Export the data set into a new csv file.



In [None]:
adj_final.to_csv('adjacent_counties_corrected.csv', index=False)

# 4 Country Distance Data

## 4.1 Great-Circle Distances

These inter-county distances were collected from the [National Bureau of Economic Research](https://www.nber.org/research/data/county-distance-database). They are great-circle distances calculated using the Havershine formula and as a result, they might not match the actual driving distance between the counties. Additionally, the raw data includes the distances between all combinations of counties, not just bordering counties.

Since the file is too large to be stored on GitHub, it was loaded into our GoogleDrive and used from there.

In [None]:
url = "/content/drive/MyDrive/MDA/sf12010countydistancemiles.csv"
dist = pd.read_csv(url)
dist.head()

Unnamed: 0,county1,mi_to_county,county2
0,1001,22.462994,1021
1,1001,26.844687,1085
2,1001,29.517585,1051
3,1001,30.776371,1047
4,1001,34.493443,1101


In [None]:
dist.county1 = (dist.county1.astype(int)).astype(object)
dist['county1'] = dist['county1'].apply('{:0>5}'.format)
dist.county2 = (dist.county2.astype(int)).astype(object)
dist['county2'] = dist['county2'].apply('{:0>5}'.format)
dist.head()

Unnamed: 0,county1,mi_to_county,county2
0,1001,22.462994,1021
1,1001,26.844687,1085
2,1001,29.517585,1051
3,1001,30.776371,1047
4,1001,34.493443,1101


In [None]:
dist['county_combined'] = [''.join(x) for x in np.sort(dist[['county1','county2']], axis=1)]
dist = dist.drop_duplicates(subset=['county_combined'], keep='first')
dist.head()

Unnamed: 0,county1,mi_to_county,county2,county_combined
0,1001,22.462994,1021,100101021
1,1001,26.844687,1085,100101085
2,1001,29.517585,1051,100101051
3,1001,30.776371,1047,100101047
4,1001,34.493443,1101,100101101


In [None]:
adj_distances_GC = pd.merge(adj_final, dist[['mi_to_county','county_combined']], on="county_combined", how="left")
adj_distances_GC.head()

Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips,county_combined,mi_to_county
0,Autauga County,AL,1001,Chilton County,AL,1021,100101021,22.462994
1,Autauga County,AL,1001,Dallas County,AL,1047,100101047,30.776371
2,Autauga County,AL,1001,Elmore County,AL,1051,100101051,29.517585
3,Autauga County,AL,1001,Lowndes County,AL,1085,100101085,26.844687
4,Autauga County,AL,1001,Montgomery County,AL,1101,100101101,34.493443


In [None]:
adj_distances_GC.isna().sum()

county_name        0
county_state       0
county_fips        0
bcounty_name       0
bcounty_state      0
bcounty_fips       0
county_combined    0
mi_to_county       0
dtype: int64

In [None]:
adj_distances_GC.to_csv('adj_distances_GC.csv', index=False)

## 4.2 Driving Distances

### 4.2.1 API Prep + Requests

To get driving distances and durations, we will be using the [Distance Matrix API](https://distancematrix.ai/dev#travel_modes). We are using a free API key which allows for 5000 requests and has a trial period of 7 days. 2 API keys in total were used to get all the necessary data.

Note: As the free API key expires after 7 days, this section of the code will not run past 7 days after requesting the key or after exceeding the 5000 request limit. 

In [None]:
import requests

In [None]:
BASE_URL = "https://api.distancematrix.ai"
api_key = "mM8V0cY63RO4kNquWS3C2X1ClUxlp"
mode = "driving"
traffic_model = "best_guess"

### 4.2.2 Prepare Final Data Frame

Add columns for distance and duration measurements to the data frame.

In [None]:
adj_final_api = adj_final.copy()
adj_final_api['distance'] = np.nan
adj_final_api['duration'] = np.nan

Add origin latitude and longitude to the data frame by merging it with the county_info data frame based on the FIPS code.

In [None]:
adj_final_api = pd.merge(adj_final_api, county_info[['lat','long','fips']], 
                         left_on="county_fips", right_on='fips', how="left")
adj_final_api.rename(columns={'lat': 'origin_lat', 'long': 'origin_long'}, inplace=True)
adj_final_api = adj_final_api.drop('fips', axis=1)
adj_final_api.head()

  after removing the cwd from sys.path.


Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips,county_combined,distance,duration,origin_lat,origin_long
0,Autauga County,AL,1001,Chilton County,AL,1021,100101021,,,32.539527,-86.644082
1,Autauga County,AL,1001,Dallas County,AL,1047,100101047,,,32.539527,-86.644082
2,Autauga County,AL,1001,Elmore County,AL,1051,100101051,,,32.539527,-86.644082
3,Autauga County,AL,1001,Lowndes County,AL,1085,100101085,,,32.539527,-86.644082
4,Autauga County,AL,1001,Montgomery County,AL,1101,100101101,,,32.539527,-86.644082


Add destination latitude and longitude to the data frame by merging it with the county_info data frame based on the FIPS code.

In [None]:
adj_final_api = pd.merge(adj_final_api, county_info[['lat','long','fips']], 
                         left_on="bcounty_fips", right_on='fips',how="left")
adj_final_api.rename(columns={'lat': 'dest_lat', 'long': 'dest_long'}, inplace=True)
adj_final_api = adj_final_api.drop('fips', axis=1)
adj_final_api.head()

  after removing the cwd from sys.path.


Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips,county_combined,distance,duration,dest_lat,dest_long
0,Autauga County,AL,1001,Chilton County,AL,1021,100101021,,,32.850441,-86.717326
1,Autauga County,AL,1001,Dallas County,AL,1047,100101047,,,32.326881,-87.108667
2,Autauga County,AL,1001,Elmore County,AL,1051,100101051,,,32.597854,-86.144153
3,Autauga County,AL,1001,Lowndes County,AL,1085,100101085,,,32.159728,-86.651584
4,Autauga County,AL,1001,Montgomery County,AL,1101,100101101,,,32.220683,-86.209693


### 4.3.3 Make API Requests

The following block of code generates the API request URLs and makes the actual requests. The request URLs are generated by combining the base URL with all the necessary parameters, including the latitude and longitude of the origin and the destination.

Once a response is received, the distance and the duration elements are extracted and stored in the county adjacency data frame.

In [None]:
for idx in range(0,1):
  origin_lat = adj_final_api.loc[idx, 'origin_lat']
  origin_lng = adj_final_api.loc[idx, 'origin_long']

  dest_lat = adj_final_api.loc[idx, 'dest_lat']
  dest_lng = adj_final_api.loc[idx, 'dest_long']

  if((math.isnan(float(origin_lat)) == 0) & (math.isnan(float(origin_lng)) == 0) & 
    (math.isnan(float(dest_lat)) == 0) & (math.isnan(float(dest_lng)) == 0)):

    origin = str(origin_lat) + "," + str(origin_lng)
    destination = str(dest_lat) + "," + str(dest_lng)

    url = "{base_url}/maps/api/distancematrix/json" \
            "?key={api_key}" \
            "&origins={origin}" \
            "&destinations={destination}" \
            "&mode={mode}" \
            "&traffic_model={traffic_model}".format(base_url=BASE_URL,
                                                      api_key=api_key,
                                                      origin=origin,
                                                      destination=destination,
                                                      mode=mode,
                                                      traffic_model=traffic_model)
    # logging.info("URL: %s" % url)
    # result = requests.get(url)

    res_json = result.json()
    if(res_json['rows'][0]['elements'][0]['status']=='OK'):
      adj_final_api.loc[idx, 'distance'] = res_json['rows'][0]['elements'][0]['distance']['text']
      adj_final_api.loc[idx, 'duration'] = res_json['rows'][0]['elements'][0]['duration']['text']

    if(idx%100==0):
      print(idx)


0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400


In [None]:
adj_final_api.to_csv('adjacent_counties_final_api_2.csv', index=False)

### 4.2.4 Clean Up API Data

In [None]:
url = "https://raw.githubusercontent.com/AnjaDeric/MDA_Personal/main/adjacent_counties_final_api_2.csv"
adj_final_api = pd.read_csv(url)
adj_final_api.head()

Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips,county_combined,distance,duration,origin_lat,origin_lng,dest_lat,dest_lng
0,Autauga County,AL,1001,Chilton County,AL,1021,100101021,42.8 km,34 mins,32.5349,-86.6427,32.8479,-86.7188
1,Autauga County,AL,1001,Dallas County,AL,1047,100101047,69.9 km,58 mins,32.5349,-86.6427,32.326,-87.1065
2,Autauga County,AL,1001,Elmore County,AL,1051,100101051,58.6 km,51 mins,32.5349,-86.6427,32.5966,-86.1492
3,Autauga County,AL,1001,Lowndes County,AL,1085,100101085,80.6 km,1 hour 2 mins,32.5349,-86.6427,32.1547,-86.6501
4,Autauga County,AL,1001,Montgomery County,AL,1101,100101101,64.9 km,56 mins,32.5349,-86.6427,32.2203,-86.2076


Drop columns we no longer need

In [None]:
adj_final_api = adj_final_api.drop(['county_combined','origin_lat',
                                    'origin_lng','dest_lat','dest_lng'], axis = 1)
adj_final_api.head()

Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips,distance,duration
0,Autauga County,AL,1001,Chilton County,AL,1021,42.8 km,34 mins
1,Autauga County,AL,1001,Dallas County,AL,1047,69.9 km,58 mins
2,Autauga County,AL,1001,Elmore County,AL,1051,58.6 km,51 mins
3,Autauga County,AL,1001,Lowndes County,AL,1085,80.6 km,1 hour 2 mins
4,Autauga County,AL,1001,Montgomery County,AL,1101,64.9 km,56 mins


Extract number of km from the distance column and convert it into a float format.

In [None]:
adj_final_api['distance'].replace(regex=True, inplace=True, to_replace=r'[^0-9.\-]', value=r'')
adj_final_api['distance'] = adj_final_api['distance'].astype(float)
adj_final_api.rename(columns={'distance': 'distance_km'}, inplace=True)
adj_final_api.head()

Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips,distance_km,duration
0,Autauga County,AL,1001,Chilton County,AL,1021,42.8,34 mins
1,Autauga County,AL,1001,Dallas County,AL,1047,69.9,58 mins
2,Autauga County,AL,1001,Elmore County,AL,1051,58.6,51 mins
3,Autauga County,AL,1001,Lowndes County,AL,1085,80.6,1 hour 2 mins
4,Autauga County,AL,1001,Montgomery County,AL,1101,64.9,56 mins


Add a column to get distance in miles in addition to kilometers.

In [None]:
adj_final_api['distance_mi'] = round(adj_final_api['distance_km']*0.6214,1)
adj_final_api.head()

Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips,distance_km,duration,distance_mi
0,Autauga County,AL,1001,Chilton County,AL,1021,42.8,34 mins,26.6
1,Autauga County,AL,1001,Dallas County,AL,1047,69.9,58 mins,43.4
2,Autauga County,AL,1001,Elmore County,AL,1051,58.6,51 mins,36.4
3,Autauga County,AL,1001,Lowndes County,AL,1085,80.6,1 hour 2 mins,50.1
4,Autauga County,AL,1001,Montgomery County,AL,1101,64.9,56 mins,40.3


In [None]:
test = adj_final_api.copy()
test = test[test['duration'].isnull()==False]
test

Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips,distance_km,duration,distance_mi
0,Autauga County,AL,1001,Chilton County,AL,1021,42.8,34 mins,26.6
1,Autauga County,AL,1001,Dallas County,AL,1047,69.9,58 mins,43.4
2,Autauga County,AL,1001,Elmore County,AL,1051,58.6,51 mins,36.4
3,Autauga County,AL,1001,Lowndes County,AL,1085,80.6,1 hour 2 mins,50.1
4,Autauga County,AL,1001,Montgomery County,AL,1101,64.9,56 mins,40.3
...,...,...,...,...,...,...,...,...,...
4495,Amite County,MS,28005,Pike County,MS,28113,44.4,34 mins,27.6
4496,Amite County,MS,28005,Wilkinson County,MS,28157,68.3,53 mins,42.4
4497,Attala County,MS,28007,Carroll County,MS,28015,69.2,48 mins,43.0
4498,Attala County,MS,28007,Choctaw County,MS,28019,49.9,37 mins,31.0


In [None]:
# using apply function to create a new column
test['duration_min'] = test.apply(lambda row: parsex(row.duration), axis = 1)
test

Unnamed: 0,county_name,county_state,county_fips,bcounty_name,bcounty_state,bcounty_fips,distance_km,duration,distance_mi,duration_min
0,Autauga County,AL,1001,Chilton County,AL,1021,42.8,34 mins,26.6,34
1,Autauga County,AL,1001,Dallas County,AL,1047,69.9,58 mins,43.4,58
2,Autauga County,AL,1001,Elmore County,AL,1051,58.6,51 mins,36.4,51
3,Autauga County,AL,1001,Lowndes County,AL,1085,80.6,1 hour 2 mins,50.1,62
4,Autauga County,AL,1001,Montgomery County,AL,1101,64.9,56 mins,40.3,56
...,...,...,...,...,...,...,...,...,...,...
4495,Amite County,MS,28005,Pike County,MS,28113,44.4,34 mins,27.6,34
4496,Amite County,MS,28005,Wilkinson County,MS,28157,68.3,53 mins,42.4,53
4497,Attala County,MS,28007,Carroll County,MS,28015,69.2,48 mins,43.0,48
4498,Attala County,MS,28007,Choctaw County,MS,28019,49.9,37 mins,31.0,37


In [None]:
def parsex(s):
    hh = mm = 0
    for word in s.split():
        word = word.lower()
        if word.isdigit():
            save = word
        elif word.startswith('hour'):
            hh = int(save)
        elif word.startswith('mins'):
            mm = int(save)
    return (hh*60+mm)

In [None]:
adj_final_api.dtypes

county_name       object
county_state      object
county_fips        int64
bcounty_name      object
bcounty_state     object
bcounty_fips       int64
distance_km      float64
duration          object
distance_mi      float64
dtype: object