In [2]:
import json
import pandas as pd
import requests
from pprint import pprint
from pandas.io.json import json_normalize  

In [39]:
#Import CSVs
aqi2019_df = pd.read_csv("assets/sample_data/daily_aqi_by_cbsa_2019.csv")
aqi2020_df = pd.read_csv("aqidaily2020.csv")

print(aqi2019_df.head())
print(aqi2020_df.head())

CBSA  CBSA Code        Date  AQI Category Defining Parameter  \
0  Aberdeen, SD      10100  2019-01-03    4     Good               PM10   
1  Aberdeen, SD      10100  2019-01-06   31     Good              PM2.5   
2  Aberdeen, SD      10100  2019-01-09    9     Good               PM10   
3  Aberdeen, SD      10100  2019-01-12   43     Good              PM2.5   
4  Aberdeen, SD      10100  2019-01-15   28     Good              PM2.5   

  Defining Site  Number of Sites Reporting  
0   46-013-0003                          1  
1   46-013-0003                          1  
2   46-013-0003                          1  
3   46-013-0003                          1  
4   46-013-0003                          1  
       Date         CITY State       Lat        Lng  AQI  Category  \
0  1/1/2020  Albuquerque    NM  35.08449 -106.65114   54       NaN   
1  1/2/2020  Albuquerque    NM  35.08449 -106.65114   39       NaN   
2  1/3/2020  Albuquerque    NM  35.08449 -106.65114   38       NaN   
3  1/4/202

In [40]:
#Rename cities with duplicate names to avoid merging the wrong data
aqi2019_df = aqi2019_df.replace({"Columbia, MO": "ColumbiaMO", "Salem, OH": "SalemOH", "Columbus, GA-AL" : "ColumbusGA", "Columbus, IN" : "ColumbusIN",
                                "Jackson, TN": "JacksonTN", "Jackson, WY-ID": "JacksonWY", "Miami, OK": "MiamiOK", "Salem, OH": "SalemOH",
                                "Springfield, MA": "SpringfieldMA", "Springfield, MO" : "SpringfieldMO", "Springfield, OH": "SpringfieldOH"})



In [41]:
#Split CBSA into City and State
aqi2019_df[['City','State']] = aqi2019_df.CBSA.str.split(", ",expand=True)

aqi2019_df.head()

Unnamed: 0,CBSA,CBSA Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,City,State
0,"Aberdeen, SD",10100,2019-01-03,4,Good,PM10,46-013-0003,1,Aberdeen,SD
1,"Aberdeen, SD",10100,2019-01-06,31,Good,PM2.5,46-013-0003,1,Aberdeen,SD
2,"Aberdeen, SD",10100,2019-01-09,9,Good,PM10,46-013-0003,1,Aberdeen,SD
3,"Aberdeen, SD",10100,2019-01-12,43,Good,PM2.5,46-013-0003,1,Aberdeen,SD
4,"Aberdeen, SD",10100,2019-01-15,28,Good,PM2.5,46-013-0003,1,Aberdeen,SD


In [42]:
#Rename City metropolitan areas to one main city for merging
aqi2019_df = aqi2019_df.replace({"Atlanta-Sandy Springs-Roswell":"Atlanta","Austin-Round Rock":"Austin","Baltimore-Columbia-Towson":"Baltimore",
                        "Boise City":"Boise","Boston-Cambridge-Newton":"Boston","New York-Newark-Jersey City":"New York City",                                                  "Charlotte-Concord-Gastonia":"Charlotte","Chicago-Naperville-Elgin":"Chicago","Dallas-Fort Worth-Arlington":"Dallas",
                        "Denver-Aurora-Lakewood":"Denver","Detroit-Warren-Dearborn":"Detroit","Hartford-West Hartford-East Hartford":"Hartford","Urban                          Honolulu":"Honolulu", "Houston-The Woodlands-Sugar Land":"Houston","Indianapolis-Carmel-Anderson":"Indianapolis","Las                                   Vegas-Henderson-Paradise":"Las Vegas","Little Rock-North Little Rock-Conway":"Little Rock","Los Angeles-Long Beach-Anaheim":"Los Angeles","Miami-Fort Lauderdale-West Palm Beach":"Miami","Milwaukee-Waukesha-West Allis":"Milwaukee",                     
                        "Nashville-Davidson--Murfreesboro--Franklin":"Nashville","Omaha-Council Bluffs":"Omaha","Philadelphia-Camden-Wilmington":
                        "Philadelphia","Phoenix-Mesa-Scottsdale":"Phoenix","Portland-Vancouver-Hillsboro":"Portland","Providence-Warwick":"Providence",
                        "Sacramento--Roseville--Arden-Arcade":"Sacramento","Minneapolis-St. Paul-Bloomington":"Saint Paul","San Antonio-New Braunfels":
                        "San Antonio","San Diego-Carlsbad":"San Diego","San Francisco-Oakland-Hayward":"San Francisco","San Jose-Sunnyvale-Santa Clara":
                        "San Jose","Seattle-Tacoma-Bellevue":"Seattle","Washington-Arlington-Alexandria":"Washington, D.C."})

In [43]:
#Replace state values of multiple states with one state
aqi2019_df = aqi2019_df.replace({"MA-NH":"MA","IL-IN-WI":"IL","NC-SC":"NC","TN-MS-AR":"TN","MN-WI":"MN","NY-NJ-PA":"NY","NE-IA":"NE",
                                    "PA-NJ-DE-MD":"PA","OR-WA":"OR","RI-MA":"RI","DC-VA-MD-WV":"DC"})

In [44]:
aqi2019_df["State"].value_counts()

CA          12268
TX           9244
PA           8296
FL           7801
WA           6883
            ...  
AR-OK         361
SC-NC         356
TX-AR         349
IL-MO         241
IA-IL-MO      120
Name: State, Length: 85, dtype: int64

In [45]:
aqi2019_df.loc[aqi2019_df["City"] == "Chicago", "State"]

29181    IL
29182    IL
29183    IL
29184    IL
29185    IL
         ..
29541    IL
29542    IL
29543    IL
29544    IL
29545    IL
Name: State, Length: 365, dtype: object

In [48]:
# URL for JSON with cities and geo data
url = "https://aqicn.org/data-platform/covid19/airquality-covid19-cities.json"

In [24]:
#Read JSON into a dataframe
df = pd.read_json(url)

df.head()

Unnamed: 0,csvsize,data,generated
0,46726470,"{'Place': {'geo': [38.53575, 68.77905], 'featu...",2020-05-28T20:54:02+01:00
1,46726470,"{'Place': {'geo': [24.45118, 54.39696], 'featu...",2020-05-28T20:54:02+01:00
2,46726470,"{'Place': {'geo': [25.07725, 55.30927], 'featu...",2020-05-28T20:54:02+01:00
3,46726470,"{'Place': {'geo': [60.29414, 25.04099], 'featu...",2020-05-28T20:54:02+01:00
4,46726470,"{'Place': {'geo': [60.45148, 22.26869], 'featu...",2020-05-28T20:54:02+01:00


In [49]:
# Uses requests.get() to load JSON into text
response = json.loads(requests.get(url).text)

#Flattens the nested JSON of the 'data' column into its own DataFrame to get the Places geo data
geo_df = pd.io.json.json_normalize(response['data'])
#pprint(response)
geo_df.head()

Unnamed: 0,Sources,Stations,Place.geo,Place.feature,Place.name,Place.country,Place.pop
0,[{'name': 'Citizen Weather Observer Program (C...,"[{'Name': 'Conakry US Embassy, Guinea'}]","[9.53795, -13.67729]",PPLC,Conakry,GN,1767200
1,[{'name': 'Citizen Weather Observer Program (C...,"[{'Name': 'US Embassy, Guatemala City, Guatema...","[14.64072, -90.51327]",PPLC,Guatemala City,GT,994938
2,[{'name': 'Citizen Weather Observer Program (C...,"[{'Name': 'Kacheripady, Ernakulam, India'}, {'...","[10.51667, 76.21667]",PPLA2,Thrissur,IN,325110
3,[{'name': 'World Meteorological Organization -...,"[{'Name': 'New Delhi US Embassy, India (नई दिल...","[28.63576, 77.22445]",PPLC,New Delhi,IN,317797
4,[{'name': 'Citizen Weather Observer Program (C...,"[{'Name': 'Zoo Park, Bahadurpura West, Hyderab...","[17.38405, 78.45636]",PPLA,Hyderabad,IN,3597816


In [50]:
#Puts the geo data and name into a dataframe
places_df = geo_df[['Place.geo','Place.name','Place.country','Place.pop']]

In [51]:
places_df.head()

Unnamed: 0,Place.geo,Place.name,Place.country,Place.pop
0,"[9.53795, -13.67729]",Conakry,GN,1767200
1,"[14.64072, -90.51327]",Guatemala City,GT,994938
2,"[10.51667, 76.21667]",Thrissur,IN,325110
3,"[28.63576, 77.22445]",New Delhi,IN,317797
4,"[17.38405, 78.45636]",Hyderabad,IN,3597816


In [52]:
#Rename columns for merging purposes
places_df = places_df.rename(columns = {'Place.geo':'Geo','Place.name':'City', 'Place.country':'Country', "Place.pop":"Population"})
places_df.head()

Unnamed: 0,Geo,City,Country,Population
0,"[9.53795, -13.67729]",Conakry,GN,1767200
1,"[14.64072, -90.51327]",Guatemala City,GT,994938
2,"[10.51667, 76.21667]",Thrissur,IN,325110
3,"[28.63576, 77.22445]",New Delhi,IN,317797
4,"[17.38405, 78.45636]",Hyderabad,IN,3597816


In [54]:
#Select only the US values from the dataframe
us_df = places_df.loc[places_df["Country"] == "US"]
print(us_df["City"].value_counts())
us_df["City"].isin(aqi2019_df["City"]).value_counts()

Staten Island       1
Atlanta             1
Portland            1
Seattle             1
Baltimore           1
Albuquerque         1
Indianapolis        1
Columbus            1
Jacksonville        1
Jackson             1
Omaha               1
Providence          1
Denver              1
Tucson              1
Manhattan           1
Washington, D.C.    1
Oakland             1
Philadelphia        1
El Paso             1
Saint Paul          1
Miami               1
Houston             1
Queens              1
The Bronx           1
Hartford            1
San Jose            1
Honolulu            1
Las Vegas           1
Nashville           1
Phoenix             1
Springfield         1
San Antonio         1
Milwaukee           1
Fresno              1
Fort Worth          1
Charlotte           1
Boise               1
Oklahoma City       1
Salem               1
Dallas              1
Chicago             1
Tallahassee         1
Los Angeles         1
Richmond            1
Boston              1
San Franci

True     48
False     9
Name: City, dtype: int64

In [55]:
#Replace Manhattan with New York City for the merge
us_df = us_df.replace({"Manhattan":"New York City"})

In [21]:
us_df.to_csv("us_locations.csv")

In [56]:
#Inner merge
new2019_df = aqi2019_df.merge(us_df,on="City",how="inner")

new2019_df.head()

Unnamed: 0,CBSA,CBSA Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,City,State,Geo,Country,Population
0,"Albuquerque, NM",10740,2019-01-01,42,Good,PM10,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]",US,559121
1,"Albuquerque, NM",10740,2019-01-02,40,Good,PM2.5,35-001-1012,8,Albuquerque,NM,"[35.08449, -106.65114]",US,559121
2,"Albuquerque, NM",10740,2019-01-03,55,Moderate,PM2.5,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]",US,559121
3,"Albuquerque, NM",10740,2019-01-04,57,Moderate,PM2.5,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]",US,559121
4,"Albuquerque, NM",10740,2019-01-05,66,Moderate,PM2.5,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]",US,559121


In [59]:

new2019_df["State"].value_counts()

CA    2190
TX    1825
FL    1093
AZ     730
TN     730
WI     730
NC     730
IL     725
OR     721
NE     365
CO     365
MS     365
GA     365
CT     365
AR     365
PA     365
OH     365
OK     365
VA     365
WA     365
RI     365
DC     365
MD     365
NM     365
IN     365
ID     365
MA     365
NY     365
MN     365
MI     365
UT     365
SC     365
Name: State, dtype: int64

In [73]:
#Store data to CSV for verification
new_df.to_csv("sample_data.csv")

In [60]:
#Split up the location data into latitude and longitude
loc_df = pd.DataFrame(new2019_df['Geo'].to_list(),index=None, columns =['Lat','Lng'])

loc_df

Unnamed: 0,Lat,Lng
0,35.08449,-106.65114
1,35.08449,-106.65114
2,35.08449,-106.65114
3,35.08449,-106.65114
4,35.08449,-106.65114
...,...,...
17864,38.89511,-77.03637
17865,38.89511,-77.03637
17866,38.89511,-77.03637
17867,38.89511,-77.03637


In [61]:
#Join the divided lat/lng df to the main dataframe
new2019_df = new2019_df.join(loc_df)

new2019_df.head()

Unnamed: 0,CBSA,CBSA Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,City,State,Geo,Country,Population,Lat,Lng
0,"Albuquerque, NM",10740,2019-01-01,42,Good,PM10,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]",US,559121,35.08449,-106.65114
1,"Albuquerque, NM",10740,2019-01-02,40,Good,PM2.5,35-001-1012,8,Albuquerque,NM,"[35.08449, -106.65114]",US,559121,35.08449,-106.65114
2,"Albuquerque, NM",10740,2019-01-03,55,Moderate,PM2.5,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]",US,559121,35.08449,-106.65114
3,"Albuquerque, NM",10740,2019-01-04,57,Moderate,PM2.5,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]",US,559121,35.08449,-106.65114
4,"Albuquerque, NM",10740,2019-01-05,66,Moderate,PM2.5,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]",US,559121,35.08449,-106.65114


In [62]:
#Print out the data types and the first values of the 2020 dataframe
print(aqi2020_df.dtypes)
aqi2020_df.head()

Date                   object
CITY                   object
State                  object
Lat                   float64
Lng                   float64
AQI                     int64
Category              float64
Defining Parameter     object
Source                 object
CO                     object
Ozone                  object
SO2                    object
PM10                   object
PM25                   object
NO2                    object
dtype: object


Unnamed: 0,Date,CITY,State,Lat,Lng,AQI,Category,Defining Parameter,Source,CO,Ozone,SO2,PM10,PM25,NO2
0,1/1/2020,Albuquerque,NM,35.08449,-106.65114,54,,PM2.5,AirNow,42,54,,,,
1,1/2/2020,Albuquerque,NM,35.08449,-106.65114,39,,Ozone,AirNow,39,30,,,,
2,1/3/2020,Albuquerque,NM,35.08449,-106.65114,38,,Ozone,AirNow,38,37,,,,
3,1/4/2020,Albuquerque,NM,35.08449,-106.65114,50,,PM2.5,AirNow,36,50,,,,
4,1/5/2020,Albuquerque,NM,35.08449,-106.65114,63,,PM2.5,AirNow,37,63,,,,


In [64]:
#Set the category value for AQI based on the AQI value
aqi2020_df.loc[aqi2020_df["AQI"] <= 50, 'Category'] = "Good"
aqi2020_df.loc[(aqi2020_df["AQI"] > 50) & (aqi2020_df["AQI"] <= 100), 'Category'] = "Moderate"
aqi2020_df.loc[(aqi2020_df["AQI"] > 100) & (aqi2020_df["AQI"] <= 150), 'Category'] = "Unhealthy for Sensitive Groups"
aqi2020_df.loc[(aqi2020_df["AQI"] > 150) & (aqi2020_df["AQI"] <= 200), 'Category'] = "Unhealthy"
aqi2020_df.loc[(aqi2020_df["AQI"] > 250) & (aqi2020_df["AQI"] <= 300), 'Category'] = "Very Unhealthy"
aqi2020_df.loc[aqi2020_df["AQI"] > 300, 'Category'] = "Hazardous"

In [65]:
aqi2020_df.head()

Unnamed: 0,Date,CITY,State,Lat,Lng,AQI,Category,Defining Parameter,Source,CO,Ozone,SO2,PM10,PM25,NO2
0,1/1/2020,Albuquerque,NM,35.08449,-106.65114,54,Moderate,PM2.5,AirNow,42,54,,,,
1,1/2/2020,Albuquerque,NM,35.08449,-106.65114,39,Good,Ozone,AirNow,39,30,,,,
2,1/3/2020,Albuquerque,NM,35.08449,-106.65114,38,Good,Ozone,AirNow,38,37,,,,
3,1/4/2020,Albuquerque,NM,35.08449,-106.65114,50,Good,PM2.5,AirNow,36,50,,,,
4,1/5/2020,Albuquerque,NM,35.08449,-106.65114,63,Moderate,PM2.5,AirNow,37,63,,,,


In [66]:
#Rename column for merging purposes
aqi2020_df = aqi2020_df.rename(columns = {"CITY":"City"})


In [68]:
#Reorder data with data we need
aqi2020_df = aqi2020_df[["Date","City","State","Lat","Lng","AQI","Category","Defining Parameter"]]

In [70]:
#Add 2020 data to the 2019 data
data_df = new2019_df.append(aqi2020_df)
data_df.head()

Unnamed: 0,CBSA,CBSA Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,City,State,Geo,Country,Population,Lat,Lng
0,"Albuquerque, NM",10740.0,2019-01-01,42,Good,PM10,35-001-0029,8.0,Albuquerque,NM,"[35.08449, -106.65114]",US,559121.0,35.08449,-106.65114
1,"Albuquerque, NM",10740.0,2019-01-02,40,Good,PM2.5,35-001-1012,8.0,Albuquerque,NM,"[35.08449, -106.65114]",US,559121.0,35.08449,-106.65114
2,"Albuquerque, NM",10740.0,2019-01-03,55,Moderate,PM2.5,35-001-0029,8.0,Albuquerque,NM,"[35.08449, -106.65114]",US,559121.0,35.08449,-106.65114
3,"Albuquerque, NM",10740.0,2019-01-04,57,Moderate,PM2.5,35-001-0029,8.0,Albuquerque,NM,"[35.08449, -106.65114]",US,559121.0,35.08449,-106.65114
4,"Albuquerque, NM",10740.0,2019-01-05,66,Moderate,PM2.5,35-001-0029,8.0,Albuquerque,NM,"[35.08449, -106.65114]",US,559121.0,35.08449,-106.65114


In [71]:
data_df["City"].value_counts()

Phoenix             516
Saint Paul          516
San Jose            515
Washington, D.C.    515
Raleigh             515
Jackson             515
Salt Lake City      515
Atlanta             515
Omaha               515
Los Angeles         515
Sacramento          515
Columbia            515
Denver              515
Boise               515
Providence          515
Boston              515
San Antonio         515
Charlotte           515
Portland            515
Tucson              515
Richmond            515
Fresno              515
San Francisco       515
Nashville           515
New York City       515
Chicago             515
Detroit             515
San Diego           515
Seattle             515
Oklahoma City       515
Memphis             515
Columbus            515
Albuquerque         515
Indianapolis        515
Jacksonville        515
Hartford            515
Philadelphia        515
Baltimore           514
Little Rock         514
Dallas              513
Miami               513
Austin          

In [73]:
#Clean up the dataframe and keep only the desired fields
data_df = data_df[["Date","City","State","Lat","Lng","Population","AQI","Category","Defining Parameter"]]
data_df = data_df.rename(columns = {"Defining Parameter":"Defining_Parameter"})
data_df

Unnamed: 0,Date,City,State,Lat,Lng,Population,AQI,Category,Defining_Parameter
0,2019-01-01,Albuquerque,NM,35.08449,-106.65114,559121.0,42,Good,PM10
1,2019-01-02,Albuquerque,NM,35.08449,-106.65114,559121.0,40,Good,PM2.5
2,2019-01-03,Albuquerque,NM,35.08449,-106.65114,559121.0,55,Moderate,PM2.5
3,2019-01-04,Albuquerque,NM,35.08449,-106.65114,559121.0,57,Moderate,PM2.5
4,2019-01-05,Albuquerque,NM,35.08449,-106.65114,559121.0,66,Moderate,PM2.5
...,...,...,...,...,...,...,...,...,...
7321,5/25/2020,"Washington, D.C.",DC,38.89511,-77.03637,,39,Good,Ozone
7322,5/26/2020,"Washington, D.C.",DC,38.89511,-77.03637,,45,Good,Ozone
7323,5/27/2020,"Washington, D.C.",DC,38.89511,-77.03637,,33,Good,Ozone
7324,5/28/2020,"Washington, D.C.",DC,38.89511,-77.03637,,27,Good,PM2.5


In [77]:
#Get value counts of the cities to see where data discrepancies are
data_df["State"].value_counts()

CA     2190
TX     1825
FL     1093
 CA     900
TN      880
NC      880
OR      871
 TX     740
AZ      730
WI      730
IL      725
MN      516
NE      515
IN      515
NY      515
PA      515
RI      515
DC      515
MA      515
 FL     446
ID      365
OK      365
UT      365
CO      365
MS      365
AR      365
OH      365
MI      365
SC      365
MD      365
WA      365
VA      365
CT      365
NM      365
GA      365
 AZ     301
 WI     292
 VA     150
 TN     150
 OK     150
 ID     150
 IN     150
 MS     150
 MI     150
 UT     150
 OH     150
 NC     150
 SC     150
 NM     150
 CO     150
 GA     150
 CT     150
 OR     150
 WA     150
 AR     149
 MD     149
 IL     148
Name: State, dtype: int64

In [79]:
#Replaces any state data with a space in the row
data_df = data_df.replace({" CA":"CA"," TX": "TX"," FL": "FL"," AZ": "AZ"," WI":"WI"," VA":"VA"," TN":"TN"," OK":"OK"," ID":"ID"," IN":"IN"," MS":"MS"," MI":"MI"," UT":"UT"," OH":"OH"," NC":"NC"," SC":"SC"," NM":"NM"," CO":"CO"," GA":"GA"," CT":"CT"," OR":"OR"," WA":"WA"," AR":"AR"," MD":"MD"," IL":"IL"})

In [80]:
data_df["State"].value_counts()

CA    3090
TX    2565
FL    1539
AZ    1031
NC    1030
TN    1030
WI    1022
OR    1021
IL     873
IN     665
MN     516
CO     515
MS     515
OH     515
OK     515
SC     515
ID     515
NY     515
NE     515
VA     515
UT     515
GA     515
CT     515
WA     515
RI     515
DC     515
NM     515
MA     515
MI     515
PA     515
MD     514
AR     514
Name: State, dtype: int64

In [150]:
#Performs a forward fill to fill in empty population rows from 2020 with the 2019 values
data_df = data_df.fillna(method="ffill")
data_df

Unnamed: 0,Date,City,State,Lat,Lng,Population,AQI,Category,Defining_Parameter
0,2019-01-01,Albuquerque,NM,35.08449,-106.65114,559121.0,42,Good,PM10
1,2019-01-02,Albuquerque,NM,35.08449,-106.65114,559121.0,40,Good,PM2.5
2,2019-01-03,Albuquerque,NM,35.08449,-106.65114,559121.0,55,Moderate,PM2.5
3,2019-01-04,Albuquerque,NM,35.08449,-106.65114,559121.0,57,Moderate,PM2.5
4,2019-01-05,Albuquerque,NM,35.08449,-106.65114,559121.0,66,Moderate,PM2.5
...,...,...,...,...,...,...,...,...,...
7321,5/25/2020,"Washington, D.C.",DC,38.89511,-77.03637,601723.0,39,Good,Ozone
7322,5/26/2020,"Washington, D.C.",DC,38.89511,-77.03637,601723.0,45,Good,Ozone
7323,5/27/2020,"Washington, D.C.",DC,38.89511,-77.03637,601723.0,33,Good,Ozone
7324,5/28/2020,"Washington, D.C.",DC,38.89511,-77.03637,601723.0,27,Good,PM2.5


In [137]:
#Load policy dates into a dataframe
covid_df = pd.read_csv("assets/sample_data/covid19_policydates.csv")

covid_df.head()

Unnamed: 0,state,mass_gathering_restriction,inital_business_closure,educational_facilities_closure,non-essential_services_closure,stay_at_home_order,travel_severely_limited
0,Alabama,3/19/2020,3/19/2020,3/19/2020,3/28/2020,4/4/2020,
1,Alaska,3/24/2020,3/17/2020,3/16/2020,3/28/2020,3/28/2020,
2,Arizona,3/30/2020,,3/16/2020,,3/30/2020,
3,Arkansas,3/27/2020,3/19/2020,3/17/2020,,,
4,California,3/11/2020,3/19/2020,3/19/2020,3/19/2020,3/19/2020,


In [138]:
#Create list of abbreviations. NOTE: Maryland is missing from intial data. Will be added later.
abbr = ["AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MA","MI","MN","MS","MO","MT","NE","NV"
        ,"NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
len(abbr)

50

In [139]:
#Add state abbreviations column to dataframe
covid_df["Abbr"] = abbr
covid_df.head(55)

Unnamed: 0,state,mass_gathering_restriction,inital_business_closure,educational_facilities_closure,non-essential_services_closure,stay_at_home_order,travel_severely_limited,Abbr
0,Alabama,3/19/2020,3/19/2020,3/19/2020,3/28/2020,4/4/2020,,AL
1,Alaska,3/24/2020,3/17/2020,3/16/2020,3/28/2020,3/28/2020,,AK
2,Arizona,3/30/2020,,3/16/2020,,3/30/2020,,AZ
3,Arkansas,3/27/2020,3/19/2020,3/17/2020,,,,AR
4,California,3/11/2020,3/19/2020,3/19/2020,3/19/2020,3/19/2020,,CA
5,Colorado,3/19/2020,3/17/2020,3/23/2020,3/26/2020,3/26/2020,,CO
6,Connecticut,3/12/2020,3/16/2020,3/17/2020,3/23/2020,,,CT
7,Delaware,3/16/2020,3/16/2020,3/16/2020,3/24/2020,3/24/2020,,DE
8,District of Columbia,3/13/2020,3/16/2020,3/16/2020,3/25/2020,3/30/2020,,DC
9,Florida,4/3/2020,3/17/2020,3/17/2020,,4/3/2020,,FL


In [140]:
#Select only the three relevant columns 
close_df = covid_df[["state","Abbr","inital_business_closure"]]
close_df

Unnamed: 0,state,Abbr,inital_business_closure
0,Alabama,AL,3/19/2020
1,Alaska,AK,3/17/2020
2,Arizona,AZ,
3,Arkansas,AR,3/19/2020
4,California,CA,3/19/2020
5,Colorado,CO,3/17/2020
6,Connecticut,CT,3/16/2020
7,Delaware,DE,3/16/2020
8,District of Columbia,DC,3/16/2020
9,Florida,FL,3/17/2020


In [141]:
#Add missing row to dataframe. Data for Maryland
mary_dict = {
    "state": "Maryland",
    "Abbr":"MD",
    "inital_business_closure":"03/16/2020"
}

close_df =close_df.append(mary_dict, ignore_index=True)

In [142]:
#Correct spelling on column
close_df = close_df.rename(columns={"inital_business_closure":"initial_business_closure"})
close_df.head(51)

Unnamed: 0,state,Abbr,initial_business_closure
0,Alabama,AL,3/19/2020
1,Alaska,AK,3/17/2020
2,Arizona,AZ,
3,Arkansas,AR,3/19/2020
4,California,CA,3/19/2020
5,Colorado,CO,3/17/2020
6,Connecticut,CT,3/16/2020
7,Delaware,DE,3/16/2020
8,District of Columbia,DC,3/16/2020
9,Florida,FL,3/17/2020


In [144]:
#Sort values by state to order aphabetically
close_df = close_df.sort_values("state")
close_df

Unnamed: 0,state,Abbr,initial_business_closure
0,Alabama,AL,3/19/2020
1,Alaska,AK,3/17/2020
2,Arizona,AZ,
3,Arkansas,AR,3/19/2020
4,California,CA,3/19/2020
5,Colorado,CO,3/17/2020
6,Connecticut,CT,3/16/2020
7,Delaware,DE,3/16/2020
8,District of Columbia,DC,3/16/2020
9,Florida,FL,3/17/2020


In [146]:
#Drop the full state column and change the Abbr column name to State for a dataframe merge
close_df = close_df[["Abbr","initial_business_closure"]]
close_df = close_df.rename(columns={"Abbr":"State"})
close_df.head()

Unnamed: 0,State,initial_business_closure
0,AL,3/19/2020
1,AK,3/17/2020
2,AZ,
3,AR,3/19/2020
4,CA,3/19/2020


In [151]:
#Merge closure date to the main dataframe
final_df = data_df.merge(close_df,how="inner",on="State")
final_df.head()

Unnamed: 0,Date,City,State,Lat,Lng,Population,AQI,Category,Defining_Parameter,initial_business_closure
0,2019-01-01,Albuquerque,NM,35.08449,-106.65114,559121.0,42,Good,PM10,3/16/2020
1,2019-01-02,Albuquerque,NM,35.08449,-106.65114,559121.0,40,Good,PM2.5,3/16/2020
2,2019-01-03,Albuquerque,NM,35.08449,-106.65114,559121.0,55,Moderate,PM2.5,3/16/2020
3,2019-01-04,Albuquerque,NM,35.08449,-106.65114,559121.0,57,Moderate,PM2.5,3/16/2020
4,2019-01-05,Albuquerque,NM,35.08449,-106.65114,559121.0,66,Moderate,PM2.5,3/16/2020


In [157]:
#Change the type of the date from string to datetime
final_df['Date'] = pd.to_datetime(final_df['Date'])
final_df.dtypes

Date                        datetime64[ns]
City                                object
State                               object
Lat                                float64
Lng                                float64
Population                         float64
AQI                                  int64
Category                            object
Defining_Parameter                  object
initial_business_closure            object
dtype: object

In [158]:
#Sort the dataframe by City and Date
final_df = final_df.sort_values(["City","Date"])


In [159]:
final_df.to_csv("sample_data.csv", index=False)