In [41]:
import json
import pandas as pd
import requests
from pprint import pprint
from pandas.io.json import json_normalize  

In [38]:
#Import CSV
aqi_df = pd.read_csv("assets/sample_data/daily_aqi_by_cbsa_2019.csv")

aqi_df.head()

Unnamed: 0,CBSA,CBSA Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
0,"Aberdeen, SD",10100,2019-01-03,4,Good,PM10,46-013-0003,1
1,"Aberdeen, SD",10100,2019-01-06,31,Good,PM2.5,46-013-0003,1
2,"Aberdeen, SD",10100,2019-01-09,9,Good,PM10,46-013-0003,1
3,"Aberdeen, SD",10100,2019-01-12,43,Good,PM2.5,46-013-0003,1
4,"Aberdeen, SD",10100,2019-01-15,28,Good,PM2.5,46-013-0003,1


In [39]:
#Split CBSA into City and State
aqi_df[['City','State']] = aqi_df.CBSA.str.split(",",expand=True)

aqi_df.head()

Unnamed: 0,CBSA,CBSA Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,City,State
0,"Aberdeen, SD",10100,2019-01-03,4,Good,PM10,46-013-0003,1,Aberdeen,SD
1,"Aberdeen, SD",10100,2019-01-06,31,Good,PM2.5,46-013-0003,1,Aberdeen,SD
2,"Aberdeen, SD",10100,2019-01-09,9,Good,PM10,46-013-0003,1,Aberdeen,SD
3,"Aberdeen, SD",10100,2019-01-12,43,Good,PM2.5,46-013-0003,1,Aberdeen,SD
4,"Aberdeen, SD",10100,2019-01-15,28,Good,PM2.5,46-013-0003,1,Aberdeen,SD


In [None]:
# URL for JSON with cities and geo data
url = "https://aqicn.org/data-platform/covid19/airquality-covid19-cities.json"

In [24]:
#Read JSON into a dataframe
df = pd.read_json(url)

df.head()

Unnamed: 0,csvsize,data,generated
0,46726470,"{'Place': {'geo': [38.53575, 68.77905], 'featu...",2020-05-28T20:54:02+01:00
1,46726470,"{'Place': {'geo': [24.45118, 54.39696], 'featu...",2020-05-28T20:54:02+01:00
2,46726470,"{'Place': {'geo': [25.07725, 55.30927], 'featu...",2020-05-28T20:54:02+01:00
3,46726470,"{'Place': {'geo': [60.29414, 25.04099], 'featu...",2020-05-28T20:54:02+01:00
4,46726470,"{'Place': {'geo': [60.45148, 22.26869], 'featu...",2020-05-28T20:54:02+01:00


In [49]:
# Uses requests.get() to load JSON into text
response = json.loads(requests.get(url).text)

#Flattens the nested JSON of the 'data' column into its own DataFrame to get the Places geo data
geo_df = pd.io.json.json_normalize(response['data'])
#pprint(response)
geo_df.head()

Unnamed: 0,Sources,Stations,Place.geo,Place.feature,Place.name,Place.country,Place.pop
0,[{'name': 'World Meteorological Organization -...,"[{'Name': 'Dushanbe US Embassy, Tajikistan'}]","[38.53575, 68.77905]",PPLC,Dushanbe,TJ,679400
1,[{'name': 'Citizen Weather Observer Program (C...,"[{'Name': 'Al Mafraq, UAE'}, {'Name': 'Hamdan ...","[24.45118, 54.39696]",PPLC,Abu Dhabi,AE,603492
2,[{'name': 'Citizen Weather Observer Program (C...,"[{'Name': 'Dubai Mushrif Park, UAE'}, {'Name':...","[25.07725, 55.30927]",PPLA,Dubai,AE,2956587
3,[{'name': 'Citizen Weather Observer Program (C...,"[{'Name': 'Tikkurila 3, Vantaa, Finland'}, {'N...","[60.29414, 25.04099]",PPLA3,Vantaa,FI,190058
4,[{'name': 'Citizen Weather Observer Program (C...,"[{'Name': 'Oriketo, Turku, Finland'}, {'Name':...","[60.45148, 22.26869]",PPLA,Turku,FI,175945


In [50]:
#Puts the geo data and name into a dataframe
places_df = geo_df[['Place.geo','Place.name']]

In [51]:
places_df.head()

Unnamed: 0,Place.geo,Place.name
0,"[38.53575, 68.77905]",Dushanbe
1,"[24.45118, 54.39696]",Abu Dhabi
2,"[25.07725, 55.30927]",Dubai
3,"[60.29414, 25.04099]",Vantaa
4,"[60.45148, 22.26869]",Turku


In [56]:
#Rename columns for merging purposes
places_df = places_df.rename(columns = {'Place.geo':'Geo','Place.name':'City'})
places_df.head()

Unnamed: 0,Geo,City
0,"[38.53575, 68.77905]",Dushanbe
1,"[24.45118, 54.39696]",Abu Dhabi
2,"[25.07725, 55.30927]",Dubai
3,"[60.29414, 25.04099]",Vantaa
4,"[60.45148, 22.26869]",Turku


In [57]:
#Inner merge
new_df = aqi_df.merge(places_df,on="City",how="inner")

new_df.head()

Unnamed: 0,CBSA,CBSA Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,City,State,Geo
0,"Albuquerque, NM",10740,2019-01-01,42,Good,PM10,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]"
1,"Albuquerque, NM",10740,2019-01-02,40,Good,PM2.5,35-001-1012,8,Albuquerque,NM,"[35.08449, -106.65114]"
2,"Albuquerque, NM",10740,2019-01-03,55,Moderate,PM2.5,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]"
3,"Albuquerque, NM",10740,2019-01-04,57,Moderate,PM2.5,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]"
4,"Albuquerque, NM",10740,2019-01-05,66,Moderate,PM2.5,35-001-0029,8,Albuquerque,NM,"[35.08449, -106.65114]"


In [69]:
#Outer merge
new_df2 = aqi_df.merge(places_df,on="City",how="outer")

new_df2

Unnamed: 0,CBSA,CBSA Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,City,State,Geo
0,"Aberdeen, SD",10100.0,2019-01-03,4.0,Good,PM10,46-013-0003,1.0,Aberdeen,SD,
1,"Aberdeen, SD",10100.0,2019-01-06,31.0,Good,PM2.5,46-013-0003,1.0,Aberdeen,SD,
2,"Aberdeen, SD",10100.0,2019-01-09,9.0,Good,PM10,46-013-0003,1.0,Aberdeen,SD,
3,"Aberdeen, SD",10100.0,2019-01-12,43.0,Good,PM2.5,46-013-0003,1.0,Aberdeen,SD,
4,"Aberdeen, SD",10100.0,2019-01-15,28.0,Good,PM2.5,46-013-0003,1.0,Aberdeen,SD,
...,...,...,...,...,...,...,...,...,...,...,...
177570,,,,,,,,,Kabul,,"[34.52813, 69.17233]"
177571,,,,,,,,,Kaunas,,"[54.90272, 23.90961]"
177572,,,,,,,,,Kathmandu,,"[27.70169, 85.3206]"
177573,,,,,,,,,Pokhara,,"[28.26689, 83.96851]"


In [66]:
#Store data to CSV for verification
new_df.to_csv("sample_data.csv")

In [64]:
#Split up the location data into latitude and longitude
neww_df = pd.DataFrame(new_df['Geo'].to_list(), columns =['Lat','Lng'])

neww_df

Unnamed: 0,Lat,Lng
0,35.08449,-106.65114
1,35.08449,-106.65114
2,35.08449,-106.65114
3,35.08449,-106.65114
4,35.08449,-106.65114
...,...,...
13364,-33.64651,19.44852
13365,-33.64651,19.44852
13366,-33.64651,19.44852
13367,-33.64651,19.44852
