In [1]:
import pandas as pd
import numpy as np
import requests
import json
from geojson import Point, Feature, FeatureCollection, dump
from apscheduler.schedulers.background import BackgroundScheduler
from datetime import datetime
from datetime import timedelta

# set the rows of dataframe to be displayed, e.g 10 (default is 30+30=60 rows)
pd.set_option('display.max_rows', 14)
# set the columns of dataframe to be displayed, e.g 'None' - display all the columns
pd.set_option('display.max_columns', None)
# set the decimal places of number, e.g 2
# pd.set_option('display.precision',4)

In [2]:
# function to request through API and get response as a JSON file
# require requests & json package
def getJSONR(url): 
    try:
        ua = {'user-agent':'Chrome/71.0.3578.98'}
        r = requests.get(url, headers = ua, timeout=30)
        r.raise_for_status() # raise error if r.status_code != 200
        r.encoding = r.apparent_encoding
        js = json.loads(r.text)
        return js
    except Exception as e:
        return e

# functions to get air quality data from the JSON response of World's Air Pollution API
def getID(js):
    try:
        id = js['data']['idx']
        return id
    except Exception as e:
        return np.nan
    
def getSTATUS(js):
    try:
        status = js['status']
        return status
    except Exception as e:
        return np.nan

def getNAME(js):
    try:
        name = js['data']['city']['name']
        return name
    except Exception as e:
        return np.nan
    
def getGEO(js):
    try:
        geo = []
        lat = js['data']['city']['geo'][0]
        lng = js['data']['city']['geo'][1]
        geo.append(lat)
        geo.append(lng)
        return geo
    except Exception as e:
        geo = [np.nan, np.nan]
        return geo
    
def getTIME(js):
    try:
        time = js['data']['time']['s']
        return time
    except Exception as e:
        return np.nan
    
def getAQI(js):
    try:
        aqi = js['data']['aqi']
        return aqi
    except Exception as e:
        return np.nan

def getPM10(js):
    try:
        pm10 = js['data']['iaqi']['pm10']['v']
        return pm10
    except Exception as e:
        return np.nan

def getPM25(js):
    try:
        pm25 = js['data']['iaqi']['pm25']['v']
        return pm25
    except Exception as e:
        return np.nan

def getNO2(js):
    try:
        no2 = js['data']['iaqi']['no2']['v']
        return no2
    except Exception as e:
        return np.nan

def getO3(js):
    try:
        o3 = js['data']['iaqi']['o3']['v']
        return o3
    except Exception as e:
        return np.nan
    
def getSO2(js):
    try:
        so2 = js['data']['iaqi']['so2']['v']
        return so2
    except Exception as e:
        return np.nan

def getCO(js):
    try:
        co = js['data']['iaqi']['co']['v']
        return co
    except Exception as e:
        return np.nan

In [3]:
# initialise the dataframe to store AQI data
data = pd.DataFrame(columns=['id','status','station','station_lat','station_lng','time','aqi','pm10','pm25','no2','o3','so2','co'])
# initialise a list to store coordinates to check duplicates
coords = []
for i in range(0,11772):#11772
    try:
        url = 'https://api.waqi.info/feed/'\
                +'@'+str(i)\
                +'/?token=c4103eeb230f7e9cd437f3ea4a2fe03216563895'
        js = getJSONR(url)
        # if getSTATUS(js) returns np.nan, re-request the API until it returns valid response
        while not isinstance(getSTATUS(js),str):
            js = getJSONR(url)
        data.loc[i,'id'] = getID(js)
        data.loc[i,'status'] = getSTATUS(js)
        data.loc[i,'station'] = getNAME(js)
        data.loc[i,'station_lat'] = getGEO(js)[0]
        data.loc[i,'station_lng'] = getGEO(js)[1]
        data.loc[i,'time'] = getTIME(js)
        data.loc[i,'aqi'] = getAQI(js)
        data.loc[i,'pm10'] = getPM10(js)
        data.loc[i,'pm25'] = getPM25(js)
        data.loc[i,'no2'] = getNO2(js)
        data.loc[i,'o3'] = getO3(js)
        data.loc[i,'so2'] = getSO2(js)
        data.loc[i,'co'] = getCO(js)
    except Exception as e:
        print(e)
        continue
print(datetime.now())

2019-05-20 09:17:41.753541


In [4]:
data

Unnamed: 0,id,status,station,station_lat,station_lng,time,aqi,pm10,pm25,no2,o3,so2,co
0,0,ok,"Barrie, Ontario, Canada",44.3824,-79.7023,2019-05-20 00:00:00,37,,34,1.4,36.8,,
1,1,ok,"Belleville, Ontario, Canada",44.1505,-77.3955,2019-05-20 00:00:00,59,,59,1.6,28.9,,
2,2,ok,"Brampton, Ontario, Canada",43.6987,-79.7809,2019-05-20 00:00:00,34,,17,3.6,34.4,0.2,2.7
3,3,ok,"Brantford, Ontario, Canada",43.1386,-80.2926,2019-05-20 00:00:00,40,,13,1.1,40,0.6,1.4
4,4,ok,"Burlington, Ontario, Canada",43.3151,-79.8026,2019-05-19 23:00:00,57,,57,7.1,35.2,1.2,1.4
5,5,ok,"Chatham, Ontario, Canada",42.4037,-82.2083,2019-05-20 00:00:00,35,,25,2,35.2,,
6,6,ok,"Cornwall, Ontario, Canada",45.018,-74.7352,2019-05-19 23:00:00,34,,34,3.4,17.6,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11765,11765,ok,"Lahore US Embassy, Pakistan",31.5601,74.3359,2019-05-20 12:00:00,168,,168,,,,
11766,11766,ok,"Hounslow Brentford, United Kingdom",51.4894,-0.310081,2019-05-20 07:00:00,20,20,68,22.4,10.8,0.1,2.9


In [5]:
data.isnull().sum()

id              433
status            0
station         434
station_lat     719
station_lng     719
time            433
aqi             433
pm10           2440
pm25           2593
no2            2662
o3             3028
so2            3623
co             4816
dtype: int64

In [6]:
data['status'].value_counts()

ok      11755
nope       17
Name: status, dtype: int64

In [7]:
data2 = data.dropna(subset=['station_lat','station_lng'])
data2 = data2.reset_index()
data2 = data2.drop(columns=['index'])
data2

Unnamed: 0,id,status,station,station_lat,station_lng,time,aqi,pm10,pm25,no2,o3,so2,co
0,0,ok,"Barrie, Ontario, Canada",44.3824,-79.7023,2019-05-20 00:00:00,37,,34,1.4,36.8,,
1,1,ok,"Belleville, Ontario, Canada",44.1505,-77.3955,2019-05-20 00:00:00,59,,59,1.6,28.9,,
2,2,ok,"Brampton, Ontario, Canada",43.6987,-79.7809,2019-05-20 00:00:00,34,,17,3.6,34.4,0.2,2.7
3,3,ok,"Brantford, Ontario, Canada",43.1386,-80.2926,2019-05-20 00:00:00,40,,13,1.1,40,0.6,1.4
4,4,ok,"Burlington, Ontario, Canada",43.3151,-79.8026,2019-05-19 23:00:00,57,,57,7.1,35.2,1.2,1.4
5,5,ok,"Chatham, Ontario, Canada",42.4037,-82.2083,2019-05-20 00:00:00,35,,25,2,35.2,,
6,6,ok,"Cornwall, Ontario, Canada",45.018,-74.7352,2019-05-19 23:00:00,34,,34,3.4,17.6,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11046,11765,ok,"Lahore US Embassy, Pakistan",31.5601,74.3359,2019-05-20 12:00:00,168,,168,,,,
11047,11766,ok,"Hounslow Brentford, United Kingdom",51.4894,-0.310081,2019-05-20 07:00:00,20,20,68,22.4,10.8,0.1,2.9


In [8]:
data2.loc[:,'time'] = pd.to_datetime(data2.loc[:,'time'], format='%Y-%m-%d %H:%M:%S')
data2

Unnamed: 0,id,status,station,station_lat,station_lng,time,aqi,pm10,pm25,no2,o3,so2,co
0,0,ok,"Barrie, Ontario, Canada",44.3824,-79.7023,2019-05-20 00:00:00,37,,34,1.4,36.8,,
1,1,ok,"Belleville, Ontario, Canada",44.1505,-77.3955,2019-05-20 00:00:00,59,,59,1.6,28.9,,
2,2,ok,"Brampton, Ontario, Canada",43.6987,-79.7809,2019-05-20 00:00:00,34,,17,3.6,34.4,0.2,2.7
3,3,ok,"Brantford, Ontario, Canada",43.1386,-80.2926,2019-05-20 00:00:00,40,,13,1.1,40,0.6,1.4
4,4,ok,"Burlington, Ontario, Canada",43.3151,-79.8026,2019-05-19 23:00:00,57,,57,7.1,35.2,1.2,1.4
5,5,ok,"Chatham, Ontario, Canada",42.4037,-82.2083,2019-05-20 00:00:00,35,,25,2,35.2,,
6,6,ok,"Cornwall, Ontario, Canada",45.018,-74.7352,2019-05-19 23:00:00,34,,34,3.4,17.6,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11046,11765,ok,"Lahore US Embassy, Pakistan",31.5601,74.3359,2019-05-20 12:00:00,168,,168,,,,
11047,11766,ok,"Hounslow Brentford, United Kingdom",51.4894,-0.310081,2019-05-20 07:00:00,20,20,68,22.4,10.8,0.1,2.9


In [9]:
data2['time'].describe()

count                   10641
unique                    837
top       2019-05-20 12:00:00
freq                     1282
first     1970-01-01 00:00:00
last      2019-05-20 20:00:00
Name: time, dtype: object

In [10]:
data2 = data2.loc[(data2['time']>=datetime.now()-timedelta(hours=19)) & (data2['time']<=datetime.now()+timedelta(hours=12))]
data2 = data2.reset_index()
data2 = data2.drop(columns=['index'])
data2

Unnamed: 0,id,status,station,station_lat,station_lng,time,aqi,pm10,pm25,no2,o3,so2,co
0,0,ok,"Barrie, Ontario, Canada",44.3824,-79.7023,2019-05-20 00:00:00,37,,34,1.4,36.8,,
1,1,ok,"Belleville, Ontario, Canada",44.1505,-77.3955,2019-05-20 00:00:00,59,,59,1.6,28.9,,
2,2,ok,"Brampton, Ontario, Canada",43.6987,-79.7809,2019-05-20 00:00:00,34,,17,3.6,34.4,0.2,2.7
3,3,ok,"Brantford, Ontario, Canada",43.1386,-80.2926,2019-05-20 00:00:00,40,,13,1.1,40,0.6,1.4
4,4,ok,"Burlington, Ontario, Canada",43.3151,-79.8026,2019-05-19 23:00:00,57,,57,7.1,35.2,1.2,1.4
5,5,ok,"Chatham, Ontario, Canada",42.4037,-82.2083,2019-05-20 00:00:00,35,,25,2,35.2,,
6,6,ok,"Cornwall, Ontario, Canada",45.018,-74.7352,2019-05-19 23:00:00,34,,34,3.4,17.6,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9116,11765,ok,"Lahore US Embassy, Pakistan",31.5601,74.3359,2019-05-20 12:00:00,168,,168,,,,
9117,11766,ok,"Hounslow Brentford, United Kingdom",51.4894,-0.310081,2019-05-20 07:00:00,20,20,68,22.4,10.8,0.1,2.9


In [11]:
data2['time'].describe()

count                    9123
unique                     30
top       2019-05-20 12:00:00
freq                     1282
first     2019-05-19 15:00:00
last      2019-05-20 20:00:00
Name: time, dtype: object

In [12]:
for i in range(data2.shape[0]):
    data2.loc[i,'coord'] = str(data2.loc[i,'station_lat'])+','+str(data2.loc[i,'station_lng'])
data2

Unnamed: 0,id,status,station,station_lat,station_lng,time,aqi,pm10,pm25,no2,o3,so2,co,coord
0,0,ok,"Barrie, Ontario, Canada",44.3824,-79.7023,2019-05-20 00:00:00,37,,34,1.4,36.8,,,"44.382361,-79.702306"
1,1,ok,"Belleville, Ontario, Canada",44.1505,-77.3955,2019-05-20 00:00:00,59,,59,1.6,28.9,,,"44.150528,-77.3955"
2,2,ok,"Brampton, Ontario, Canada",43.6987,-79.7809,2019-05-20 00:00:00,34,,17,3.6,34.4,0.2,2.7,"43.69875,-79.780917"
3,3,ok,"Brantford, Ontario, Canada",43.1386,-80.2926,2019-05-20 00:00:00,40,,13,1.1,40,0.6,1.4,"43.138611,-80.292639"
4,4,ok,"Burlington, Ontario, Canada",43.3151,-79.8026,2019-05-19 23:00:00,57,,57,7.1,35.2,1.2,1.4,"43.315111,-79.802639"
5,5,ok,"Chatham, Ontario, Canada",42.4037,-82.2083,2019-05-20 00:00:00,35,,25,2,35.2,,,"42.403694,-82.208306"
6,6,ok,"Cornwall, Ontario, Canada",45.018,-74.7352,2019-05-19 23:00:00,34,,34,3.4,17.6,,,"45.017972,-74.735222"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9116,11765,ok,"Lahore US Embassy, Pakistan",31.5601,74.3359,2019-05-20 12:00:00,168,,168,,,,,"31.560078,74.33589"
9117,11766,ok,"Hounslow Brentford, United Kingdom",51.4894,-0.310081,2019-05-20 07:00:00,20,20,68,22.4,10.8,0.1,2.9,"51.489398,-0.310081"


In [13]:
data2['coord'].value_counts()

36.204824,138.252924        8
34.7466,113.625368          4
43.3613953,-5.8593267       4
35.7996715,139.4686101      3
40.842299,111.749138        3
44.933333,26.033333         3
39.32815,117.8165667        2
                           ..
34.853106,118.049933        1
43.8694,125.325             1
44.002896,19.905235         1
36.70194444,119.14          1
36.71627836,119.17448756    1
44.920081,2.441083          1
37.537745,126.63879         1
Name: coord, Length: 9002, dtype: int64

In [14]:
data2.loc[data2['coord'] == '36.204824,138.252924']

Unnamed: 0,id,status,station,station_lat,station_lng,time,aqi,pm10,pm25,no2,o3,so2,co,coord
7581,9951,ok,"甲府市/甲府市役所自排, Japan (日本甲府市役所自排甲府市)",36.2048,138.253,2019-05-20 15:00:00,42,12,42,4.7,33.6,1.5,2.3,"36.204824,138.252924"
7587,9957,ok,"宮崎市/大宮小学校自動車排出ガス測定局, Japan (日本大宮小学校自動車排出ガス測定局宮崎市)",36.2048,138.253,2019-05-20 15:00:00,25,14,25,6.5,33.6,1.5,1.2,"36.204824,138.252924"
7591,9961,ok,"倉敷市/庄, Japan (日本庄倉敷市)",36.2048,138.253,2019-05-20 15:00:00,42,10,42,3.8,32.0,1.5,2.3,"36.204824,138.252924"
7594,9964,ok,"香美市/土佐山田, Japan (日本土佐山田香美市)",36.2048,138.253,2019-05-20 14:00:00,13,10,13,3.8,32.0,1.5,2.3,"36.204824,138.252924"
7595,9965,ok,"大阪市中央区/国設大阪, Japan (日本国設大阪大阪市中央区)",36.2048,138.253,2019-05-20 15:00:00,6,6,5,10.2,32.8,1.5,2.3,"36.204824,138.252924"
7596,9966,ok,"堺市北区/金岡南, Japan (日本金岡南堺市北区)",36.2048,138.253,2019-05-20 15:00:00,13,5,13,4.7,35.2,4.3,2.3,"36.204824,138.252924"
7597,9967,ok,"守口市/西部コミュニティーセンター, Japan (日本西部コミュニティーセンター守口市)",36.2048,138.253,2019-05-20 15:00:00,25,7,25,5.6,37.6,1.5,2.3,"36.204824,138.252924"
7598,9968,ok,"東大阪市/東大阪市六万寺（仮設）, Japan (日本東大阪市六万寺（仮設）東大阪市)",36.2048,138.253,2019-05-20 15:00:00,13,10,13,1.9,37.6,1.5,2.3,"36.204824,138.252924"


In [15]:
data2.loc[data2['coord'].duplicated()].shape[0]

121

In [16]:
# sort the dataframe by time
data2_sorted = data2.sort_values(by=['time'])
data2_sorted = data2_sorted.reset_index()
data2_sorted = data2_sorted.drop(columns=['index'])
# drop the duplicates in column 'coord'  and 'station', and only keep the last (latest) data
data2_sorted.drop_duplicates(subset='coord', keep='last', inplace=True)
data2_sorted.drop_duplicates(subset='station', keep='last', inplace=True)
data2_sorted = data2_sorted.reset_index()
data2_sorted = data2_sorted.drop(columns=['index'])
data2_sorted

Unnamed: 0,id,status,station,station_lat,station_lng,time,aqi,pm10,pm25,no2,o3,so2,co,coord
0,9186,ok,"Laboratorio de Salud, Morelia, MichoacÃ¡n, Mexico",19.692,-101.155,2019-05-19 15:00:00,38,33,38,2.8,,2.9,14,"19.691977777778,-101.15547222222"
1,150,ok,"Alexander Ave, Tacoma, Washington, USA",47.2656,-122.385,2019-05-19 15:00:00,21,,21,13,,,,"47.2656,-122.385"
2,11202,ok,"Debaratana Hospital, Thailand (รพ. เทพรัตนฯ)",18.4982,98.3795,2019-05-19 15:00:00,76,56,76,,,,,"18.498206,98.379513"
3,8301,ok,"Carus - Spangler Rd., Oregon, USA",45.2593,-122.588,2019-05-19 15:00:00,36,9,12,8.4,36.1,0.3,2.3,"45.25928,-122.588151"
4,11319,ok,"Moti Doongri, Alwar, Rajasthan, India",27.5548,76.6115,2019-05-19 16:00:00,161,77,161,10.7,35.4,6.9,4.4,"27.554793,76.611536"
5,11231,ok,"Borazjan, Dashtestan, Bushehr, Iran (بوشهر دشت...",29.2704,51.217,2019-05-19 16:00:00,71,71,6,,,,,"29.2703572,51.2169838"
6,6036,ok,"Strabane Springhill Park, United Kingdom",54.8214,-7.45329,2019-05-19 16:00:00,35,11,34,1.4,34.6,8.2,,"54.821427,-7.453291"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8983,8521,ok,"Wairarapa College, New Zealand",-40.9523,175.647,2019-05-20 19:00:00,51,24,51,,,,,"-40.95230652,175.64661442"
8984,8523,ok,"Wainuiomata Bowling Club, New Zealand",-41.2677,174.954,2019-05-20 19:00:00,59,28,59,,,,,"-41.26771143,174.95375106"


In [17]:
data2_sorted.loc[data2_sorted['coord'] == '36.204824,138.252924']

Unnamed: 0,id,status,station,station_lat,station_lng,time,aqi,pm10,pm25,no2,o3,so2,co,coord
8687,9951,ok,"甲府市/甲府市役所自排, Japan (日本甲府市役所自排甲府市)",36.2048,138.253,2019-05-20 15:00:00,42,12,42,4.7,33.6,1.5,2.3,"36.204824,138.252924"


In [18]:
data2_sorted.isnull().sum()

id                0
status            0
station           0
station_lat       0
station_lng       0
time              0
aqi               0
pm10           1046
pm25           1185
no2            1205
o3             1453
so2            1908
co             2948
coord             0
dtype: int64

In [19]:
data2_sorted['status'].value_counts().sort_values(ascending=False)

ok    8990
Name: status, dtype: int64

In [20]:
data2_sorted['station'].value_counts().sort_values(ascending=False)

Nanba, Guangyuan (广元南坝)                                        1
London Harrow Stanmore, United Kingdom                         1
Carrithers Middle School, Kentucky, USA                        1
Heritage Middle School, San Antonio, Texas                     1
Perai                                                          1
Indera Mahkota, Kuantan, Pahang, Malaysia                      1
Altau, Freizeitpark, Am See 2, Austria                         1
                                                              ..
Greenbluff Rd, Colbert, Washington, USA                        1
Bagongshan District Government, Huainan (淮南八公山区政府)             1
Brent - ARK Franklin Primary Academy, United Kingdom           1
Xindu Local Taxation Bureau, Chengdu (成都新都地税局)                 1
Hata, Fukuoka, Japan (日本松ヶ江観測局北九州市門司区)                         1
Hosu-dong, Ansan-si, Gyeonggi, South Korea (대한민국안산시 호수동 경기)    1
Petrochemical General Plant, Jiujiang (九江石化总厂)                 1
Name: station, Length: 89

In [21]:
data2_sorted['coord'].value_counts().sort_values(ascending=False)

43.1594,124.3711                      1
49.5052795400543,5.976675030211347    1
50.662979166667,14.031243333333       1
39.084158,117.200983                  1
2.723381,101.968497                   1
-35.1377882,138.4983611               1
43.895,81.2867                        1
                                     ..
-34.143888375919,-70.737131207664     1
34.3181,108.6761                      1
30.9414,117.7806                      1
35.9066232,139.6300538                1
12.920984,77.584908                   1
19.357357,-99.262865                  1
32.861139,51.550008                   1
Name: coord, Length: 8990, dtype: int64

In [22]:
stationIDs = data2_sorted[['id']]
stationIDs

Unnamed: 0,id
0,9186
1,150
2,11202
3,8301
4,11319
5,11231
6,6036
...,...
8983,8521
8984,8523


In [23]:
data2_sorted.to_csv('stationIDs_data.csv',index=False)
stationIDs.to_csv('stationIDs.csv',index=False)