In [4]:
import pandas as pd
import requests
import json
import time
import openaq 
from fuzzywuzzy import fuzz


# Folder used by the cells below to read station files and to write API responses.
# This is an absolute Windows path, so it has to be adapted on another machine.
DATA_PATH="C:\\Projets\\WeatherData\\POC\\"

def load_api_keys(path: str = ".env\\API_KEYS.cfg") -> dict:
    """Load API keys from a simple ``NAME = value`` configuration file.

    Each meaningful line holds a key name and its value separated by an ``=``
    sign. Blank lines and lines starting with ``#`` are ignored. Only the first
    ``=`` separates the name from the value, so values may contain ``=``.

    Args:
        path: Location of the configuration file. Defaults to the
            ``API_KEYS.cfg`` file in the ``.env`` folder.

    Returns:
        A dict mapping each stripped key name to its stripped value.

    Raises:
        ValueError: If a non-blank, non-comment line has no ``=`` separator.
        OSError: If the file cannot be opened.
    """
    api_keys = {}
    with open(path) as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            # A trailing newline at the end of the file or a comment must not crash the load.
            if not line or line.startswith("#"):
                continue
            # partition() splits on the first "=" only, so "KEY=abc==" keeps its full value.
            key, separator, val = line.partition("=")
            if not separator:
                raise ValueError(f"{path}, line {line_number}: expected 'NAME=value'")
            api_keys[key.strip()] = val.strip()
    return api_keys

# Keys are loaded once here; later cells read the "GEOCODE_MAPS_CO" and "OPENAQ" entries.
api_keys = load_api_keys()



Free API for geocoding : 
https://openweathermap.org/api/geocoding-api  
https://overpass-api.de/

https://geocode.maps.co/ >> https://geocode.maps.co/search?street=555+5th+Ave&city=New+York&state=NY&postalcode=10017&country=US&api_key=api_key
1 req / sec max 100k / month

In [2]:
# Geocode a city with geocode.maps.co and keep the best (first) match.
city="BORDEAUX"
country="FR"

url_geo="https://geocode.maps.co/search?city="+city+"&country="+country+"&api_key="+api_keys["GEOCODE_MAPS_CO"]

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url_geo, timeout=30)
# Stop here on an HTTP error (bad key, rate limit, ...) instead of failing later on the payload.
response.raise_for_status()

geo_results = response.json()
if not geo_results:
    # An unknown city returns an empty list; indexing it would only raise a bare IndexError.
    raise ValueError(f"No geocoding result for {city=} {country=}")

# get first element in response
geo_data = geo_results[0]
# get display_name, lat, lon and importance
# lat and lon are kept exactly as returned: the OpenAQ cell concatenates them into its URL.
display_name = geo_data["display_name"]
lat = geo_data["lat"]
lon = geo_data["lon"]
importance = geo_data["importance"]

print(f"{display_name} : lat={lat}, lon={lon}, importance={importance}")
if importance<0.5:
    print("Warning: importance is low, the location may be incorrect")


Bordeaux, Gironde, Nouvelle-Aquitaine, Metropolitan France, France : lat=44.841225, lon=-0.5800364, importance=0.8740050666982948


Get air quality with https://openaq.org/
1-get station : https://api.openaq.org/v2/locations?
2- get measurements : https://api.openaq.org/v2/measurements

and write in a file

In [None]:
# Look for OpenAQ locations around the geocoded point and keep the raw answer on disk.
# The radius is passed as-is in the `radius` query parameter.
radius="20000"

# NOTE(review): OpenAQ has been moving clients from the v2 endpoints to v3 — confirm that
# this v2 URL is still served before relying on this cell.
url_rech = "https://api.openaq.org/v2/locations?country="+country+"&coordinates="+lat+","+lon+"&radius="+radius+"&limit=20&page=1&offset=0&sort=asc&order_by=city"

headers = {"accept": "application/json"}

response = requests.get(url_rech, headers=headers, timeout=30)
# Without this check an error body would be saved as if it were data, and the
# parsing cell would only fail later with a KeyError on "results".
response.raise_for_status()

json_file_path = DATA_PATH +city+ "_response.json"

# Write the raw response text to the JSON file
with open(json_file_path, "w",encoding='utf-8') as file:
    file.write(response.text)



parse the response to build a dataframe

In [15]:
# Turn the OpenAQ locations answer into a one-row DataFrame describing the chosen location.
response_json = response.json()
results = response_json["results"]

# Only these fields of each location record are kept.
location_columns = [
    "id", "city", "country", "firstUpdated", "lastUpdated",
    "measurements", "coordinates", "parameters",
]
df = pd.DataFrame(results, columns=location_columns)

# Flatten the coordinates dict into two columns, parse the first-update timestamp,
# then rank locations: oldest firstUpdated first, and for equal dates the highest
# number of measurements first. Only the top-ranked row is kept.
df = (
    df.assign(
        latitude=df["coordinates"].apply(lambda coords: coords["latitude"]),
        longitude=df["coordinates"].apply(lambda coords: coords["longitude"]),
        firstUpdated=pd.to_datetime(df["firstUpdated"]),
    )
    .drop(columns=["coordinates"])
    .sort_values(by=["firstUpdated", "measurements"], ascending=[True, False])
    .reset_index(drop=True)
    .head(1)
)
print(df)

     id                     city country              firstUpdated  \
0  3572  ATMO NOUVELLE-AQUITAINE      FR 2016-11-21 11:00:00+00:00   

                 lastUpdated  measurements  \
0  2024-03-04T07:00:00+00:00        211132   

                                          parameters   latitude  longitude  
0  [{'id': 1, 'unit': 'µg/m³', 'count': 44602, 'a...  44.900276  -0.514722  


Call the averages endpoint, loop over the paginated results, and write all the responses to a file

In [25]:
# Download the daily averages of the selected location, page by page, and save them as JSON.
location_id=str(df["id"][0])
date_from="2024-01-01T00:00:00Z"
date_to="2024-04-30T00:00:00Z"
limit="200"
# Warning (author's note): a date range of more than 1 year ends in a timeout.

headers = {"accept": "application/json"}

page = 1
more=True

# Results of every page are accumulated here
all_results = []

while more:
    # Construct the API URL with the current page and limit
    url_rech = f"https://api.openaq.org/v2/averages?temporal=day&date_from={date_from}&date_to={date_to}&locations_id={location_id}&spatial=location&limit={limit}&page={page}"
    print(url_rech)
    # Send the request to the API, with the same headers as the locations request
    response = requests.get(url_rech, headers=headers, timeout=60)

    if response.status_code != 200:
        # Leave the loop on failure: previously neither `page` nor `more` changed here,
        # so the same failing URL was requested forever, without any pause.
        print(f"Error: {response.status_code}")
        break

    # Get the JSON data from the response and collect its results
    data = response.json()
    results = data["results"]
    all_results.extend(results)

    if len(results)>0:
        # A non-empty page means there may be another one
        page += 1
        # sleep 1 second to avoid being blocked
        time.sleep(1)
    else:
        # An empty page marks the end of the pagination
        more=False

# Print the total number of results
print(f"Total results: {len(all_results)}")

json_file_path = DATA_PATH +city+ "_avg_bdx_response.json"

with open(json_file_path, "w",encoding='utf-8') as file:
    json.dump(all_results, file, ensure_ascii=False, indent=4)


https://api.openaq.org/v2/averages?temporal=day&date_from=2024-01-01T00:00:00Z&date_to=2024-04-30T00:00:00Z&locations_id=3572&spatial=location&limit=200&page=1
https://api.openaq.org/v2/averages?temporal=day&date_from=2024-01-01T00:00:00Z&date_to=2024-04-30T00:00:00Z&locations_id=3572&spatial=location&limit=200&page=2
https://api.openaq.org/v2/averages?temporal=day&date_from=2024-01-01T00:00:00Z&date_to=2024-04-30T00:00:00Z&locations_id=3572&spatial=location&limit=200&page=3
Total results: 204


Same but with the Python wrapper OpenAQ

In [28]:
# Fetch the measurements of the selected location through the OpenAQ Python client.
openaq_key=api_keys["OPENAQ"]
location_id=str(df["id"][0])
date_from="2024-01-01"
date_to="2024-04-30"

# The notebook imports the package as `import openaq`, so the client class has to be
# reached through the module; a bare `OpenAQ` is a NameError on a fresh kernel.
client = openaq.OpenAQ(api_key=openaq_key)
try:
    response = client.measurements.list(locations_id=location_id, date_from=date_from, date_to=date_to)
finally:
    # Release the client even when the request raises.
    client.close()

data_dict = response.dict()
# NOTE(review): this rebinds `df`, which held the locations table. Re-running this cell
# on its own will then likely fail to read the location id from `df` — confirm whether
# a separate name would be preferable.
df = pd.json_normalize(data_dict['results'])
print(df.head())

   value coordinates period.label period.interval   period.datetime_from.utc  \
0    7.3        None        1hour        01:00:00  2023-12-31T23:00:00+00:00   
1   15.0        None        1hour        01:00:00  2023-12-31T23:00:00+00:00   
2   22.0        None        1hour        01:00:00  2023-12-31T23:00:00+00:00   
3   61.0        None        1hour        01:00:00  2023-12-31T23:00:00+00:00   
4    0.4        None        1hour        01:00:00  2023-12-31T23:00:00+00:00   

  period.datetime_from.local     period.datetime_to.utc  \
0  2024-01-01T00:00:00+01:00  2024-01-01T00:00:00+00:00   
1  2024-01-01T00:00:00+01:00  2024-01-01T00:00:00+00:00   
2  2024-01-01T00:00:00+01:00  2024-01-01T00:00:00+00:00   
3  2024-01-01T00:00:00+01:00  2024-01-01T00:00:00+00:00   
4  2024-01-01T00:00:00+01:00  2024-01-01T00:00:00+00:00   

    period.datetime_to.local  parameter.id parameter.name  ...  \
0  2024-01-01T01:00:00+01:00             5            no2  ...   
1  2024-01-01T01:00:00+01:00    

Get data from NOAA
Bordeaux station: FR000007510

In [2]:
import pandas as pd

# Fixed-width layout of the station inventory file: (column name, width in characters).
station_layout = [
    ("station_id", 11),
    ("latitude", 9),
    ("longitude", 10),
    ("elevation", 7),
    ("state", 4),
    ("name", 31),
    ("gsn_flag", 4),
    ("hcn_crn_flag", 4),
    ("wmo_id", 5),
]

# The inventory is read from DATA_PATH; the file has no header row.
file_station = "ghcnd-stations.txt"
df_stations = pd.read_fwf(
    DATA_PATH + file_station,
    header=None,
    widths=[width for _, width in station_layout],
    names=[column for column, _ in station_layout],
)
print(df_stations)

         station_id  latitude  longitude  elevation state  \
0       ACW00011604   17.1167   -61.7833       10.1   NaN   
1       ACW00011647   17.1333   -61.7833       19.2   NaN   
2       AE000041196   25.3330    55.5170       34.0   NaN   
3       AEM00041194   25.2550    55.3640       10.4   NaN   
4       AEM00041217   24.4330    54.6510       26.8   NaN   
...             ...       ...        ...        ...   ...   
125983  ZI000067969  -21.0500    29.3670      861.0   NaN   
125984  ZI000067975  -20.0670    30.8670     1095.0   NaN   
125985  ZI000067977  -21.0170    31.5830      430.0   NaN   
125986  ZI000067983  -20.2000    32.6160     1132.0   NaN   
125987  ZI000067991  -22.2170    30.0000      457.0   NaN   

                         name gsn_flag hcn_crn_flag   wmo_id  
0       ST JOHNS COOLIDGE FLD      NaN          NaN      NaN  
1                    ST JOHNS      NaN          NaN      NaN  
2         SHARJAH INTER. AIRP      GSN          NaN  41196.0  
3              

In [3]:
from fuzzywuzzy import fuzz
# TODO
# UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
#   warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')

city="BORDEAUX"
country="FR"


# Keep only the stations whose station_id starts with the country code.
# NOTE(review): GHCN station ids are prefixed with FIPS country codes, which differ from
# ISO codes for some countries (e.g. Germany is "GM") — confirm `country` follows that scheme.
filtered_stations = df_stations[df_stations['station_id'].str.startswith(country, na=False)].copy()
if filtered_stations.empty:
    # idxmax() on an empty column fails with an unrelated-looking error; say what went wrong.
    raise ValueError(f"No station id starts with {country=}")

# Score every station name against the city name. fuzz.ratio returns a similarity
# score where higher means closer, so despite its historical name the 'distance'
# column holds a similarity. Missing names are scored as empty strings.
filtered_stations['distance'] = filtered_stations['name'].fillna("").apply(lambda x: fuzz.ratio(x, city))

# The best match is the station with the highest similarity score
closest_city = filtered_stations.loc[filtered_stations['distance'].idxmax()]

print(f"The closest city to {city=} {country=} is {closest_city['name']} : station_id={closest_city['station_id']}")


The closest city to city='BORDEAUX' country='FR' is BORDEAUX-MERIGNAC : station_id=FR000007510




In [5]:
# Download the daily data of the matched station into a dated local file
station_id = closest_city['station_id']
today_YYYYMMDD=time.strftime("%Y%m%d")
local_file_name=f"{station_id}_{today_YYYYMMDD}.csv.gz"

# Two layouts are published per station; only the gzipped CSV is downloaded here.
url_by_station = f"https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_station/{station_id}.csv.gz"
url_all = f"https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/all/{station_id}.dly"

# Download the data from the station and write it to a local file
response = requests.get(url_by_station, timeout=120)
# Without this check a 404/500 error page would be saved under a .csv.gz name and
# only fail later, when pandas tries to decompress it.
response.raise_for_status()
with open(DATA_PATH+local_file_name, 'wb') as f:
    f.write(response.content)

In [6]:
# Load the downloaded station file (no header row) into a DataFrame, parsing the
# "date" column as dates, then show a small sample of rows.
station_csv_columns = [
    "station_id", "date", "element", "value",
    "m_flag", "q_flag", "s_flag", "obs_time",
]
df = pd.read_csv(
    DATA_PATH + local_file_name,
    header=None,
    names=station_csv_columns,
    parse_dates=["date"],
)
print(df.iloc[1000:1005])

       station_id       date element  value m_flag q_flag s_flag  obs_time
1000  FR000007510 1903-09-28    PRCP     40    NaN    NaN      I       NaN
1001  FR000007510 1903-09-29    PRCP      0    NaN    NaN      I       NaN
1002  FR000007510 1903-09-30    PRCP      0    NaN    NaN      I       NaN
1003  FR000007510 1903-10-01    PRCP     70    NaN    NaN      I       NaN
1004  FR000007510 1903-10-02    PRCP     30    NaN    NaN      I       NaN


SAME FOR PHUKET without useless code

In [2]:
# --- 1. Geocode the city with geocode.maps.co ---
city="PHUKET"
country="TH"

url_geo="https://geocode.maps.co/search?city="+city+"&country="+country+"&api_key="+api_keys["GEOCODE_MAPS_CO"]

response = requests.get(url_geo, timeout=30)
# Stop on HTTP errors (bad key, rate limit, ...) instead of failing later on the payload.
response.raise_for_status()

geo_results = response.json()
if not geo_results:
    # An unknown city returns an empty list; indexing it would only raise a bare IndexError.
    raise ValueError(f"No geocoding result for {city=} {country=}")

# get first element in response
geo_data = geo_results[0]
# get display_name, lat, lon and importance (lat/lon are concatenated into the URL below)
display_name = geo_data["display_name"]
lat = geo_data["lat"]
lon = geo_data["lon"]
importance = geo_data["importance"]

print(f"{display_name} : lat={lat}, lon={lon}, importance={importance}")
if importance<0.5:
    print("Warning: importance is low, the location may be incorrect")

# --- 2. Search the OpenAQ locations around that point ---
radius="20000"

url_rech = "https://api.openaq.org/v2/locations?country="+country+"&coordinates="+lat+","+lon+"&radius="+radius+"&limit=20&page=1&offset=0&sort=asc&order_by=city"

headers = {"accept": "application/json"}

response = requests.get(url_rech, headers=headers, timeout=30)
# An error body must not be saved and parsed as if it were data.
response.raise_for_status()

json_file_path = DATA_PATH +city+ "_response.json"

# Write the raw response text to the JSON file
with open(json_file_path, "w",encoding='utf-8') as file:
    file.write(response.text)

response_json = response.json()
results = response_json["results"]
if not results:
    # Otherwise the failure would only surface below as a KeyError on df_aq["id"][0].
    raise ValueError(f"No OpenAQ location found around {city=} {country=} with {radius=}")

# create a DataFrame with interesting columns
df_aq = pd.DataFrame(results, columns=["id", "city", "country", "firstUpdated", "lastUpdated", "measurements", "coordinates", "parameters"])
df_aq["latitude"] = df_aq["coordinates"].apply(lambda x: x["latitude"])
df_aq["longitude"] = df_aq["coordinates"].apply(lambda x: x["longitude"])
df_aq = df_aq.drop(columns=["coordinates"])

# Rank the locations: oldest firstUpdated first, then the biggest measurements count.
# All candidates are kept in df_aq; the first row is the one queried below.
df_aq["firstUpdated"] = pd.to_datetime(df_aq["firstUpdated"])
df_aq = df_aq.sort_values(by=["firstUpdated","measurements"], ascending=[True,False],)
df_aq = df_aq.reset_index(drop=True)
print(df_aq.head())

# --- 3. Fetch the measurements of the top-ranked location with the OpenAQ client ---
openaq_key=api_keys["OPENAQ"]
location_id=str(df_aq["id"][0])
date_from="2024-04-01"
date_to="2024-04-30"

client = openaq.OpenAQ(api_key=openaq_key)
try:
    response = client.measurements.list(locations_id=location_id, date_from=date_from, date_to=date_to)
finally:
    # The client was never closed in this cell (the Bordeaux cell does close it).
    client.close()
data_dict = response.dict()
df_aq_res = pd.json_normalize(data_dict['results'])

print(df_aq_res[10:15])

Phuket, Ratsada, Mueang Phuket, Phuket Province, 83000, Thailand : lat=7.8847901, lon=98.3891503, importance=0.6309396794654166
        id    city country              firstUpdated  \
0  1236044    None      TH 2024-06-13 21:00:00+00:00   
1  2827393    None      TH 2024-06-13 21:00:00+00:00   
2  2827392    None      TH 2024-06-13 21:00:00+00:00   
3   225605  Phuket      TH 2024-06-13 22:00:00+00:00   
4  2926351    None      TH 2024-07-01 12:00:00+00:00   

                 lastUpdated  measurements  \
0  2024-07-22T09:00:00+00:00          4602   
1  2024-07-22T09:00:00+00:00          2427   
2  2024-06-15T12:00:00+00:00           120   
3  2024-07-22T08:00:00+00:00          1227   
4  2024-07-22T09:00:00+00:00          2700   

                                          parameters  latitude  longitude  
0  [{'id': 100, 'unit': 'c', 'count': 767, 'avera...  8.028639  98.354412  
1  [{'id': 100, 'unit': 'c', 'count': 809, 'avera...  7.913188  98.385946  
2  [{'id': 100, 'unit': 'c', '

In [5]:

# TODO
# UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
#   warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')

city="PHUKET"
country="TH"

# Load the fixed-width station inventory (no header row) from DATA_PATH
file_station = "ghcnd-stations.txt"
df_stations = pd.read_fwf(DATA_PATH+file_station, header=None,widths=[11,9,10,7,4,31,4,4,5], names=["station_id", "latitude", "longitude", "elevation", "state", "name", "gsn_flag", "hcn_crn_flag", "wmo_id"])

# Keep only the stations whose station_id starts with the country code.
# NOTE(review): GHCN station ids are prefixed with FIPS country codes, which differ from
# ISO codes for some countries (e.g. Germany is "GM") — confirm `country` follows that scheme.
filtered_stations = df_stations[df_stations['station_id'].str.startswith(country, na=False)].copy()
if filtered_stations.empty:
    # idxmax() on an empty column fails with an unrelated-looking error; say what went wrong.
    raise ValueError(f"No station id starts with {country=}")

# Score every station name against the city name. fuzz.ratio returns a similarity
# score where higher means closer, so despite its historical name the 'distance'
# column holds a similarity. Missing names are scored as empty strings.
filtered_stations['distance'] = filtered_stations['name'].fillna("").apply(lambda x: fuzz.ratio(x, city))

# The best match is the station with the highest similarity score
closest_city = filtered_stations.loc[filtered_stations['distance'].idxmax()]

print(f"The closest city to {city=} {country=} is {closest_city['name']} : station_id={closest_city['station_id']}")

# Download the daily data of the matched station into a dated local file
station_id = closest_city['station_id']
today_YYYYMMDD=time.strftime("%Y%m%d")
local_file_name=f"{station_id}_{today_YYYYMMDD}.csv.gz"

# Two layouts are published per station; only the gzipped CSV is downloaded here.
url_by_station = f"https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_station/{station_id}.csv.gz"
url_all = f"https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/all/{station_id}.dly"

# Download the data from the station and write it to a local file
response = requests.get(url_by_station, timeout=120)
# Without this check a 404/500 error page would be saved under a .csv.gz name and
# only fail below, when pandas tries to decompress it.
response.raise_for_status()
with open(DATA_PATH+local_file_name, 'wb') as f:
    f.write(response.content)

# Load the downloaded file (no header row) and show a small sample of rows
df = pd.read_csv(DATA_PATH+local_file_name, header=None, names=["station_id", "date", "element", "value", "m_flag", "q_flag", "s_flag", "obs_time"], parse_dates=["date"])
print(df[1000:1005])

The closest city to city='PHUKET' country='TH' is PHUKET : station_id=TH000048564


  df = pd.read_csv(DATA_PATH+local_file_name, header=None, names=["station_id", "date", "element", "value", "m_flag", "q_flag", "s_flag", "obs_time"], parse_dates=["date"])


       station_id       date element  value m_flag q_flag s_flag  obs_time
1000  TH000048564 1952-01-17    TMAX    325    NaN    NaN      I       NaN
1001  TH000048564 1952-01-18    TMAX    317    NaN    NaN      I       NaN
1002  TH000048564 1952-01-19    TMAX    325    NaN    NaN      I       NaN
1003  TH000048564 1952-01-20    TMAX    321    NaN    NaN      I       NaN
1004  TH000048564 1952-01-21    TMAX    305    NaN    NaN      I       NaN
