# API requests - Weather data collector

In [184]:
import numpy as np
import pandas as pd
import requests
import time
import os
import logging
from dotenv import load_dotenv # Use if you load your OpenWeatherMap API key from a .env file
from datetime import datetime
from pprint import pprint
from sklearn.preprocessing import MinMaxScaler
pd.options.display.max_columns = 100

## Goal and perimeter

Starting from the list of top 35 cities to visit in France given [here](https://one-week-in.com/35-cities-to-visit-in-france/), we'll collect weather data to identify a short-list of top 5 cities, and store it into a csv file on Amazon S3.

In [150]:
cities = ["Mont Saint Michel",
"St Malo",
"Bayeux",
"Le Havre",
"Rouen",
"Paris",
"Amiens",
"Lille",
"Strasbourg",
"Chateau du Haut Koenigsbourg",
"Colmar",
"Eguisheim",
"Besancon",
"Dijon",
"Annecy",
"Grenoble",
"Lyon",
"Gorges du Verdon",
"Bormes les Mimosas",
"Cassis",
"Marseille",
"Aix en Provence",
"Avignon",
"Uzes",
"Nimes",
"Aigues Mortes",
"Saintes Maries de la mer",
"Collioure",
"Carcassonne",
"Ariege",
"Toulouse",
"Montauban",
"Biarritz",
"Bayonne",
"La Rochelle"]

## 1. Collect geolocalization data from [Nominatim/OpenStreetMap API](https://nominatim.org/)

### 1.1. Example for 1 single city

In [171]:
# Showing response json

city_0 = cities[0]
r = requests.get('https://nominatim.openstreetmap.org/search?format=json&q=France,'+city_0) # We add France in query to be sure to be specific enough
                                                                                          # and avoid ambiguity for cities with the same name (ex : Paris, France and Paris, Arkansas)
city_json_0 = r.json()[0] # We consider the first search result as the most relevant one
pprint(city_json_0)

{'boundingbox': ['48.6349172', '48.637031', '-1.5133292', '-1.5094796'],
 'class': 'tourism',
 'display_name': 'Mont Saint-Michel, Plateforme du Saut-\xadGaultier, Le '
                 'Mont-Saint-Michel, Avranches, Manche, Normandie, France '
                 'métropolitaine, 50170, France',
 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_point_of_interest.p.20.png',
 'importance': 0.8654365567815739,
 'lat': '48.6359541',
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. '
            'https://osm.org/copyright',
 'lon': '-1.511459954959514',
 'osm_id': 211285890,
 'osm_type': 'way',
 'place_id': 156094680,
 'type': 'attraction'}


_Note : we used the free-form query `q=<query>` instead of the more specific query `city=<city>` because of some elements of our "cities" list that are not actually cities (like "Gorges du Verdon")_  

In [152]:
# Extracting relevant information

name = city_json_0['display_name']
lat = city_json_0['lat']
lon = city_json_0['lon']
print(f"{name} : \n -- latitude : {lat} \n -- longitude : {lon}")

Mont Saint-Michel, Plateforme du Saut-­Gaultier, Le Mont-Saint-Michel, Avranches, Manche, Normandie, France métropolitaine, 50170, France : 
 -- latitude : 48.6359541 
 -- longitude : -1.511459954959514


### 1.2. Generalization to the cities list

In [153]:
# Extracting response jsons

geolocs_info = []

print("Collecting localization data for ")
for city in cities:
    r = requests.get('https://nominatim.openstreetmap.org/search?format=json&q=France,'+city) #We add 'France' in the query to be more specific
    city_json = r.json()[0]
    city_json['city'] = city # Adding a key with the name used in the initial cities list
    print(city, end="...")
    geolocs_info.append(city_json)
    time.sleep(1) # wait 1sec between requests to respect Nominatim's Usage Policy (https://operations.osmfoundation.org/policies/nominatim/)
print("\nDone")

Collecting localization data for 
Mont Saint Michel...St Malo...Bayeux...Le Havre...Rouen...Paris...Amiens...Lille...Strasbourg...Chateau du Haut Koenigsbourg...Colmar...Eguisheim...Besancon...Dijon...Annecy...Grenoble...Lyon...Gorges du Verdon...Bormes les Mimosas...Cassis...Marseille...Aix en Provence...Avignon...Uzes...Nimes...Aigues Mortes...Saintes Maries de la mer...Collioure...Carcassonne...Ariege...Toulouse...Montauban...Biarritz...Bayonne...La Rochelle...
Done


In [170]:
pprint(geolocs_info[:3])

[{'boundingbox': ['48.6349172', '48.637031', '-1.5133292', '-1.5094796'],
  'city': 'Mont Saint Michel',
  'class': 'tourism',
  'display_name': 'Mont Saint-Michel, Plateforme du Saut-\xadGaultier, Le '
                  'Mont-Saint-Michel, Avranches, Manche, Normandie, France '
                  'métropolitaine, 50170, France',
  'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_point_of_interest.p.20.png',
  'importance': 0.8654365567815739,
  'lat': '48.6359541',
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. '
             'https://osm.org/copyright',
  'lon': '-1.511459954959514',
  'osm_id': 211285890,
  'osm_type': 'way',
  'place_id': 156094680,
  'type': 'attraction'},
 {'boundingbox': ['48.5979853', '48.6949736', '-2.0765246', '-1.9367259'],
  'city': 'St Malo',
  'class': 'boundary',
  'display_name': 'Saint-Malo, Ille-et-Vilaine, Bretagne, France '
                  'métropolitaine, 35400, France',
  'icon': 'https://nominatim.openstreetmap.org/ui/map

In [155]:
pd.DataFrame(geolocs_info).loc[:,['city','display_name']].T # checking for each city that the response corresponds to what expected

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
city,Mont Saint Michel,St Malo,Bayeux,Le Havre,Rouen,Paris,Amiens,Lille,Strasbourg,Chateau du Haut Koenigsbourg,Colmar,Eguisheim,Besancon,Dijon,Annecy,Grenoble,Lyon,Gorges du Verdon,Bormes les Mimosas,Cassis,Marseille,Aix en Provence,Avignon,Uzes,Nimes,Aigues Mortes,Saintes Maries de la mer,Collioure,Carcassonne,Ariege,Toulouse,Montauban,Biarritz,Bayonne,La Rochelle
display_name,"Mont Saint-Michel, Plateforme du Saut-­Gaultie...","Saint-Malo, Ille-et-Vilaine, Bretagne, France ...","Bayeux, Calvados, Normandie, France métropolit...","Le Havre, Seine-Maritime, Normandie, France mé...","Rouen, Seine-Maritime, Normandie, France métro...","Paris, Île-de-France, France métropolitaine, F...","Amiens, Somme, Hauts-de-France, France métropo...","Lille, Nord, Hauts-de-France, France métropoli...","Strasbourg, Bas-Rhin, Grand Est, France métrop...","Château du Haut-Kœnigsbourg, Chemin fermé suit...","Colmar, Colmar-Ribeauvillé, Haut-Rhin, Grand E...","Eguisheim, Colmar-Ribeauvillé, Haut-Rhin, Gran...","Besançon, Doubs, Bourgogne-Franche-Comté, Fran...","Dijon, Côte-d'Or, Bourgogne-Franche-Comté, Fra...","Annecy, Haute-Savoie, Auvergne-Rhône-Alpes, Fr...","Grenoble, Isère, Auvergne-Rhône-Alpes, France ...","Lyon, Métropole de Lyon, Circonscription dépar...","Gorges du Verdon, Route des Crêtes, Les Ferrai...","Bormes-les-Mimosas, Toulon, Var, Provence-Alpe...","Cassis, Marseille, Bouches-du-Rhône, Provence-...","Marseille, Bouches-du-Rhône, Provence-Alpes-Cô...","Aix-en-Provence, Bouches-du-Rhône, Provence-Al...","Avignon, Vaucluse, Provence-Alpes-Côte d'Azur,...","Uzès, Nîmes, Gard, Occitanie, France métropoli...","Nîmes, Gard, Occitanie, France métropolitaine,...","Aigues-Mortes, Nîmes, Gard, Occitanie, France ...","Saintes-Maries-de-la-Mer, Arles, Bouches-du-Rh...","Collioure, Céret, Pyrénées-Orientales, Occitan...","Carcassonne, Aude, Occitanie, France métropoli...","Ariège, Occitanie, France métropolitaine, France","Toulouse, Haute-Garonne, Occitanie, France mét...","Montauban, Tarn-et-Garonne, Occitanie, France ...","Biarritz, Bayonne, Pyrénées-Atlantiques, Nouve...","Bayonne, Pyrénées-Atlantiques, Nouvelle-Aquita...","La Rochelle, Charente-Maritime, Nouvelle-Aquit..."


In [156]:
# Formatting the data into a cleaned Pandas dataframe : 
geolocs_df = pd.DataFrame(geolocs_info).drop(columns = ['licence','osm_type','osm_id', 'boundingbox', 'importance', 'display_name', 'icon', 'class', 'type'])
geolocs_df = geolocs_df.rename(columns = {'place_id':'nominatim_place_id'}) # we will keep the nominatim place id as unique identifier for each city troughout the rest of this project
geolocs_df.insert(1, 'city', geolocs_df.pop('city'))
display(geolocs_df)

Unnamed: 0,nominatim_place_id,city,lat,lon
0,156094680,Mont Saint Michel,48.6359541,-1.511459954959514
1,297756747,St Malo,48.649518,-2.0260409
2,297981358,Bayeux,49.2764624,-0.7024738
3,298137491,Le Havre,49.4938975,0.1079732
4,297518815,Rouen,49.4404591,1.0939658
5,297417241,Paris,48.8588897,2.3200410217200766
6,297534793,Amiens,49.8941708,2.2956951
7,297472400,Lille,50.6365654,3.0635282
8,297508568,Strasbourg,48.584614,7.7507127
9,120791766,Chateau du Haut Koenigsbourg,48.249489800000006,7.34429620253195


In [157]:
place_id = 'nominatim_place_id' # storing the name of the cities' unique identifier column for later use as key for following dataframes

## 2. Collect weather data from [OpenWeatherMap API](https://openweathermap.org/api/one-call-api)

In [169]:
# Loading OpenWeatherMap API key (obtained after subscription) loaded in .env file - replace with your own key

load_dotenv() # set the environment variables from .env file
api_key = os.getenv('api_key') # API_key obtained after free subscription

# Showing response json 
 
units = 'metric'
lat_0, lon_0 = (geolocs_df[_][0] for _ in ['lat','lon'])
r = requests.get(f'https://api.openweathermap.org/data/2.5/onecall?lat={lat_0}&lon={lon_0}&appid={api_key}&units={units}')
weather_json_0 = r.json() # We consider the first search result as the most relevant one
weather_json_0[place_id]=geolocs_df[place_id][0] # Adding city unique identifier in json
print(weather_json_0.keys())
pprint(weather_json_0)

dict_keys(['lat', 'lon', 'timezone', 'timezone_offset', 'current', 'minutely', 'hourly', 'daily', 'nominatim_place_id'])
{'current': {'clouds': 53,
             'dew_point': 12.26,
             'dt': 1659629542,
             'feels_like': 21.05,
             'humidity': 56,
             'pressure': 1017,
             'sunrise': 1659588232,
             'sunset': 1659642023,
             'temp': 21.39,
             'uvi': 2.16,
             'visibility': 10000,
             'weather': [{'description': 'broken clouds',
                          'icon': '04d',
                          'id': 803,
                          'main': 'Clouds'}],
             'wind_deg': 319,
             'wind_gust': 7.04,
             'wind_speed': 6.71},
 'daily': [{'clouds': 15,
            'dew_point': 11.98,
            'dt': 1659614400,
            'feels_like': {'day': 21.63,
                           'eve': 19.93,
                           'morn': 17.41,
                           'night': 16.86},
 

### 2.1. Data exploration on 1 single city

We decide to focus on 'daily' informations to get a **one-week forecast** of weather informations (from the day D when we made the API request to the day D+7).  

In [159]:
print("Number of days for which weather data is collected :", len(weather_json_0['daily']))

Number of days for which weather data is collected : 8


In [160]:
# Testing with the 'current' datetime (corresponding to the time at which we requested the OpenWeatherMap API)

# The dates ('dt') are in UNIX (=POSIX) format (number of seconds since 1st January 1970).
# To convert it to datetime format we can use datetime.fromtimestamp method. 
# Note : the times are given in UTC timezone
current_datetime_example = datetime.fromtimestamp(weather_json_0['current']['dt'])
first_datetime_example = datetime.fromtimestamp(weather_json_0['daily'][0]['dt'])
last_datetime_example = datetime.fromtimestamp(weather_json_0['daily'][-1]['dt'])
print("Datetime of the API request: ", current_datetime_example)
print("First datetime covered for daily info: ", first_datetime_example)
print("Last datetime covered for daily info: ", last_datetime_example)

Datetime of the API request:  2022-08-04 17:49:48
First datetime covered for daily info:  2022-08-04 14:00:00
Last datetime covered for daily info:  2022-08-11 14:00:00


In [161]:
print("Daily informations available : \n", list(weather_json_0['daily'][0].keys()))

Daily informations available : 
 ['dt', 'sunrise', 'sunset', 'moonrise', 'moonset', 'moon_phase', 'temp', 'feels_like', 'pressure', 'humidity', 'dew_point', 'wind_speed', 'wind_deg', 'wind_gust', 'weather', 'clouds', 'pop', 'uvi']


To get a ranking of the cities with the most pleasant weather in the upcoming days, we choose to focus on these criteria : 
- lowest difference (absolute) between mean of the perceived temperature (`'feels_like.day'`) and an ideal temperature we choose
- lowest mean percentage of precipitations (`'pop'`)

In [162]:
# Storing filtered weather data in a Dataframe

criteria = {
    'feels_like.day': np.mean,
    'pop': np.mean
} # dictionnary associating each criteria with its aggregation function to apply along days when grouping by city
keys_to_keep = [place_id] + list(criteria.keys())
weather_df_0 = pd.json_normalize(weather_json_0, record_path = ['daily'], meta = [place_id]).loc[:,keys_to_keep]
weather_df_0

Unnamed: 0,nominatim_place_id,feels_like.day,pop
0,156094680,21.63,0.02
1,156094680,21.84,0.0
2,156094680,25.02,0.0
3,156094680,26.16,0.0
4,156094680,27.95,0.0
5,156094680,28.38,0.0
6,156094680,30.27,0.0
7,156094680,31.39,0.0


In [163]:
# aggregating data to obtain one single observation by city

weather_df_0 = weather_df_0.groupby(place_id).agg(criteria)
weather_df_0

Unnamed: 0_level_0,feels_like.day,pop
nominatim_place_id,Unnamed: 1_level_1,Unnamed: 2_level_1
156094680,26.58,0.0025


In [164]:
# summarizing localization and weather info in one dataframe

city_weather_df_0 = geolocs_df.merge(weather_df_0, on = [place_id])
city_weather_df_0

Unnamed: 0,nominatim_place_id,city,lat,lon,feels_like.day,pop
0,156094680,Mont Saint Michel,48.6359541,-1.511459954959514,26.58,0.0025


### 2.2. Aggregate data for all cities

We're going to generalize the previous steps to obtain a unique dataframe summarizing localization and weather data we need to build our map of top 5 cities. 

In [185]:
city_jsons = []
for i, city  in enumerate(cities):    
    lat, lon = (geolocs_df[_][i] for _ in ['lat','lon'])
    r = requests.get(f'https://api.openweathermap.org/data/2.5/onecall?lat={lat}&lon={lon}&appid={api_key}&units={units}')
    weather_json = r.json() # We consider the first search result as the most relevant one
    weather_json[place_id]=geolocs_df[place_id][i] # Adding city unique identifier in json
    city_jsons.append(weather_json)
    
weather_df = pd.json_normalize(city_jsons, record_path = ['daily'], meta = [place_id]).loc[:,keys_to_keep]
weather_df = weather_df.groupby(place_id).agg(criteria)


## 3. Final processings and loading to S3

We know process the dataframe in order to set an automatical ranking from the criteria chosen in the previous part.
**To take into account the temperature criteria as well as the rain criteria, with an equal importance between the 2, we will scale these data and combine it to create a score to minimize.** 

In [190]:
IDEAL_TEMPERATURE = 25 # setting ideal temperature criteria

# Joining localization and weather data

summary_df = geolocs_df.merge(weather_df, on = place_id)

# Basic processings to make data more intelligible

summary_df['feels_like.day'] = np.round(summary_df['feels_like.day']).astype(int)
summary_df['pop'] = np.round(summary_df['pop']*100).astype(int)

# Construction of the score to base the ranking on

summary_df['temp_delta'] = np.abs(summary_df['feels_like.day'] - IDEAL_TEMPERATURE) # Difference (absolute) between felt temperature and ideal temperature

score_df = pd.DataFrame(
    MinMaxScaler().fit_transform(summary_df.iloc[:,len(summary_df.columns)-len(criteria):]),
    index = summary_df.index,
    columns = ['pop_scaled', 'temp_delta_scaled']
)
score_df['score_to_minimize'] = score_df['pop_scaled'] + score_df['temp_delta_scaled']

summary_df = pd.concat([summary_df, score_df], axis = 1).sort_values('score_to_minimize', ascending = True).reset_index(drop = 'True')
summary_df['ranking'] = summary_df.index + 1
summary_df


Unnamed: 0,nominatim_place_id,city,lat,lon,feels_like.day,pop,temp_delta,pop_scaled,temp_delta_scaled,score_to_minimize,ranking
0,156094680,Mont Saint Michel,48.6359541,-1.511459954959514,27,0,2,0.0,0.125,0.125,1
1,297756747,St Malo,48.649518,-2.0260409,23,0,2,0.0,0.125,0.125,2
2,297981358,Bayeux,49.2764624,-0.7024738,23,0,2,0.0,0.125,0.125,3
3,298137491,Le Havre,49.4938975,0.1079732,23,1,2,0.016129,0.125,0.141129,4
4,297472400,Lille,50.6365654,3.0635282,26,9,1,0.145161,0.0,0.145161,5
5,120791766,Chateau du Haut Koenigsbourg,48.249489800000006,7.34429620253195,27,10,2,0.16129,0.125,0.28629,6
6,297534793,Amiens,49.8941708,2.2956951,27,13,2,0.209677,0.125,0.334677,7
7,297518815,Rouen,49.4404591,1.0939658,28,6,3,0.096774,0.25,0.346774,8
8,297653650,La Rochelle,46.1591126,-1.1520434,28,13,3,0.209677,0.25,0.459677,9
9,298011281,Saintes Maries de la mer,43.4522771,4.4287172,29,9,4,0.145161,0.375,0.520161,10


In [192]:
summary_df.to_csv('weather_data.csv', index = False) # storing the data in a csv