# Weather data collector

In [1]:
import numpy as np
import pandas as pd
import requests
import time
import os
import logging
from datetime import datetime
from pprint import pprint
from sklearn.preprocessing import MinMaxScaler

# Use it if you load your OpenWeatherMap API key from a .env file :
from dotenv import load_dotenv
load_dotenv() # set the environment variables from .env file

pd.options.display.max_columns = 100

## Goal and perimeter

**Starting from the list of top 35 cities to visit in France given [here](https://one-week-in.com/35-cities-to-visit-in-france/), we'll collect weather data to obtain a ranking of these cities, and store it into a csv file for later use in hotel recommendations.**

In [2]:
cities = ["Mont Saint Michel",
"St Malo",
"Bayeux",
"Le Havre",
"Rouen",
"Paris",
"Amiens",
"Lille",
"Strasbourg",
"Chateau du Haut Koenigsbourg",
"Colmar",
"Eguisheim",
"Besancon",
"Dijon",
"Annecy",
"Grenoble",
"Lyon",
"Gorges du Verdon",
"Bormes les Mimosas",
"Cassis",
"Marseille",
"Aix en Provence",
"Avignon",
"Uzes",
"Nimes",
"Aigues Mortes",
"Saintes Maries de la mer",
"Collioure",
"Carcassonne",
"Ariege",
"Toulouse",
"Montauban",
"Biarritz",
"Bayonne",
"La Rochelle"]

## 1. Collect geolocalization data from [Nominatim/OpenStreetMap API](https://nominatim.org/)

_Note: we used the free-form query `q=<query>` instead of the more specific query `city=<city>` because of some elements of our "cities" list that are not actually cities (like "Gorges du Verdon")_  

### 1.2. Generalization to the cities list

In [3]:
# Collect geoloc jsons

geoloc_jsons = []

print("Collecting localization data for ")
for city in cities:
    r = requests.get('https://nominatim.openstreetmap.org/search?format=json&q=France,'+city) # We add 'France' in query to be sure to be specific enough 
                                                                                              # and avoid ambiguity for cities with the same name 
                                                                                              # (ex: Paris, France and Paris, Arkansas)
    city_geoloc_json = r.json()[0] # We consider the first search result as the most relevant one
    city_geoloc_json['city'] = city # Adding a key with the name used in the initial cities list
    geoloc_jsons.append(city_geoloc_json)
    print(city, end="...")
    time.sleep(1) # wait 1sec between requests to respect Nominatim's Usage Policy (https://operations.osmfoundation.org/policies/nominatim/)
print()
print(f"Successfully collected localization data for {len(geoloc_jsons)} cities")

Collecting localization data for 
Mont Saint Michel...St Malo...Bayeux...Le Havre...Rouen...Paris...Amiens...Lille...Strasbourg...Chateau du Haut Koenigsbourg...Colmar...Eguisheim...Besancon...Dijon...Annecy...Grenoble...Lyon...Gorges du Verdon...Bormes les Mimosas...Cassis...Marseille...Aix en Provence...Avignon...Uzes...Nimes...Aigues Mortes...Saintes Maries de la mer...Collioure...Carcassonne...Ariege...Toulouse...Montauban...Biarritz...Bayonne...La Rochelle...
Successfully collected localization data for 35 cities


In [4]:
pprint(geoloc_jsons[:3])

[{'boundingbox': ['48.6349172', '48.637031', '-1.5133292', '-1.5094796'],
  'city': 'Mont Saint Michel',
  'class': 'tourism',
  'display_name': 'Mont Saint-Michel, Plateforme du Saut-\xadGaultier, Le '
                  'Mont-Saint-Michel, Avranches, Manche, Normandie, France '
                  'métropolitaine, 50170, France',
  'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_point_of_interest.p.20.png',
  'importance': 0.8654365567815739,
  'lat': '48.6359541',
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. '
             'https://osm.org/copyright',
  'lon': '-1.511459954959514',
  'osm_id': 211285890,
  'osm_type': 'way',
  'place_id': 156094680,
  'type': 'attraction'},
 {'boundingbox': ['48.5979853', '48.6949736', '-2.0765246', '-1.9367259'],
  'city': 'St Malo',
  'class': 'boundary',
  'display_name': 'Saint-Malo, Ille-et-Vilaine, Bretagne, France '
                  'métropolitaine, 35400, France',
  'icon': 'https://nominatim.openstreetmap.org/ui/map

In [5]:
pd.DataFrame(geoloc_jsons).loc[:,['city','display_name']].T # check for each city that the response corresponds to what expected

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
city,Mont Saint Michel,St Malo,Bayeux,Le Havre,Rouen,Paris,Amiens,Lille,Strasbourg,Chateau du Haut Koenigsbourg,Colmar,Eguisheim,Besancon,Dijon,Annecy,Grenoble,Lyon,Gorges du Verdon,Bormes les Mimosas,Cassis,Marseille,Aix en Provence,Avignon,Uzes,Nimes,Aigues Mortes,Saintes Maries de la mer,Collioure,Carcassonne,Ariege,Toulouse,Montauban,Biarritz,Bayonne,La Rochelle
display_name,"Mont Saint-Michel, Plateforme du Saut-­Gaultie...","Saint-Malo, Ille-et-Vilaine, Bretagne, France ...","Bayeux, Calvados, Normandie, France métropolit...","Le Havre, Seine-Maritime, Normandie, France mé...","Rouen, Seine-Maritime, Normandie, France métro...","Paris, Île-de-France, France métropolitaine, F...","Amiens, Somme, Hauts-de-France, France métropo...","Lille, Nord, Hauts-de-France, France métropoli...","Strasbourg, Bas-Rhin, Grand Est, France métrop...","Château du Haut-Kœnigsbourg, Chemin fermé suit...","Colmar, Colmar-Ribeauvillé, Haut-Rhin, Grand E...","Eguisheim, Colmar-Ribeauvillé, Haut-Rhin, Gran...","Besançon, Doubs, Bourgogne-Franche-Comté, Fran...","Dijon, Côte-d'Or, Bourgogne-Franche-Comté, Fra...","Annecy, Haute-Savoie, Auvergne-Rhône-Alpes, Fr...","Grenoble, Isère, Auvergne-Rhône-Alpes, France ...","Lyon, Métropole de Lyon, Circonscription dépar...","Gorges du Verdon, Route des Crêtes, Les Ferrai...","Bormes-les-Mimosas, Toulon, Var, Provence-Alpe...","Cassis, Marseille, Bouches-du-Rhône, Provence-...","Marseille, Bouches-du-Rhône, Provence-Alpes-Cô...","Aix-en-Provence, Bouches-du-Rhône, Provence-Al...","Avignon, Vaucluse, Provence-Alpes-Côte d'Azur,...","Uzès, Nîmes, Gard, Occitanie, France métropoli...","Nîmes, Gard, Occitanie, France métropolitaine,...","Aigues-Mortes, Nîmes, Gard, Occitanie, France ...","Saintes-Maries-de-la-Mer, Arles, Bouches-du-Rh...","Collioure, Céret, Pyrénées-Orientales, Occitan...","Carcassonne, Aude, Occitanie, France métropoli...","Ariège, Occitanie, France métropolitaine, France","Toulouse, Haute-Garonne, Occitanie, France mét...","Montauban, Tarn-et-Garonne, Occitanie, France ...","Biarritz, Bayonne, Pyrénées-Atlantiques, Nouve...","Bayonne, Pyrénées-Atlantiques, Nouvelle-Aquita...","La Rochelle, Charente-Maritime, Nouvelle-Aquit..."


In [6]:
# Format the data into a cleaned Pandas dataframe

geoloc_df = pd.DataFrame(geoloc_jsons).drop(columns = ['licence','osm_type','osm_id', 'boundingbox', 'importance', 'display_name', 'icon', 'class', 'type']) # keep only relevant info
geoloc_df = geoloc_df.rename(columns = {'place_id':'nominatim_place_id'}) # we will keep the nominatim place id as unique identifier for each city troughout the rest of this project
geoloc_df.insert(1, 'city', geoloc_df.pop('city'))
display(geoloc_df)

Unnamed: 0,nominatim_place_id,city,lat,lon
0,156094680,Mont Saint Michel,48.6359541,-1.511459954959514
1,297756747,St Malo,48.649518,-2.0260409
2,297981358,Bayeux,49.2764624,-0.7024738
3,298137491,Le Havre,49.4938975,0.1079732
4,297518815,Rouen,49.4404591,1.0939658
5,297417241,Paris,48.8588897,2.3200410217200766
6,297534793,Amiens,49.8941708,2.2956951
7,297472400,Lille,50.6365654,3.0635282
8,297508568,Strasbourg,48.584614,7.7507127
9,120791766,Chateau du Haut Koenigsbourg,48.249489800000006,7.34429620253195


In [7]:
place_identifier = 'nominatim_place_id' # store the name of the cities' unique identifier column for later use as key for following dataframes

## 2. Collect weather data from [OpenWeatherMap API](https://openweathermap.org/api/one-call-api)

In [8]:
# Get OpenWeatherMap API key loaded in the environment variables

API_KEY = os.getenv('OWM_API_KEY') # <- replace with your own key (obtained after free subscription)

# Collect jsons

weather_jsons = []

UNITS = 'metric'
print("Collecting localization data for ")
for i, city  in enumerate(cities):    
    lat, lon = (geoloc_df[_][i] for _ in ['lat','lon'])
    r = requests.get(f'https://api.openweathermap.org/data/2.5/onecall?lat={lat}&lon={lon}&appid={API_KEY}&units={UNITS}')
    city_weather_json = r.json()
    city_weather_json[place_identifier]=geoloc_df[place_identifier][i] # Add city unique identifier in json
    weather_jsons.append(city_weather_json)
    print(city, end="...")
print()
print(f"Successfully collected weather data for {len(weather_jsons)} cities")

Collecting localization data for 
Mont Saint Michel...St Malo...Bayeux...Le Havre...Rouen...Paris...Amiens...Lille...Strasbourg...Chateau du Haut Koenigsbourg...Colmar...Eguisheim...Besancon...Dijon...Annecy...Grenoble...Lyon...Gorges du Verdon...Bormes les Mimosas...Cassis...Marseille...Aix en Provence...Avignon...Uzes...Nimes...Aigues Mortes...Saintes Maries de la mer...Collioure...Carcassonne...Ariege...Toulouse...Montauban...Biarritz...Bayonne...La Rochelle...
Successfully collected weather data for 35 cities


### 2.1. Data structure exploration and ranking criteria definition

**Each city weather json has quite nested data => we will first explore the data structure on one city sample to find the criteria to base our ranking on.**

In [9]:
weather_json_0 = weather_jsons[0]
print(weather_json_0.keys())
pprint(weather_json_0)

dict_keys(['lat', 'lon', 'timezone', 'timezone_offset', 'current', 'minutely', 'hourly', 'daily', 'nominatim_place_id'])
{'current': {'clouds': 2,
             'dew_point': 1.97,
             'dt': 1659784853,
             'feels_like': 21.36,
             'humidity': 26,
             'pressure': 1026,
             'sunrise': 1659761196,
             'sunset': 1659814639,
             'temp': 22.39,
             'uvi': 6.02,
             'visibility': 10000,
             'weather': [{'description': 'clear sky',
                          'icon': '01d',
                          'id': 800,
                          'main': 'Clear'}],
             'wind_deg': 43,
             'wind_gust': 5.3,
             'wind_speed': 4.08},
 'daily': [{'clouds': 4,
            'dew_point': 2.44,
            'dt': 1659787200,
            'feels_like': {'day': 21.98,
                           'eve': 22.6,
                           'morn': 12.48,
                           'night': 14.79},
            '

We decide to focus on 'daily' informations to get a **one-week forecast** of weather informations (from the day D when we made the API request to the day D+7).  

In [10]:
print("Number of days for which weather data is collected:", len(weather_json_0['daily']))

Number of days for which weather data is collected: 8


In [11]:
# Test with the 'current' datetime (corresponding to the time at which we requested the OpenWeatherMap API)

# The dates ('dt') are in UNIX (=POSIX) format (number of seconds since 1st January 1970).
# To convert it to datetime format we can use datetime.fromtimestamp method. 
# Note: the times are given in UTC timezone
current_datetime_example = datetime.fromtimestamp(weather_json_0['current']['dt'])
first_datetime_example = datetime.fromtimestamp(weather_json_0['daily'][0]['dt'])
last_datetime_example = datetime.fromtimestamp(weather_json_0['daily'][-1]['dt'])
print("Datetime of the API request: ", current_datetime_example)
print("First datetime covered for daily info: ", first_datetime_example)
print("Last datetime covered for daily info: ", last_datetime_example)

Datetime of the API request:  2022-08-06 13:20:53
First datetime covered for daily info:  2022-08-06 14:00:00
Last datetime covered for daily info:  2022-08-13 14:00:00


In [12]:
print("Daily informations available: \n", list(weather_json_0['daily'][0].keys()))

Daily informations available: 
 ['dt', 'sunrise', 'sunset', 'moonrise', 'moonset', 'moon_phase', 'temp', 'feels_like', 'pressure', 'humidity', 'dew_point', 'wind_speed', 'wind_deg', 'wind_gust', 'weather', 'clouds', 'pop', 'uvi']


**To get a ranking of the cities with the most pleasant weather in the upcoming days, we choose to focus on these criteria:** 
- **lowest difference (absolute) between mean of the perceived temperature (`'feels_like.day'`) and an ideal temperature we choose later**
- **lowest mean percentage of precipitations (`'pop'`)**

In [13]:
# Store filtered weather data in a Dataframe

criteria = {
    'feels_like.day': np.mean,
    'pop': np.mean
} # dictionnary associating each criteria with its aggregation function to apply along days when grouping by city
keys_to_keep = [place_identifier] + list(criteria.keys())

weather_df_0 = pd.json_normalize(weather_json_0, record_path = ['daily'], meta = [place_identifier]).loc[:,keys_to_keep]
weather_df_0

Unnamed: 0,nominatim_place_id,feels_like.day,pop
0,156094680,21.98,0
1,156094680,26.34,0
2,156094680,27.51,0
3,156094680,28.72,0
4,156094680,29.57,0
5,156094680,31.74,0
6,156094680,34.59,0
7,156094680,35.63,0


In [14]:
# aggregate data to obtain one single observation by city

weather_df_0 = weather_df_0.groupby(place_identifier).agg(criteria)
weather_df_0

Unnamed: 0_level_0,feels_like.day,pop
nominatim_place_id,Unnamed: 1_level_1,Unnamed: 2_level_1
156094680,29.51,0.0


In [15]:
# summarize localization and weather info in one dataframe

summary_df_0 = geoloc_df.merge(weather_df_0, on = [place_identifier])
summary_df_0

Unnamed: 0,nominatim_place_id,city,lat,lon,feels_like.day,pop
0,156094680,Mont Saint Michel,48.6359541,-1.511459954959514,29.51,0.0


### 2.2 Aggregate data for all cities 

In [16]:
# Generalize the previous steps to obtain a unique dataframe summarizing localization and weather data we need to build our cities ranking

weather_df = pd.json_normalize(weather_jsons, record_path = ['daily'], meta = [place_identifier]).loc[:,keys_to_keep]
weather_df = weather_df.groupby(place_identifier).agg(criteria)

weather_df

Unnamed: 0_level_0,feels_like.day,pop
nominatim_place_id,Unnamed: 1_level_1,Unnamed: 2_level_1
76036307,28.42125,0.24125
120791766,26.0825,0.0525
156094680,29.51,0.0
297389050,26.71875,0.33125
297417241,29.29,0.0
297466626,33.87875,0.1475
297472400,27.8275,0.0
297478568,30.42875,0.00625
297504747,29.185,0.07
297508568,29.795,0.03625


## 3. Final processings and data storage

We now process the dataframe in order to set an automatical ranking from the criteria chosen in the previous part.  
**To take into account the temperature criteria as well as the rain criteria, with an equal importance between the 2, we will scale these data and compute a weather score between 0 and 100 (the bigger the better the weather is)** 

In [17]:
IDEAL_TEMPERATURE = 25 # set ideal temperature criteria

# Merge localization and weather data

summary_df = geoloc_df.merge(weather_df, on = place_identifier)

# Basic processings to make data more intelligible

summary_df['feels_like.day'] = np.round(summary_df['feels_like.day']).astype(int)
summary_df['pop'] = np.round(summary_df['pop']*100).astype(int)

# Construction of the score to base the ranking on

summary_df['temp_delta'] = np.abs(summary_df['feels_like.day'] - IDEAL_TEMPERATURE) # Difference (absolute) between felt temperature and ideal temperature

scaled_criteria_df = pd.DataFrame(
    MinMaxScaler().fit_transform(summary_df.iloc[:,len(summary_df.columns)-len(criteria):]),
    index = summary_df.index,
    columns = ['pop_scaled', 'temp_delta_scaled']
)

combined_criteria = scaled_criteria_df['pop_scaled'] + scaled_criteria_df['temp_delta_scaled'] # combining the 2 criteria to minimize
weather_score = np.round((1 - MinMaxScaler().fit_transform(combined_criteria.values.reshape(-1, 1))) * 100,1) # computing a score between 0 and 100 (the bigger the better the weather is)

summary_df['weather_score'] = weather_score
summary_df = summary_df.sort_values('weather_score', ascending = False).reset_index(drop = 'True')
summary_df['ranking'] = summary_df.index + 1
summary_df

Unnamed: 0,nominatim_place_id,city,lat,lon,feels_like.day,pop,temp_delta,weather_score,ranking
0,297981358,Bayeux,49.2764624,-0.7024738,25,0,0,100.0,1
1,298137491,Le Havre,49.4938975,0.1079732,25,0,0,100.0,2
2,297756747,St Malo,48.649518,-2.0260409,26,0,1,92.4,3
3,120791766,Chateau du Haut Koenigsbourg,48.249489800000006,7.34429620253195,26,5,1,81.9,4
4,297653650,La Rochelle,46.1591126,-1.1520434,28,0,3,77.1,5
5,297472400,Lille,50.6365654,3.0635282,28,0,3,77.1,6
6,297417241,Paris,48.8588897,2.3200410217200766,29,0,4,69.4,7
7,297534793,Amiens,49.8941708,2.2956951,29,0,4,69.4,8
8,298516909,Biarritz,43.47114375,-1.552726590666314,27,9,2,66.0,9
9,297668227,Besancon,47.2380222,6.0243622,29,2,4,65.3,10


In [18]:
# Saving csv file in 'data' directory in local
dir_name = 'data'
file_name = 'weather_data.csv'
file_path = f"{dir_name}/{file_name}"
if dir_name not in os.listdir(): # Create 'data' directory if not already existing
        os.mkdir(dir_name)
summary_df.to_csv(f"{dir_name}/{file_name}", index = False) # storing the data in a csv in local folder to re-use it in the hotels info collector notebook