In [None]:
import pandas as pd
import numpy as np

# Download daily weather information from Meteostat

NOTE: set the identifier of the desired weather station in the configuration cell below. Station identifiers can be found at https://meteostat.net/.

EXAMPLE: to download historical daily data for the city of Paris, we can use the station with ID 07156, which corresponds to the Paris-Montsouris weather station.

In [None]:
# Column layout expected in a Meteostat daily weather CSV. The list below is
# passed as `names=` when the CSV is read, so its order must match the file.
#
#  #  name  description                                    type
#  1  date  The date string (format: YYYY-MM-DD)           String
#  2  tavg  The average air temperature in °C              Float
#  3  tmin  The minimum air temperature in °C              Float
#  4  tmax  The maximum air temperature in °C              Float
#  5  prcp  The daily precipitation total in mm            Float
#  6  snow  The maximum snow depth in mm                   Integer
#  7  wdir  The average wind direction in degrees (°)      Integer
#  8  wspd  The average wind speed in km/h                 Float
#  9  wpgt  The peak wind gust in km/h                     Float
# 10  pres  The average sea-level air pressure in hPa      Float
# 11  tsun  The daily sunshine total in minutes (m)        Integer
#
# More info on the daily endpoint: https://dev.meteostat.net/bulk/daily.html#endpoints
# More info on the data formats used in the CSVs: https://dev.meteostat.net/formats.html
list_columns = [
    'date', 'tavg', 'tmin', 'tmax', 'prcp', 'snow',
    'wdir', 'wspd', 'wpgt', 'pres', 'tsun',
]


# Registry of known stations, keyed by a human-readable station label.
# 'ID' is kept as a string because identifiers may start with a zero.
dic_stations = {
    "Paris Le Bourget": {'ID': '07150', 'name': 'Paris'},
    "Paris Charles de Gaulle": {'ID': '07157', 'name': 'Paris'},
    "New York John F. Kennedy Airport": {'ID': '74486', 'name': "New York"},
}


# Station whose data is downloaded, and the file the processed table is written to.
station = dic_stations["New York John F. Kennedy Airport"]
name_file_output = "weather_ny.parquet"

In [None]:
# Download the gzip-compressed daily CSV of the selected station from the
# Meteostat bulk endpoint. Column names are supplied explicitly via `names=`.
meteo_url = f"https://bulk.meteostat.net/v2/daily/{station['ID']}.csv.gz"
meteo = pd.read_csv(meteo_url, names=list_columns)

# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would only append a stray "None" to the cell output.
meteo.info()
display(meteo)

### Remove weather records that occur before a given cutoff_date

In [None]:
# Keep only the records dated on or after the cutoff. The comparison is done
# against the 'date' column as loaded (YYYY-MM-DD strings unless parsed
# upstream), for which lexicographic order matches chronological order.
cutoff_date = '1990-01-01'
is_on_or_after_cutoff = meteo['date'] >= cutoff_date

# .copy() makes the filtered frame independent of the full download, so later
# column assignments operate on an owned frame rather than on a selection.
meteo = meteo.loc[is_on_or_after_cutoff, :].copy()

display(meteo)
# info() prints its report itself and returns None; no display() wrapper needed.
meteo.info()

### Select the columns of interest

In [None]:
# Keep the columns of interest and fill their gaps. .assign() returns a new
# frame, so nothing is written into a selection of the previous one.
meteo = (
    meteo
    .loc[:, ['date', 'tavg', 'prcp']]  # Select the columns of interest.
    .assign(
        # Interpolate the avg temperature when it's missing.
        tavg=lambda frame: frame['tavg'].interpolate(),
        # Assume it didn't rain when the precipitation value is missing.
        prcp=lambda frame: frame['prcp'].fillna(0),
    )
)

# info() prints its report itself and returns None; no display() wrapper needed.
meteo.info()
display(meteo)

### Determine the overall weather conditions based on the precipitation (in mm) that has fallen on a given day

In [None]:
# Label each day by its precipitation total (mm). np.select picks the FIRST
# matching condition, so the tests are ordered from driest to wettest and any
# day that matches none of them falls through to 'violent rain'.
#
# Fix: previously every row was unconditionally overwritten with 'heavy rain'
# right after being initialised to 'violent rain', so 'violent rain' could
# never be produced. An upper bound for 'heavy rain' is now applied.
# NOTE(review): the 50 mm bound follows the common light/moderate/heavy/violent
# rain scale (2.5 / 7.6 / 50) — confirm it is the intended threshold here.
prcp_mm = meteo['prcp']
meteo['conditions'] = np.select(
    [prcp_mm == 0, prcp_mm < 2.5, prcp_mm < 7.6, prcp_mm < 50],
    ['sunny', 'light rain', 'moderate rain', 'heavy rain'],
    default='violent rain',
)

display(meteo['conditions'].value_counts())
# info() prints its report itself and returns None; no display() wrapper needed.
meteo.info()
display(meteo)

### Prepare the processed dataframe for storage

In [None]:
# Shape the table for storage in one chain instead of three in-place mutations:
# drop the raw precipitation column, switch to the output column names, and
# renumber the rows from 0 (the filtered frame still carries its old index).
meteo = (
    meteo
    .drop(columns='prcp')
    .rename(columns={'date': 'DATE', 'tavg': 'TAVG_C', 'conditions': 'DESCRIPTION'})
    .reset_index(drop=True)
)

# info() prints its report itself and returns None; no display() wrapper needed.
meteo.info()
display(meteo)

# Persist the processed table to the configured output file.
meteo.to_parquet(name_file_output)