# Open Data: extraction and transformation

In [2]:
import requests
import requests_cache
from retry_requests import retry
import pandas as pd
from datetime import datetime
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

In [3]:
path = "OPEN_DATA_PATH"

## Weather data from Open-Meteo

https://open-meteo.com

https://pypi.org/project/openmeteo-requests/

In [3]:
import openmeteo_requests

In [4]:
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

Define the latitude and longitude provided by Habitat

In [5]:
latitude = 40.7
longitude = -3.5

In [6]:
def get_weather_df(url, params):
    
    responses = openmeteo.weather_api(url, params=params)
    
    response = responses[0]
    print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
    
    # Process daily data. The order of variables needs to be the same as requested.
    daily = response.Daily()
    daily_weather_code = daily.Variables(0).ValuesAsNumpy()
    daily_temperature_2m_max = daily.Variables(1).ValuesAsNumpy()
    daily_temperature_2m_min = daily.Variables(2).ValuesAsNumpy()
    daily_precipitation_sum = daily.Variables(3).ValuesAsNumpy()
    
    daily_data = {"date": pd.date_range(
    	start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
    	end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
    	freq = pd.Timedelta(seconds = daily.Interval()),
    	inclusive = "left"
    ), "weather_code": daily_weather_code, "temperature_2m_max": daily_temperature_2m_max,
        "temperature_2m_min": daily_temperature_2m_min, "precipitation_sum": daily_precipitation_sum}
    
    df = pd.DataFrame(data = daily_data)
    return df


### Past forecast weather data

https://open-meteo.com/en/docs/historical-forecast-api

In [7]:
url = "https://historical-forecast-api.open-meteo.com/v1/forecast"

# define desired interval of days
start_date = "2022-01-01"
end_date = datetime.now().strftime('%Y-%m-%d')

params = {
	"latitude": latitude,
	"longitude": longitude,
	"start_date": start_date,
	"end_date": end_date,
	"daily": ["weather_code", "temperature_2m_max", "temperature_2m_min", "precipitation_sum"]
}

past_df = get_weather_df(url,params)

past_df

Coordinates 40.70000076293945°N -3.5°E


Unnamed: 0,date,weather_code,temperature_2m_max,temperature_2m_min,precipitation_sum
0,2022-01-01 00:00:00+00:00,3.0,15.602500,6.5025,0.0
1,2022-01-02 00:00:00+00:00,3.0,14.302500,6.8025,0.0
2,2022-01-03 00:00:00+00:00,3.0,13.102500,5.6525,0.0
3,2022-01-04 00:00:00+00:00,80.0,11.752500,4.6525,2.7
4,2022-01-05 00:00:00+00:00,51.0,7.952500,3.0025,0.6
...,...,...,...,...,...
838,2024-04-18 00:00:00+00:00,3.0,20.775999,6.5760,0.0
839,2024-04-19 00:00:00+00:00,3.0,20.775999,4.5260,0.0
840,2024-04-20 00:00:00+00:00,3.0,23.375999,7.8760,0.0
841,2024-04-21 00:00:00+00:00,3.0,21.825998,8.4260,0.0


##### Data transformation

In [8]:
# Change date format
past_df['date'] = pd.to_datetime(past_df['date']).dt.strftime('%Y-%m-%d')
# Save results
past_df.to_csv(f"{path}pastWeather{start_date.replace('-','')}_{end_date.replace('-','')}.csv", index=False)

### Future forecast weather data

https://open-meteo.com/en/docs/

In [9]:
url = "https://api.open-meteo.com/v1/forecast"

params = {
    "latitude": latitude,
    "longitude": longitude,
    "daily": ["weather_code", "temperature_2m_max", "temperature_2m_min", "precipitation_sum"],
    "past_days": 0,
    "forecast_days": 16
}

forecast_df = get_weather_df(url,params)

forecast_df

Coordinates 40.70000076293945°N -3.5°E


Unnamed: 0,date,weather_code,temperature_2m_max,temperature_2m_min,precipitation_sum
0,2024-04-22 00:00:00+00:00,3.0,19.675999,6.676,0.0
1,2024-04-23 00:00:00+00:00,3.0,17.075998,2.526,0.0
2,2024-04-24 00:00:00+00:00,3.0,20.58,2.976,0.0
3,2024-04-25 00:00:00+00:00,51.0,15.28,9.68,0.8
4,2024-04-26 00:00:00+00:00,3.0,16.068001,5.368,0.0
5,2024-04-27 00:00:00+00:00,80.0,12.818001,6.918,4.5
6,2024-04-28 00:00:00+00:00,3.0,14.218,3.268,0.0
7,2024-04-29 00:00:00+00:00,3.0,16.602499,2.468,0.0
8,2024-04-30 00:00:00+00:00,1.0,18.8025,5.0525,0.0
9,2024-05-01 00:00:00+00:00,3.0,21.602499,5.9025,0.0


##### Data transformation

In [10]:
# Change date format
forecast_df['date'] = pd.to_datetime(forecast_df['date']).dt.strftime('%Y-%m-%d')
# Save results
forecast_df.to_csv(f"{path}forecastWeather{datetime.now().strftime('%Y%m%d')}.csv", index=False)

## Holiday Calendar from Comunidad de Madrid

https://datos.comunidad.madrid/catalogo/?q=festivos&sort=score+desc%2C+metadata_modified+desc

#### Transformation of Regional data

In [6]:
loc = pd.read_csv(path+"festivos_locales_historicos.csv", sep=';', encoding = 'latin-1')
loc

Unnamed: 0,año,municipio_codigo,municipio_nombre,entidad_codigo,entidad_nombre,fecha_festivo
0,1998,2,Ajalvir,0,Ajalvir,1998-02-03
1,1998,2,Ajalvir,0,Ajalvir,1998-05-15
2,1998,3,Alameda del Valle,0,Alameda del Valle,1998-02-02
3,1998,3,Alameda del Valle,0,Alameda del Valle,1998-09-08
4,1998,5,Alcalá de Henares,0,Alcalá de Henares,1998-08-06
...,...,...,...,...,...,...
9383,2023,181,Villaviciosa de Odón,0,Villaviciosa de Odón,2023-09-18
9384,2023,182,Villavieja del Lozoya,0,Villavieja del Lozoya,2023-08-21
9385,2023,182,Villavieja del Lozoya,0,Villavieja del Lozoya,2023-12-07
9386,2023,183,Zarzalejo,0,Zarzalejo,2023-06-29


In [4]:
reg = pd.read_csv(path+"festivos_regionales_historicos.csv", sep=';', encoding = 'latin-1')
reg24 = pd.read_csv(path+"festivos_regionales.csv", sep=';', encoding = 'latin-1')

# Keep only regional data from 2020 and ahead
reg = reg[reg['año'] >= 2022]
reg

Unnamed: 0,año,fecha_festivo,festividad
309,2022,2022-01-01,Año Nuevo
310,2022,2022-01-06,Epifanía del Señor
311,2022,2022-04-14,Jueves Santo
312,2022,2022-04-15,Viernes Santo
313,2022,2022-05-02,Fiesta Comunidad de Madrid
314,2022,2022-07-25,Santiago Apóstol
315,2022,2022-08-15,Asunción de la Virgen
316,2022,2022-10-12,Fiesta Nacional de España
317,2022,2022-11-01,Todos los Santos
318,2022,2022-12-06,Día de la Constitución Española


#### Transformation of Municipal data

In [12]:
loc = pd.read_csv(path+"festivos_locales_historicos.csv", sep=';', encoding = 'latin-1')
loc24 = pd.read_csv(path+"festivos_locales.csv", sep=';', encoding = 'latin-1')

# Keep only local data from Madrid and from 2020 or ahead
loc = loc[(loc['año'] >= 2020) & (loc['municipio_nombre'] == 'Madrid')]
loc24 = loc24[loc24['municipio_nombre'] == 'Madrid']
loc

Unnamed: 0,año,municipio_codigo,municipio_nombre,entidad_codigo,entidad_nombre,fecha_festivo
8142,2020,79,Madrid,0,Madrid,2020-05-15
8143,2020,79,Madrid,0,Madrid,2020-11-09
8494,2021,79,Madrid,0,Madrid,2021-05-15
8495,2021,79,Madrid,0,Madrid,2021-11-09
8846,2022,79,Madrid,0,Madrid,2022-05-16
8847,2022,79,Madrid,0,Madrid,2022-11-09
9200,2023,79,Madrid,0,Madrid,2023-05-15
9201,2023,79,Madrid,0,Madrid,2023-11-09


#### Merge holiday calendar data

In [24]:
all_holiday = pd.concat([reg,reg24,loc,loc24], ignore_index=True)

# create date id
all_holiday['holiday_id'] = all_holiday['fecha_festivo'].apply(lambda x: int(x.replace('-', '')))
# all_holiday.to_csv(f"{path}AllHoliday_df.csv", index=False)
all_holiday.sort_values(by='fecha_festivo').reset_index(drop=True)

Unnamed: 0,año,fecha_festivo,festividad,municipio_codigo,municipio_nombre,entidad_codigo,entidad_nombre,holiday_id
0,2020,2020-01-01,Año Nuevo,,,,,20200101
1,2020,2020-01-06,Epifanía del Señor,,,,,20200106
2,2020,2020-04-09,Jueves Santo,,,,,20200409
3,2020,2020-04-10,Viernes Santo,,,,,20200410
4,2020,2020-05-01,Fiesta del Trabajo,,,,,20200501
5,2020,2020-05-02,Fiesta de la Comunidad de Madrid,,,,,20200502
6,2020,2020-05-15,,79.0,Madrid,0.0,Madrid,20200515
7,2020,2020-08-15,Asunción de la Virgen,,,,,20200815
8,2020,2020-10-12,Fiesta Nacional de España,,,,,20201012
9,2020,2020-11-02,traslado Todos los Santos,,,,,20201102


## Calendar dataset

In [34]:
import datetime as dt
calendar_df = pd.DataFrame({"date": pd.date_range('2022-01-01', '2024-12-31')})
calendar_df["day_month"] = calendar_df.date.dt.day
calendar_df["month"] = calendar_df.date.dt.month
calendar_df["year"] = calendar_df.date.dt.year
calendar_df["day_week"] = calendar_df.date.dt.day_name()
calendar_df["id_date"] = calendar_df.date.dt.strftime('%Y%m%d').astype('int')
calendar_df["date"] = calendar_df["date"].dt.strftime('%Y-%m-%d')
calendar_df

Unnamed: 0,date,day_month,month,year,day_week,id_date
0,2022-01-01,1,1,2022,Saturday,20220101
1,2022-01-02,2,1,2022,Sunday,20220102
2,2022-01-03,3,1,2022,Monday,20220103
3,2022-01-04,4,1,2022,Tuesday,20220104
4,2022-01-05,5,1,2022,Wednesday,20220105
...,...,...,...,...,...,...
1091,2024-12-27,27,12,2024,Friday,20241227
1092,2024-12-28,28,12,2024,Saturday,20241228
1093,2024-12-29,29,12,2024,Sunday,20241229
1094,2024-12-30,30,12,2024,Monday,20241230


#### Merge holiday data

In [35]:
all_holiday = pd.read_csv(path+"AllHoliday_df.csv")
merged_df = pd.merge(calendar_df, all_holiday['holiday_id'], left_on='id_date', right_on='holiday_id', how='left')

# Create flag holiday
merged_df['flg_holiday'] = (merged_df['holiday_id'].notnull()) | (merged_df['day_week'].str.contains('Saturday|Sunday'))
# Convert boolean values to int
merged_df['flg_holiday'] = merged_df['flg_holiday'].astype(int)
merged_df.drop(columns=['holiday_id'], inplace=True)
merged_df

Unnamed: 0,date,day_month,month,year,day_week,id_date,flg_holiday
0,2022-01-01,1,1,2022,Saturday,20220101,1
1,2022-01-02,2,1,2022,Sunday,20220102,1
2,2022-01-03,3,1,2022,Monday,20220103,0
3,2022-01-04,4,1,2022,Tuesday,20220104,0
4,2022-01-05,5,1,2022,Wednesday,20220105,0
...,...,...,...,...,...,...,...
1091,2024-12-27,27,12,2024,Friday,20241227,0
1092,2024-12-28,28,12,2024,Saturday,20241228,1
1093,2024-12-29,29,12,2024,Sunday,20241229,1
1094,2024-12-30,30,12,2024,Monday,20241230,0


In [27]:
# Note: the calendar must be order for function shift
def get_day_type(row):
    if row['flg_holiday']:
        return 2
    elif merged_df['flg_holiday'].shift(1)[row.name]:  # Shift the entire 'holiday' column and access the value at the current row index
        return 1
    elif merged_df['flg_holiday'].shift(-1)[row.name]:
        return 1
    else:
        return 0

merged_df['day_type'] = merged_df.apply(get_day_type, axis=1)
merged_df

Unnamed: 0,date,day_month,month,year,day_week,id_date,flg_holiday,day_type
0,2022-01-01,1,1,2022,Saturday,20220101,1,2
1,2022-01-02,2,1,2022,Sunday,20220102,1,2
2,2022-01-03,3,1,2022,Monday,20220103,0,1
3,2022-01-04,4,1,2022,Tuesday,20220104,0,0
4,2022-01-05,5,1,2022,Wednesday,20220105,0,1
...,...,...,...,...,...,...,...,...
1091,2024-12-27,27,12,2024,Friday,20241227,0,1
1092,2024-12-28,28,12,2024,Saturday,20241228,1,2
1093,2024-12-29,29,12,2024,Sunday,20241229,1,2
1094,2024-12-30,30,12,2024,Monday,20241230,0,1


In [28]:
# Remove flg_holiday column
merged_df.drop(columns=['flg_holiday'], inplace=True)

# Map the weekday names to numerical values
merged_df['day_week'] = merged_df['day_week'].str.strip()
weekday_mapping = {'Monday': 1,'Tuesday': 2,'Wednesday': 3,'Thursday': 4,'Friday': 5,'Saturday': 6,'Sunday': 7}
merged_df['day_week'] = merged_df['day_week'].map(weekday_mapping)

merged_df

Unnamed: 0,date,day_month,month,year,day_week,id_date,day_type
0,2022-01-01,1,1,2022,6,20220101,2
1,2022-01-02,2,1,2022,7,20220102,2
2,2022-01-03,3,1,2022,1,20220103,1
3,2022-01-04,4,1,2022,2,20220104,0
4,2022-01-05,5,1,2022,3,20220105,1
...,...,...,...,...,...,...,...
1091,2024-12-27,27,12,2024,5,20241227,1
1092,2024-12-28,28,12,2024,6,20241228,2
1093,2024-12-29,29,12,2024,7,20241229,2
1094,2024-12-30,30,12,2024,1,20241230,1


#### Merge weather data

In [29]:
weather_df = pd.read_csv(path+"pastWeather20220101_20240410.csv")
weather_df

Unnamed: 0,date,weather_code,temperature_2m_max,temperature_2m_min,precipitation_sum
0,2022-01-01,3.0,15.602500,6.5025,0.0
1,2022-01-02,3.0,14.302500,6.8025,0.0
2,2022-01-03,3.0,13.102500,5.6525,0.0
3,2022-01-04,80.0,11.752500,4.6525,2.7
4,2022-01-05,51.0,7.952500,3.0025,0.6
...,...,...,...,...,...
826,2024-04-06,3.0,22.976000,10.7260,0.0
827,2024-04-07,51.0,21.226000,12.0760,0.4
828,2024-04-08,53.0,16.776000,7.1760,0.6
829,2024-04-09,0.0,14.976000,3.6260,0.0


In [30]:
merged_df2 = pd.merge(merged_df, weather_df, left_on='date', right_on='date', how='left')
# merged_df2.to_csv(f"{path}OpenData_CalendarAll_df.csv", index=False)
merged_df2

Unnamed: 0,date,day_month,month,year,day_week,id_date,day_type,weather_code,temperature_2m_max,temperature_2m_min,precipitation_sum
0,2022-01-01,1,1,2022,6,20220101,2,3.0,15.6025,6.5025,0.0
1,2022-01-02,2,1,2022,7,20220102,2,3.0,14.3025,6.8025,0.0
2,2022-01-03,3,1,2022,1,20220103,1,3.0,13.1025,5.6525,0.0
3,2022-01-04,4,1,2022,2,20220104,0,80.0,11.7525,4.6525,2.7
4,2022-01-05,5,1,2022,3,20220105,1,51.0,7.9525,3.0025,0.6
...,...,...,...,...,...,...,...,...,...,...,...
1091,2024-12-27,27,12,2024,5,20241227,1,,,,
1092,2024-12-28,28,12,2024,6,20241228,2,,,,
1093,2024-12-29,29,12,2024,7,20241229,2,,,,
1094,2024-12-30,30,12,2024,1,20241230,1,,,,
