In [1]:
import pandas as pd
from functions import *

Load the data

In [2]:
# Location of the Excel workbook to load.
file_path = 'C:/Users/HP/Desktop/Traineeship/data/reg_age.xlsx'

# Open the workbook through a context manager so the underlying file handle is
# closed as soon as the sheets are in memory (an ExcelFile left open keeps the
# file locked). `xls` stays bound afterwards, but refers to a closed workbook.
with pd.ExcelFile(file_path) as xls:
    # sheet_name=None reads every sheet and returns a dict keyed by sheet name.
    sheets = pd.read_excel(xls, sheet_name=None)

print(sheets.keys())

dict_keys(['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024'])


In [3]:
def _clean_counts(column):
    """Return one region column as numbers.

    Text cells have every '.' removed before parsing, i.e. dots are read as
    thousands separators (NOTE(review): confirm no text cell uses '.' as a
    decimal point). Cells that already arrived as numbers are passed through
    untouched: casting them to text first would turn a float such as 1234.0
    into '1234.0' and, once the dot is stripped, into 12340.
    Values that still cannot be parsed become NaN.
    """
    without_separators = column.map(
        lambda cell: cell.replace('.', '') if isinstance(cell, str) else cell
    )
    return pd.to_numeric(without_separators, errors='coerce')


def _age_indicators(raw, year):
    """Compute the per-region age indicators for one sheet.

    Parameters
    ----------
    raw : pandas.DataFrame
        One sheet as read from the workbook. Must contain a 'Fascia' column
        (used as the age value); an 'Italia' column, if present, is discarded;
        every other column is treated as a region.
    year : str
        Sheet name; stored as an integer in the 'year' column.

    Returns
    -------
    pandas.DataFrame
        One row per region with columns 'region', 'reg_age_avg' (mean of
        'Fascia' weighted by the region column), 'year' and 'over65' (share of
        the region total found in rows with 'Fascia' >= 65).
    """
    # '100+' is mapped to 100; rows whose 'Fascia' is not numeric are discarded.
    ages = pd.to_numeric(raw['Fascia'].replace({'100+': 100}), errors='coerce')
    has_age = ages.notna()
    ages = ages[has_age].astype(int)

    # Build the cleaned frame in one expression instead of assigning columns on
    # a filtered frame, which avoids pandas' SettingWithCopyWarning.
    population = (
        raw.loc[has_age]
        .drop(columns=['Fascia', 'Italia'], errors='ignore')
        .apply(_clean_counts)
    )

    totals = population.sum()
    mean_age = population.mul(ages, axis=0).sum() / totals
    share_over65 = population.loc[ages >= 65].sum() / totals

    # The three Series share the same index (the region column labels), so the
    # values can be laid side by side without a merge.
    return pd.DataFrame({
        'region': totals.index,
        'reg_age_avg': mean_age.to_numpy(),
        'year': int(year),
        'over65': share_over65.to_numpy(),
    })


df_list = []

# Open the workbook once for all sheets instead of re-reading it from disk on
# every iteration; the context manager closes the handle afterwards.
with pd.ExcelFile(file_path) as workbook:
    for year in sheets:
        # skiprows=[0] drops the first spreadsheet row so the next one is the header.
        raw_sheet = pd.read_excel(workbook, sheet_name=year, skiprows=[0])
        df_list.append(_age_indicators(raw_sheet, year))

# Final long-format table: one row per (region, year).
df = pd.concat(df_list, ignore_index=True)


Save the data

In [4]:
# Write the combined table to a Parquet file. index=None keeps pandas' default
# index handling.
output_path = 'datasets/macro/reg_age.parquet'
df.to_parquet(output_path, index=None)