In [1]:
import pandas as pd
import numpy as np
from functions import *

Load the dataset (https://aster.istat.it/#/it/atlante_comuni/categories/CARTA_IDENTITA_COM/IT1,DF_CARTA_IDENTITA,1.0)

In [2]:
# Import datasets
df = pd.read_csv('C:/Users/HP/Desktop/Traineeship/data/Mun_Population_Area.csv')

# Drop ID column
df = df.drop(columns=['Comune (ID)'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8996 entries, 0 to 8995
Data columns (total 51 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Comune                                            8995 non-null   object 
 1   2024 Popolazione al 1º gennaio (numero abitanti)  7899 non-null   float64
 2   2024 Superficie (kmq)                             7892 non-null   float64
 3   2023 Popolazione al 1º gennaio (numero abitanti)  7895 non-null   float64
 4   2023 Superficie (kmq)                             7901 non-null   float64
 5   2022 Popolazione al 1º gennaio (numero abitanti)  7904 non-null   float64
 6   2022 Superficie (kmq)                             7904 non-null   float64
 7   2021 Popolazione al 1º gennaio (numero abitanti)  7903 non-null   float64
 8   2021 Superficie (kmq)                             7903 non-null   float64
 9   2020 Popolazione al

Pivoting the dataset on 'year'

In [3]:
# Rename columns
df = df.rename(columns={'Comune': 'mun_name'})

tidy = []
for col in df.columns:
    if "Popolazione" in col:
        year = col.split()[0]
        tidy.append(
            df[["mun_name", col, f"{year} Superficie (kmq)"]]
            .rename(columns={
                col: "population",
                f"{year} Superficie (kmq)": "surface"
            })
            .assign(year=int(year))
        )

# Concatenate into a long dataframe
df_long = pd.concat(tidy, ignore_index=True)

# Final clean ordering
df_long = df_long[["mun_name", "year", "population", "surface"]]

df_long.head(10)

Unnamed: 0,mun_name,year,population,surface
0,Agliè,2024,2596.0,13.1463
1,Airasca,2024,3686.0,15.7393
2,Ala di Stura,2024,472.0,46.3316
3,Albiano d'Ivrea,2024,1617.0,11.7397
4,Alice Superiore,2024,,
5,Almese,2024,6315.0,17.8741
6,Alpette,2024,246.0,5.6311
7,Alpignano,2024,16587.0,11.9193
8,Andezeno,2024,2001.0,7.4859
9,Andrate,2024,467.0,9.309


Some municipalities were merged to others in the years. I need a dataframe with municipalities updated to 2024.

In [4]:
# Import dataset with recorded municipal changes
df_change = pd.read_csv('C:/Users/HP/Desktop/Traineeship/data/Elenco comuni soppressi e non ricostituiti Data Indagine 17-03-1861 Stampa 14102025095708.csv', sep=';', index_col=False)

In [5]:
# Translate df_change and select new mun_name after 1991 (starting year in df_long)
col = ['Anno evento', 'Comune', 'Comune associato', 'Codice Comune associato']

dict = {
    'Anno evento':'year',
    'Comune':'mun_name',
    'Comune associato':'mun_new'
}

df_new = df_change[col][df_change['Anno evento'] >= 1991].rename(dict, axis=1)

df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 344 entries, 0 to 343
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     344 non-null    int64  
 1   mun_name                 344 non-null    object 
 2   mun_new                  344 non-null    object 
 3   Codice Comune associato  344 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 13.4+ KB


In [6]:
# Normalize mun_name in df_new and df_long
col = ['mun_name', 'mun_new']

for c in col:
    df_change[f"{c}_norm"] = df_new[c].apply(normalize_name).astype('object')

df_long['mun_name_norm'] = df_long['mun_name'].apply(normalize_name).astype('object')

In [7]:
# Merge df_long with new mun names
df_long = df_long.merge(df_change[['mun_name_norm', 'mun_new_norm']], on = 'mun_name_norm', how='left')

In [8]:
# Create column 'mun_final_name' where mun_new_norm is stored. Also, transfer mun_name_norm for municipalities that did not change. 
df_long['mun_final_name'] = df_long['mun_new_norm'].fillna(df_long['mun_name_norm'])

In [9]:
# Aggregate identical [mun_final_name, year] and sum 'population' and 'surface' 
df_agg = (
    df_long
    .groupby(['year', 'mun_final_name'], as_index=False)
    .agg({
        'population': 'sum',
        'surface': 'sum'
    })
)

# Rename 'mun_final_name' into 'mun_name_norm'
rename_dict = {
    'mun_final_name':'mun_name_norm'
}

df_agg = df_agg.rename(columns=rename_dict)

print("Before aggregation:", len(df_long['mun_name_norm'].unique()))
print("After aggregation:", len(df_agg['mun_name_norm'].unique()))

df_agg.info()

Before aggregation: 8219
After aggregation: 7891
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197275 entries, 0 to 197274
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   year           197275 non-null  int64  
 1   mun_name_norm  197275 non-null  object 
 2   population     197275 non-null  float64
 3   surface        197275 non-null  float64
dtypes: float64(2), int64(1), object(1)
memory usage: 6.0+ MB


Select year >=2001 (starting date of continous data)

In [10]:
# Drop missing values and select data for year >= 2001
df_agg = df_agg.dropna(subset=['mun_name_norm'])
df_agg = df_agg[df_agg['year'] >= 2001]

Add Istat codes (https://situas.istat.it/web/#/home/piu-consultati?id=61&dateFrom=2025-10-10)

In [11]:
df_istat = pd.read_parquet('datasets/mun_istat_codes.parquet')

In [12]:
df_merged = pd.merge(df_agg, df_istat, on = ['mun_name_norm'], how = 'left')

# see unmatched (munisties without ISTAT)
unmatched = df_merged[df_merged["mun_istat"].isna()]
print("Unmatched names:", unmatched["mun_name"].unique())

Unmatched names: [nan]


In [13]:
df_merged = df_merged.drop(columns = ['mun_name','prov_istat','reg_name','prov_name_norm','reg_name_norm','mun_key'])

Save the dataset

In [14]:
df_merged.to_parquet('datasets/mun_dem.parquet', index=False)