In [52]:
import pandas as pd
import geopandas as gpd
from functions import *

Load the municipality boundaries dataset (source: ISTAT, https://www.istat.it/notizia/confini-delle-unita-amministrative-a-fini-statistici-al-1-gennaio-2018-2/)

In [53]:
# Read the municipality boundaries shapefile (WGS84 variant, per the file
# name) into a GeoDataFrame. The path is an absolute, machine-specific location.
shapefile_path = 'C:/Users/HP/Desktop/Traineeship/data/Com01012025/Com01012025_WGS84.shp'
gdf = gpd.read_file(shapefile_path)

Rename and select columns

In [54]:
# Source column -> project column name (PRO_COM_T becomes mun_istat,
# COMUNE becomes mun_name).
column_aliases = {'PRO_COM_T': 'mun_istat', 'COMUNE': 'mun_name'}

# Columns retained after renaming, in output order.
col_to_keep = ['mun_name', 'CC_UTS', 'Shape_Leng', 'Shape_Area', 'geometry', 'mun_istat']

# Rename first, then restrict the frame to the retained columns.
gdf = gdf.rename(columns=column_aliases)[col_to_keep]

Load the datasets with the new ISTAT codes and the ISTAT code changes

In [55]:
# Reference table of municipality codes: only the code and the normalized
# name are needed downstream.
reference_columns = ['mun_istat', 'mun_name_norm']
df_new_istat = pd.read_parquet('datasets/mun_istat_codes.parquet')[reference_columns]

# Table of ISTAT code changes, passed to `update_istat` as its `df_map`.
df_change = pd.read_parquet('datasets/changes_istat.parquet')

Update ISTAT codes

In [56]:
# Column names handed to the project helper `update_istat`: the code column
# in the boundaries table and the old/new code columns of the change table.
istat_columns = {
    "istat_col": "mun_istat",
    "istat_old": "mun_istat_old",
    "istat_new": "mun_istat_new",
}

# Update the municipality codes, using the reference codes as the valid set.
gdf_updated = update_istat(
    df=gdf,
    df_map=df_change,
    valid_codes=df_new_istat["mun_istat"],
    **istat_columns,
)

# Add the normalized name (used later to match against the reference table),
# then discard the original code and name columns.
normalized_names = gdf_updated['mun_name'].apply(normalize_name)
gdf_updated['mun_name_norm'] = normalized_names
gdf_updated = gdf_updated.drop(['mun_istat', 'mun_name'], axis=1)

In [57]:
# Partition the rows on the `suppressed` flag; each part is an independent copy.
is_suppressed = gdf_updated['suppressed'].eq(True)
is_not_suppressed = gdf_updated['suppressed'].eq(False)
suppressed_df = gdf_updated.loc[is_suppressed].copy()
non_suppressed_df = gdf_updated.loc[is_not_suppressed].copy()

# Compare the suppressed rows' normalized names with the reference table
# (project helper `similarity_score`) and display the result.
similarity = similarity_score(suppressed_df, df_new_istat, col='mun_name_norm')
similarity

Unnamed: 0,Name in df1,Name in df2,Similarity score (0-100)
3,trinita dagultu e vignola,trinita dagultu e vignola,98.039216
0,ala dei sardi,ala dei sardi,96.296296
2,nughedu san nicola2,nughedu san nicolo,91.891892
4,tortola,tortona,85.714286
1,buddusa2,budduso,80.0


In [58]:
# Manual name corrections for the suppressed rows, based on the similarity
# table above. The left-hand spellings are kept exactly as they occur in the
# data (doubled spaces and stray "2" included).
name_corrections = {
    'trinita  dagultu e vignola': 'trinita dagultu e vignola',
    'ala  dei sardi': 'ala dei sardi',
    'nughedu san nicola2': 'nughedu san nicolo',
    # FIX: this used to map to 'tortona'. The garbled names here all look like
    # municipalities whose accented letter was mis-decoded (compare
    # 'buddusa2' -> 'budduso'), so 'tortola' is Tortolì, not Tortona. The
    # 85.7 similarity score is the same for 'tortona' and 'tortoli'; mapping
    # to 'tortona' gave this geometry another municipality's ISTAT code.
    # NOTE(review): assumes the reference table spells it 'tortoli' - if it
    # does not, the "Unmatched names" count printed below will be non-zero.
    'tortola': 'tortoli',
    'buddusa2': 'budduso',
}
suppressed_df['mun_name_norm'] = suppressed_df['mun_name_norm'].replace(name_corrections)

# Look up the current ISTAT code by normalized name. The merge brings in the
# reference table's `mun_istat` column, which is NaN where no name matched.
suppressed_df = pd.merge(suppressed_df, df_new_istat, on=['mun_name_norm'], how='left')

# see unmatched (without ISTAT)
unmatched = suppressed_df[suppressed_df["mun_istat"].isna()]
print("Unmatched names:", unmatched["mun_name_norm"].nunique())

# drop rows where new ISTAT could not be found; take an explicit copy so the
# column assignment below does not write to a filtered slice
# (avoids pandas' SettingWithCopyWarning)
suppressed_df = suppressed_df[suppressed_df['mun_istat'].notna()].copy()

# replace ISTAT code in suppressed_df with the one found by name
suppressed_df['mun_istat_updated'] = suppressed_df['mun_istat']

# drop helper column
suppressed_df = suppressed_df.drop(columns=['mun_istat'])

# concatenate with non-suppressed rows
gdf_updated = pd.concat([non_suppressed_df, suppressed_df], ignore_index=True)

# remove the bookkeeping columns and expose the updated code as `mun_istat`
gdf_updated = gdf_updated.drop(columns=['changed', 'suppressed', 'mun_name_norm'])
gdf_updated = gdf_updated.rename(columns={'mun_istat_updated': 'mun_istat'})


Unmatched names: 0


Save the dataset (as a GeoPackage, because the Shapefile format does not support large numeric values)

In [59]:
# Write the updated table to a GeoPackage file, in the "municipalities" layer.
output_path = "datasets/mun_gis_data/mun_map_updated.gpkg"
gdf_updated.to_file(output_path, layer="municipalities", driver="GPKG")