In [16]:
import pandas as pd
import geopandas as gpd
from functions import *

Load dataset (https://www.istat.it/notizia/confini-delle-unita-amministrative-a-fini-statistici-al-1-gennaio-2018-2/)

In [17]:
# Municipality boundaries shapefile read into a GeoDataFrame.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will not run elsewhere until it is made relative/configurable.
municipality_shapefile = 'C:/Users/HP/Desktop/Traineeship/data/Com01012025/Com01012025_WGS84.shp'
gdf = gpd.read_file(municipality_shapefile)

Rename + select columns

In [18]:
# Columns retained for the rest of the notebook (names are the ones in effect
# after the rename below).
col_to_keep = ['mun_name', 'CC_UTS', 'Shape_Leng', 'Shape_Area', 'geometry', 'mun_istat']

# Rename PRO_COM_T -> mun_istat and COMUNE -> mun_name, then keep only the
# columns listed above.
gdf = gdf.rename(columns={'PRO_COM_T': 'mun_istat', 'COMUNE': 'mun_name'})[col_to_keep]

Load dataset with new ISTAT codes

In [19]:
# Reference table of ISTAT codes; only the code and the normalised
# municipality name are needed downstream.
df_new_istat = pd.read_parquet('datasets/mun_istat_codes.parquet')[['mun_istat', 'mun_name_norm']]

# Table passed to update_istat as the code mapping (df_map).
df_change = pd.read_parquet('datasets/changes_istat.parquet')

Update istat codes

In [20]:
# Run the project helper that updates the ISTAT codes of the boundary data.
# NOTE(review): later cells read 'mun_istat_updated', 'changed' and 'suppressed'
# from the result, so update_istat is presumably what adds them - confirm in functions.py.
gdf_updated = update_istat(
    df=gdf,
    df_map=df_change,
    valid_codes=df_new_istat["mun_istat"],
    istat_col="mun_istat",
    istat_old="mun_istat_old",
    istat_new="mun_istat_new",
)

# Normalised name column, used further down as the join key against df_new_istat.
gdf_updated['mun_name_norm'] = gdf_updated['mun_name'].apply(normalize_name)

# The original code and raw name columns are no longer needed from here on.
gdf_updated = gdf_updated.drop(['mun_istat', 'mun_name'], axis=1)

In [21]:
# Split on the 'suppressed' flag into two independent copies. Rows where the
# flag is missing end up in neither frame.
suppressed_df = gdf_updated.loc[gdf_updated['suppressed'].eq(True)].copy()
non_suppressed_df = gdf_updated.loc[gdf_updated['suppressed'].eq(False)].copy()

# Compare the suppressed names against the reference table on 'mun_name_norm'
# and display the resulting table.
similarity = similarity_score(suppressed_df, df_new_istat, col='mun_name_norm')
similarity

Unnamed: 0,Name in df1,Name in df2,Similarity score (0-100)
3,trinita dagultu e vignola,trinita dagultu e vignola,98.039216
0,ala dei sardi,ala dei sardi,96.296296
2,nughedu san nicola2,nughedu san nicolo,91.891892
4,tortola,tortona,85.714286
1,buddusa2,budduso,80.0


In [22]:
# Manual corrections for the suppressed municipalities: keys are the names as
# they appear in suppressed_df, values are the names to look up in df_new_istat.
# The targets were chosen by reviewing the similarity table shown above.
manual_name_fixes = {
    'trinita  dagultu e vignola': 'trinita dagultu e vignola',
    'ala  dei sardi': 'ala dei sardi',
    'nughedu san nicola2': 'nughedu san nicolo',
    # FIX: this previously pointed at 'tortona', which is a different
    # municipality. The other entries are accent-mangled names ('buddusa2' ->
    # 'budduso'), so 'tortola' is the mangled form of "Tortoli'"; mapping it to
    # 'tortona' would attach the wrong ISTAT code to this polygon.
    # NOTE(review): assumes 'tortoli' is the normalised form stored in
    # df_new_istat - if it is not, the "Unmatched names" count below will be 1.
    'tortola': 'tortoli',
    'buddusa2': 'budduso',
}
suppressed_df['mun_name_norm'] = suppressed_df['mun_name_norm'].replace(manual_name_fixes)

# Look up the ISTAT code by normalised name.
# NOTE(review): the join key is the name alone; if df_new_istat holds two
# municipalities with the same normalised name this merge duplicates rows.
suppressed_df = pd.merge(suppressed_df, df_new_istat, on=['mun_name_norm'], how='left')

# see unmatched (without ISTAT)
unmatched = suppressed_df[suppressed_df["mun_istat"].isna()]
print("Unmatched names:", unmatched["mun_name_norm"].nunique())

# drop rows where new ISTAT could not be found; .copy() so the column
# assignment below writes to an independent frame, not to a filtered slice
suppressed_df = suppressed_df[suppressed_df['mun_istat'].notna()].copy()

# replace ISTAT code in suppressed_df
suppressed_df['mun_istat_updated'] = suppressed_df['mun_istat']

# drop helper column
suppressed_df = suppressed_df.drop(columns=['mun_istat'])

# concatenate with non-suppressed rows
gdf_updated = pd.concat([non_suppressed_df, suppressed_df], ignore_index=True)

# remove the bookkeeping columns and expose the updated code as 'mun_istat'
gdf_updated = gdf_updated.drop(columns=['changed', 'suppressed', 'mun_name_norm'])

gdf_updated = gdf_updated.rename(columns={'mun_istat_updated': 'mun_istat'})


Unmatched names: 0


In [23]:
# Preview the first five rows of the combined frame.
gdf_updated.head(n=5)

Unnamed: 0,CC_UTS,Shape_Leng,Shape_Area,geometry,mun_istat
0,0,18035.254001,13146260.0,"POLYGON ((404703.561 5026682.656, 404733.562 5...",1001
1,0,18408.906988,15739310.0,"POLYGON ((380700.909 4977305.52, 380702.627 49...",1002
2,0,31834.156081,46331560.0,"POLYGON ((364710.856 5022090.677, 364725.872 5...",1003
3,0,18927.262827,11739690.0,"POLYGON ((415942.51 5033304.612, 415962.885 50...",1004
4,0,17057.413964,17874120.0,"POLYGON ((376934.55 4999073.855, 376941.551 49...",1006


Update province codes

In [24]:
# Three-character code prefix -> two-letter province abbreviation, listed in
# code order with one entry per code.
prov_abbr_by_code = {
    '001': 'TO', '002': 'VC', '003': 'NO', '004': 'CN', '005': 'AT',
    '006': 'AL', '007': 'AO', '008': 'IM', '009': 'SV', '010': 'GE',
    '011': 'SP', '012': 'VA', '013': 'CO', '014': 'SO', '015': 'MI',
    '016': 'BG', '017': 'BS', '018': 'PV', '019': 'CR', '020': 'MN',
    '021': 'BZ', '022': 'TN', '023': 'VR', '024': 'VI', '025': 'BL',
    '026': 'TV', '027': 'VE', '028': 'PD', '029': 'RO', '030': 'UD',
    '031': 'GO', '032': 'TS', '033': 'PC', '034': 'PR', '035': 'RE',
    '036': 'MO', '037': 'BO', '038': 'FE', '039': 'RA', '040': 'FC',
    '041': 'PU', '042': 'AN', '043': 'MC', '044': 'AP', '045': 'MS',
    '046': 'LU', '047': 'PT', '048': 'FI', '049': 'LI', '050': 'PI',
    '051': 'AR', '052': 'SI', '053': 'GR', '054': 'PG', '055': 'TR',
    '056': 'VT', '057': 'RI', '058': 'RM', '059': 'LT', '060': 'FR',
    '061': 'CE', '062': 'BN', '063': 'NA', '064': 'AV', '065': 'SA',
    '066': 'AQ', '067': 'TE', '068': 'PE', '069': 'CH', '070': 'CB',
    '071': 'FG', '072': 'BA', '073': 'TA', '074': 'BR', '075': 'LE',
    '076': 'PZ', '077': 'MT', '078': 'CS', '079': 'CZ', '080': 'RC',
    '081': 'TP', '082': 'PA', '083': 'ME', '084': 'AG', '085': 'CL',
    '086': 'EN', '087': 'CT', '088': 'RG', '089': 'SR',
    '093': 'PN', '094': 'IS',
    '096': 'BI', '097': 'LC', '098': 'LO', '099': 'RN', '100': 'PO',
    '101': 'KR', '102': 'VV', '103': 'VB',
    '108': 'MB', '109': 'FM', '110': 'BT',
    '112': 'SS', '113': 'OT', '114': 'NU', '115': 'OR', '116': 'OG',
    '117': 'VS', '118': 'CA', '119': 'CI',
}

# The first three characters of the municipality code are the province part.
gdf_updated['prov'] = gdf_updated['mun_istat'].str[:3]

# Swap each prefix for its abbreviation; a prefix with no entry in the mapping
# is left as the raw three-character string.
gdf_updated['prov'] = gdf_updated['prov'].replace(prov_abbr_by_code)

Save the dataset (GeoPackage because ShapeFile does not support large number values)

In [26]:
# Write the final frame to a GeoPackage file as the "municipalities" layer.
gdf_updated.to_file(
    "datasets/mun_gis_data/mun_map_updated.gpkg",
    layer="municipalities",
    driver="GPKG",
)