In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce 
from functions import *
from rapidfuzz import process, fuzz

Load datasets

In [98]:
df_house = pd.read_parquet('datasets/house_post_2014.parquet')

df_dwelling = pd.read_parquet('datasets/mun_dwelling.parquet') 

df_income = pd.read_parquet('datasets/mun_lag_income.parquet') 
df_income['year'] = df_income['year'] + 1 # shift birth rate by 1 year to match with house data


df_dem = pd.read_parquet('datasets/mun_dem.parquet')


df_tourism = pd.read_parquet('datasets/mun_tourism.parquet')

# macro

df_imm_emi = pd.read_parquet('datasets/macro/prov_lag_pop_movements.parquet')
df_imm_emi['year'] = df_imm_emi['year'] + 1 # shift birth rate by 1 year to match with house data


df_birth = pd.read_parquet('datasets/macro/lag_birth_rate.parquet')
df_birth['year'] = df_birth['year'] + 1 # shift birth rate by 1 year to match with house data


df_bond = pd.read_parquet('datasets/macro/lag_bond.parquet')
df_bond['year'] = df_bond['year'] + 1 # shift birth rate by 1 year to match with house data


df_cpi = pd.read_parquet('datasets/macro/lag_cpi.parquet')
df_cpi['year'] = df_cpi['year'] + 1 # shift birth rate by 1 year to match with house data


df_cpi_growth = pd.read_parquet('datasets/macro/lag_cpi_growth.parquet')
df_cpi_growth['year'] = df_cpi_growth['year'] + 1 # shift birth rate by 1 year to match with house data


df_gini = pd.read_parquet('datasets/macro/lag_gini.parquet')
df_gini['year'] = df_gini['year'] + 1 # shift birth rate by 1 year to match with house data


df_life = pd.read_parquet('datasets/macro/lag_life.parquet')
df_life['year'] = df_life['year'] + 1 # shift birth rate by 1 year to match with house data


df_real_gdp = pd.read_parquet('datasets/macro/lag_gdp_real.parquet')
df_real_gdp['year'] = df_real_gdp['year'] + 1 # shift birth rate by 1 year to match with house data


df_gdp_growth = pd.read_parquet('datasets/macro/lag_gdp_growth.parquet')
df_gdp_growth['year'] = df_gdp_growth['year'] + 1 # shift birth rate by 1 year to match with house data


df_unemployment_prov = pd.read_parquet('datasets/macro/prov_unemployment.parquet')


df_crime = pd.read_parquet('datasets/macro/lag_crime.parquet')
df_crime['year'] = df_crime['year'] + 1 # shift birth rate by 1 year to match with house data


df_reg_age = pd.read_parquet('datasets/macro/reg_age.parquet')

In [99]:
# istat codes updated to 2025
df_new_istat = pd.read_parquet('datasets/mun_istat_codes.parquet')

# istat codes changes
df_change = pd.read_parquet('datasets/changes_istat.parquet')

In [100]:
df_new_istat = df_new_istat[['mun_istat','mun_name_norm','prov_istat']]

1. update istat codes
2. check for istat codes that have not been changed -> manually check
3. merge

df_house

In [101]:
df_house_updated = update_istat(
    df=df_house,
    df_map=df_change, 
    valid_codes=df_new_istat["mun_istat"], 
    istat_col="mun_istat",
    istat_old = "mun_istat_old",
    istat_new = "mun_istat_new"
)

df_house_updated = df_house_updated.drop(columns = 'mun_istat')

In [102]:
# split in suppressed (no correspondance with latest istat codes) and non suppressed
suppressed_df = df_house_updated[df_house_updated['suppressed'] == True].copy()
non_suppressed_df = df_house_updated[df_house_updated['suppressed'] == False].copy()

In [103]:
# check for similarity in unmatched mun_name_norm (suppressed)
similarity = similarity_score(suppressed_df, df_new_istat, col = 'mun_name_norm')
similarity

Unnamed: 0,Name in df1,Name in df2,Similarity score (0-100)
134,loiri porto s paolo,loiri porto san paolo,95.000000
10,castellar,castellaro,94.736842
132,trinita agultu vignola,trinita dagultu e vignola,93.617021
73,vezzano,avezzano,93.333333
12,veruno,verduno,92.307692
...,...,...,...
78,zambana,agna,77.142857
120,acquacanina,acqualagna,76.190476
23,valsecca,valmacca,75.000000
79,zuclo,zuglio,72.727273


In [104]:
# manually sobstitute non-corresponding mun names
suppressed_df['mun_name_norm'] = suppressed_df['mun_name_norm'].replace({
    'loiri porto s paolo':	'loiri porto san paolo',
'castellar' :	'castellaro',
'trinita agultu vignola':	'trinita dagultu e vignola',
's antonio di gallura':	'santantonio di gallura',
'tonengo' :	'moransengotonengo',
'piovera':	'alluvioni piovera',
'quaregna'	:'quaregna cerreto',
'cellio' :	'cellio con breia',
'lisignago' :	'cembra lisignago',
'condino':  	'castel condino',
'osmate'	: 'cadrezzate con osmate',
'veddasca' 	:'maccagno con pino e veddasca',
'malgesso'	:'bardello con malgesso e bregano',
'villa vicentina'	:'fiumicello villa vicentina',
'lusiana'	:'lusiana conco',
'vigolo vattaro'	:'vigolo',
'gravedona'	:'gravedona ed uniti',
'sorbolo'	:'sorbolo mezzani',
'san marcello pistoiese'	:'san marcello',
'presicce'	:'presicceacquarica',
'rossano'	:'rossano veneto',
'ripe'	:'ripe san ginesio',
'cutigliano'	:'abetone cutigliano',
'lorenzana'	:'crespina lorenzana',
'sillano'	:'sillano giuncugnano',
'zibello'	:'polesine zibello',
'piandisco':	'castelfranco piandisco',
'mossano'	 : 'barbarano mossano'
})

In [105]:
suppressed_df = pd.merge(suppressed_df, df_new_istat, on = ['mun_name_norm'], how = 'left')

# check number of umatched cases
unmatched = suppressed_df[suppressed_df["mun_istat"].isna()]
print("Unmatched names:", unmatched["mun_name_norm"].nunique())

# drop rows where new ISTAT could not be found
suppressed_df = suppressed_df[suppressed_df['mun_istat'].notna()]

# replace ISTAT code in suppressed_df
suppressed_df['mun_istat_updated'] = suppressed_df['mun_istat']

# drop mun_istat
suppressed_df = suppressed_df.drop(columns=['mun_istat'])

# concatenate with non-suppressed rows
df_house_updated = pd.concat([non_suppressed_df, suppressed_df], ignore_index=True)

df_house_updated = df_house_updated.drop(columns = ['changed','suppressed'])

Unmatched names: 108


In [106]:
# group by ['mun_istat','year','sector','type','condition']
df_house_updated = df_house_updated.groupby(
        ['mun_istat_updated','year','sector','type','condition'],
        observed=True              
    ).agg({
    # keep one representative name
    'mun_key': 'first', 
    'mun_name': 'first', 
    'mun_name_norm': 'first',
    'region': 'first',
    'prov' : 'first',
    'log_buy_min' : 'mean',
    'log_buy_max' : 'mean',
    'log_buy_avg': 'mean'
}).reset_index()

df_house_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549552 entries, 0 to 549551
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   mun_istat_updated  549552 non-null  object  
 1   year               549552 non-null  int64   
 2   sector             549552 non-null  category
 3   type               549552 non-null  object  
 4   condition          549552 non-null  category
 5   mun_key            549552 non-null  object  
 6   mun_name           549552 non-null  object  
 7   mun_name_norm      549552 non-null  object  
 8   region             549552 non-null  category
 9   prov               549552 non-null  object  
 10  log_buy_min        549552 non-null  float64 
 11  log_buy_max        549552 non-null  float64 
 12  log_buy_avg        549552 non-null  float64 
dtypes: category(3), float64(3), int64(1), object(6)
memory usage: 43.5+ MB


In [107]:
# merge province istat code
df_house_updated['prov_istat'] = df_house_updated['mun_istat_updated'].astype(str).str[:3]

In [108]:
# Update region names
df_house_updated['region'] = df_house_updated['region'].replace({
    'ABRUZZO':	'Abruzzo',
    'BASILICATA':	'Basilicata',
    'CALABRIA':	'Calabria',
    'CAMPANIA':	'Campania',
    'EMILIA-ROMAGNA':	'Emilia Romagna',
    'FRIULI-VENEZIA GIULIA':	'Friuli Venezia Giulia',
    'LAZIO':	'Lazio',
    'LIGURIA':	'Liguria',
    'LOMBARDIA':	'Lombardia',
    'MARCHE':	'Marche',
    'MOLISE':	'Molise',
    'PIEMONTE':	'Piemonte',
    'PUGLIA':	'Puglia',
    'SARDEGNA':	'Sardegna',
    'SICILIA':	'Sicilia',
    'TOSCANA':	'Toscana',
    'TRENTINO-ALTO ADIGE':	'Trentino Alto Adige',
    'UMBRIA':	'Umbria',
    "VALLE D'AOSTA/VALLE`E D'AOSTE":	"Valle D'Aosta",
    'VENETO':	'Veneto'
})


  df_house_updated['region'] = df_house_updated['region'].replace({


In [109]:
# Count the number of duplicate listings
duplicates = df_house_updated.value_counts(subset=['mun_istat_updated', 'year', 'sector', 'type', 'condition'])

duplicates = duplicates[duplicates > 1]

print("Number of duplicate listings for the same semester:", duplicates.sum())

Number of duplicate listings for the same semester: 0


df_dwelling

In [110]:
df_dwelling_updated = update_istat(
    df=df_dwelling,
    df_map=df_change, 
    valid_codes=df_new_istat["mun_istat"], 
    istat_col="mun_istat",
    istat_old = "mun_istat_old",
    istat_new = "mun_istat_new"
)

df_dwelling_updated = df_dwelling_updated.drop(columns = ['mun_istat','occupation'])

df_dwelling_updated['mun_name_norm'] = df_dwelling_updated['mun_name'].apply(normalize_name)

In [111]:
# split in suppressed (no correspondance with latest istat codes) and non suppressed
suppressed_df = df_dwelling_updated[df_dwelling_updated['suppressed'] == True].copy()
non_suppressed_df = df_dwelling_updated[df_dwelling_updated['suppressed'] == False].copy()

In [112]:
similarity = similarity_score(suppressed_df, df_new_istat, col = 'mun_name_norm')
similarity

Unnamed: 0,Name in df1,Name in df2,Similarity score (0-100)
0,malgesso,bardello con malgesso e bregano,90.0
1,tonengo,moransengotonengo,90.0
2,uggiatetrevano,re,90.0
3,vighizzolo deste,este,90.0
4,quero vas,crova,72.0


In [113]:
# manually sobstitute non-corresponding mun_names
suppressed_df['mun_name_norm'] = suppressed_df['mun_name_norm'].replace({
    'malgesso' : 'bardello con malgesso e bregano',
    'tonengo' : 'moransengotonengo'
})

suppressed_df = pd.merge(suppressed_df, df_new_istat, on = ['mun_name_norm'], how = 'left')

# see unmatched (without ISTAT)
unmatched = suppressed_df[suppressed_df["mun_istat"].isna()]
print("Unmatched names:", unmatched["mun_name_norm"].nunique())

# drop rows where new ISTAT could not be found
suppressed_df = suppressed_df[suppressed_df['mun_istat'].notna()]

# replace ISTAT code in suppressed_df
suppressed_df['mun_istat_updated'] = suppressed_df['mun_istat']

# drop mun_istat
suppressed_df = suppressed_df.drop(columns=['mun_istat'])

# concatenate with non-suppressed rows
df_dwelling_updated = pd.concat([non_suppressed_df, suppressed_df], ignore_index=True)

df_dwelling_updated = df_dwelling_updated.drop(columns = ['changed','suppressed','mun_name_norm','mun_name','prov_istat'])

Unmatched names: 3


In [114]:
# Count the number of duplicate listings
duplicates = df_dwelling_updated.value_counts(subset=['mun_istat_updated'])

duplicates = duplicates[duplicates > 1]

print("Number of duplicate listings for the same semester:", duplicates.sum())

Number of duplicate listings for the same semester: 9


In [115]:
# Delete duplicate listings for the same istat code - keep the first occurrence
df_dwelling_updated = df_dwelling_updated.drop_duplicates(subset=['mun_istat_updated'], keep='first')

df_tourism

In [116]:
df_tourism_updated = update_istat(
    df=df_tourism,
    df_map=df_change, 
    valid_codes=df_new_istat["mun_istat"], 
    istat_col="mun_istat",
    istat_old = "mun_istat_old",
    istat_new = "mun_istat_new"
)

df_tourism_updated['mun_name_norm'] = df_tourism_updated['mun_name'].apply(normalize_name)

df_tourism_updated = df_tourism_updated.drop(columns = ['mun_istat','region','mun_name'])

In [117]:
suppressed_df = df_tourism_updated[df_tourism_updated['suppressed'] == True].copy()
non_suppressed_df = df_tourism_updated[df_tourism_updated['suppressed'] == False].copy()

similarity = similarity_score(suppressed_df, df_new_istat, col = 'mun_name_norm')
similarity

Unnamed: 0,Name in df1,Name in df2,Similarity score (0-100)
25,castellar,castellaro,94.736842
92,vezzano,avezzano,93.333333
23,veruno,verduno,92.307692
13,varena,varenna,92.307692
90,tuenno,tenno,90.909091
...,...,...,...
53,cavacurta,cave,77.142857
73,acquacanina,acqualagna,76.190476
106,valsecca,valmacca,75.000000
96,zuclo,zuglio,72.727273


In [118]:
suppressed_df['mun_name_norm'] = suppressed_df['mun_name_norm'].replace({
    'tonengo':	'moransengotonengo',
'malgesso':	'bardello con malgesso e bregano',
'osmate':	'cadrezzate con osmate',
'piovera':	'alluvioni piovera',
'cellio':	'cellio con breia',
'quaregna':	'quaregna cerreto',
'piadena':	'piadena drizzona',
'sorbolo': 'sorbolo mezzani',
'villa vicentina':	'fiumicello villa vicentina',
'mossano':	'barbarano mossano',
'lusiana':	'lusiana conco',
'presicce':	'presicceacquarica',
'cornale':	'cornale e bastida',
'veddasca':	'maccagno con pino e veddasca',
'sillano':	'sillano giuncugnano',
'zibello':	'polesine zibello',
'cutigliano':	'abetone cutigliano',
'lisignago':	'cembra lisignago'
})

suppressed_df = pd.merge(suppressed_df, df_new_istat, on = ['mun_name_norm'], how = 'left')

# see unmatched (without ISTAT)
unmatched = suppressed_df[suppressed_df["mun_istat"].isna()]
print("Unmatched names:", unmatched["mun_name_norm"].nunique())

# drop rows where new ISTAT could not be found
suppressed_df = suppressed_df[suppressed_df['mun_istat'].notna()]

# replace ISTAT code in suppressed_df
suppressed_df['mun_istat_updated'] = suppressed_df['mun_istat']

# drop mun_istat
suppressed_df = suppressed_df.drop(columns=['mun_istat'])

# concatenate with non-suppressed rows
df_tourism_updated = pd.concat([non_suppressed_df, suppressed_df], ignore_index=True)

df_tourism_updated = df_tourism_updated.drop(columns = ['changed','suppressed','mun_name_norm','prov_istat'])

Unmatched names: 97


In [119]:
# Count the number of duplicate listings
duplicates = df_tourism_updated.value_counts(subset=['mun_istat_updated','year'])

duplicates = duplicates[duplicates > 1]

print("Number of duplicate listings for the same semester:", duplicates.sum())

Number of duplicate listings for the same semester: 537


In [120]:
# delete duplicates for the same istat code and year - keep the first occurence
df_tourism_updated = df_tourism_updated.drop_duplicates(subset = ['mun_istat_updated', 'year'], keep = 'first')

df_dem

In [121]:
df_dem_updated = update_istat(
    df=df_dem,
    df_map=df_change, 
    valid_codes=df_new_istat["mun_istat"], 
    istat_col="mun_istat",
    istat_old = "mun_istat_old",
    istat_new = "mun_istat_new"
)

df_dem_updated = df_dem_updated.drop(columns = ['mun_istat','prov_name'])

In [122]:
suppressed_df = df_dem_updated[df_dem_updated['suppressed'] == True].copy()
non_suppressed_df = df_dem_updated[df_dem_updated['suppressed'] == False].copy()

similarity = similarity_score(suppressed_df, df_new_istat, col = 'mun_name_norm')
similarity

Unnamed: 0,Name in df1,Name in df2,Similarity score (0-100)
11,san floriano del collioteverjan,san floriano del colliosteverjan,98.412698
12,savogna disonzosovodnje ob soi,savogna disonzosovodnje ob soci,98.360656
2,duino aurisinadevin nabreina,duino aurisinadevin nabrezina,98.245614
8,pontcanavese,pont canavese,96.0
3,grana,monterosso grana,90.0
0,campospinoso,campospinoso albaredo,90.0
1,casorzo,casorzo monferrato,90.0
14,tripi,tripi abakainon,90.0
6,montemagno,montemagno monferrato,90.0
5,montagnamontan,monta,90.0


In [123]:
suppressed_df['mun_name_norm'] = suppressed_df['mun_name_norm'].replace({
    'san floriano del collioteverjan':	'san floriano del colliosteverjan',
'savogna disonzosovodnje ob soi':	'savogna disonzosovodnje ob soci',
'duino aurisinadevin nabreina':	'duino aurisinadevin nabrezina',
'pontcanavese':	'pont canavese',
'grana':	'monterosso grana',
'campospinoso':	'campospinoso albaredo',
'casorzo':	'casorzo monferrato',
'tripi':	'tripi  abakainon',
'montemagno':	'montemagno monferrato',
'ionadi':	'jonadi'
})

suppressed_df = pd.merge(suppressed_df, df_new_istat, on = ['mun_name_norm'], how = 'left')

# see unmatched (without ISTAT)
unmatched = suppressed_df[suppressed_df["mun_istat"].isna()]
print("Unmatched names:", unmatched["mun_name_norm"].nunique())

# drop rows where new ISTAT could not be found
suppressed_df = suppressed_df[suppressed_df['mun_istat'].notna()]

# replace ISTAT code in suppressed_df
suppressed_df['mun_istat_updated'] = suppressed_df['mun_istat']

# drop mun_istat
suppressed_df = suppressed_df.drop(columns=['mun_istat'])

# concatenate with non-suppressed rows
df_dem_updated = pd.concat([non_suppressed_df, suppressed_df], ignore_index=True)

df_dem_updated = df_dem_updated.drop(columns = ['changed','suppressed','mun_name_norm','prov_istat'])

Unmatched names: 5


In [124]:
# Count the number of duplicate listings
duplicates = df_dem_updated.value_counts(subset=['mun_istat_updated','year'])

duplicates = duplicates[duplicates > 1]

print("Number of duplicate listings for the same semester:", duplicates.sum())

Number of duplicate listings for the same semester: 96


In [125]:
df_dem_updated = df_dem_updated.drop_duplicates(subset = ['mun_istat_updated','year'], keep = 'first')

df_income (no mun_name_norm)

In [126]:
df_income_updated = update_istat(
    df=df_income,
    df_map=df_change, 
    valid_codes=df_new_istat["mun_istat"], 
    istat_col="mun_istat",
    istat_old = "mun_istat_old",
    istat_new = "mun_istat_new"
)

df_income_updated = df_income_updated.drop(columns = ['mun_istat'])

In [127]:
# print columns were suppressed is true and check istat codes
suppressed_df = df_income_updated[df_income_updated['suppressed'] == True].copy()

In [128]:
# manually sobstitute istat codes
df_income_updated['mun_istat_updated'] = df_income_updated['mun_istat_updated'].replace({
    '090001':	'113001',
'090002':	'113003',
'090006':	'113004',
'090009':	'113006',
'090014':	'113007',
'090017':	'113008',
'090021':	'113010',
'090035':	'113012',
'090036':	'113014',
'090037':	'113015',
'090041':	'113016',
'090044':	'112034',
'090047':	'113017',
'090049':	'113018',
'090054':	'113020',
'090062':	'113002',
'090063':	'113022',
'090070':	'113025',
'090074':	'113026',
'090080':	'113024',
'090081':	'113005',
'090083':	'113011',
'090084':	'113013',
'090085':	'113023',
'090090':	'113019',
'091002':	'116001',
'091005':	'116002',
'091006':	'116003',
'091019':	'116005',
'091026':	'116006',
'091031':	'116007',
'091032':	'116008',
'091035':	'116009',
'091037':	'116010',
'091039':	'116011',
'091042':	'116012',
'091069':	'116013',
'091072':	'116014',
'091088':	'116015',
'091089':	'116016',
'091095':	'116017',
'091097':	'116018',
'091098':	'116019',
'091099':	'116020',
'091100':	'116021',
'091101':	'116022',
'091103':	'116004',
'092009':	'118006',
'092109':	'118007',
'095006':	'115006',
'095037':	'115044',
'095065':	'115075',
'095067':	'115078',
'095083':	'115029',
'097080':	'016215',
'104001':	'113001',
'104002':	'113002',
'104003':	'113003',
'104004':	'113004',
'104005':	'113005',
'104006':	'113006',
'104007':	'113007',
'104009':	'113008',
'104010':	'113010',
'104011':	'113011',
'104012':	'113012',
'104013':	'113013',
'104014':	'113014',
'104015':	'113015',
'104016':	'113016',
'104017':	'113017',
'104018':	'113018',
'104020':	'113020',
'104021':	'113023',
'104022':	'113022',
'104023':	'113021',
'104024':	'113024',
'104025':	'113025',
'104026':	'113026',
'105001':	'116001',
'105002':	'116002',
'105003':	'116003',
'105004':	'116004',
'105005':	'116005',
'105006':   '116006',
'105007':	'116007',
'105008':	'116008',
'105009':	'116009',
'105010':	'116010',
'105011':	'116011',
'105012':	'116012',
'105013':	'116013',
'105014':	'116014',
'105016':	'116015',
'105017':	'116016',
'105018':	'116017',
'105019':	'116018',
'105020':	'116019',
'105021':	'116020',
'105022':	'116021',
'105023':	'116022'
})


In [129]:
# Count the number of duplicate listings
duplicates = df_income_updated.value_counts(subset=['mun_istat_updated','year'])

duplicates = duplicates[duplicates > 1]

print("Number of duplicate listings for the same semester:", duplicates.sum())

Number of duplicate listings for the same semester: 409


In [130]:
df_income_updated = df_income_updated.drop_duplicates(subset = ['mun_istat_updated', 'year'], keep = 'first')

Merge

In [131]:
# merge national house, income, demographic, and tourism data on [mun_istat_updated, year]
dfs1 = [df_house_updated, df_dem_updated, df_income_updated, df_tourism_updated]

df = reduce(lambda left, right: pd.merge(left, right, on = ['mun_istat_updated','year'], how = 'left'), dfs1)

In [132]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549552 entries, 0 to 549551
Data columns (total 35 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   mun_istat_updated        549552 non-null  object  
 1   year                     549552 non-null  int64   
 2   sector                   549552 non-null  category
 3   type                     549552 non-null  object  
 4   condition                549552 non-null  category
 5   mun_key                  549552 non-null  object  
 6   mun_name                 549552 non-null  object  
 7   mun_name_norm            549552 non-null  object  
 8   region                   549552 non-null  category
 9   prov                     549552 non-null  object  
 10  log_buy_min              549552 non-null  float64 
 11  log_buy_max              549552 non-null  float64 
 12  log_buy_avg              549552 non-null  float64 
 13  prov_istat               549552 non-null  ob

In [133]:
# merge dwellings count on [mun_istat]
df = pd.merge(df, df_dwelling_updated, on = ['mun_istat_updated'], how = 'left')

In [134]:
# drop null values
df = df.dropna()

Merge macros

In [135]:
# merge df_real_gdp, df_cpi. df_gini, df_unemployment, df_birth, df_life on [year]
dfs2 = [df, df_real_gdp, df_gdp_growth, df_cpi, df_cpi_growth, df_gini, df_birth, df_life, df_crime]

df = reduce(lambda left, right: pd.merge(left, right, on = ['year'], how = 'left'), dfs2)

In [136]:
# merge df_imm_emi on [prov, year]
df = pd.merge(df, df_imm_emi, on = ['prov', 'year'], how = 'left')

In [137]:
df_house_updated[df_house_updated['prov'] == 'NA']

Unnamed: 0,mun_istat_updated,year,sector,type,condition,mun_key,mun_name,mun_name_norm,region,prov,log_buy_min,log_buy_max,log_buy_avg,prov_istat
344044,063001,2014,B,independent houses and villas,normal,acerra_campania,ACERRA,acerra,Campania,,7.046571,7.466963,7.256767,063
344045,063001,2014,B,lowcost housing,normal,acerra_campania,ACERRA,acerra,Campania,,6.658965,7.068797,6.863881,063
344046,063001,2014,B,residential housing,normal,acerra_campania,ACERRA,acerra,Campania,,6.979805,7.423457,7.201631,063
344047,063001,2014,C,independent houses and villas,normal,acerra_campania,ACERRA,acerra,Campania,,7.068797,7.481457,7.275127,063
344048,063001,2014,C,lowcost housing,normal,acerra_campania,ACERRA,acerra,Campania,,6.703019,7.110488,6.906753,063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351938,063092,2023,D,residential housing,normal,massa di somma_campania,MASSA DI SOMMA,massa di somma,Campania,,7.047517,7.467371,7.257444,063
351939,063092,2024,B,lowcost housing,normal,massa di somma_campania,MASSA DI SOMMA,massa di somma,Campania,,6.709304,7.130899,6.920102,063
351940,063092,2024,B,residential housing,normal,massa di somma_campania,MASSA DI SOMMA,massa di somma,Campania,,7.090077,7.495542,7.292809,063
351941,063092,2024,D,lowcost housing,normal,massa di somma_campania,MASSA DI SOMMA,massa di somma,Campania,,6.659294,7.090077,6.874685,063


Update prov names

In [138]:
df['prov'] = df['prov_istat']

df['prov'] = df['prov'].replace({
    '084':	'AG',
'006':	'AL',
'042':	'AN',
'043':	'MC',
'007':	'AO',
'044':	'AP',
'109':	'FM',
'066':	'AQ',
'050':	'PI',
'051':	'AR',
'005':	'AT',
'064':	'AV',
'072':	'BA',
'110':	'BT',
'016':	'BG',
'096':	'BI',
'025':	'BL',
'030':	'UD',
'037':	'BO',
'062':  'BN',
'074':	'BR',
'017':	'BS',
'021':	'BZ',
'117':	'VS',
'118':	'CA',
'119':	'CI',
'070':	'CB',
'061':	'CE',
'069':	'CH',
'085':	'CL',
'004':	'CN',
'008':	'IM',
'013':	'CO',
'019':	'CR',
'024':	'VI',
'078':	'CS',
'087':	'CT',
'079':	'CZ',
'086':	'EN',
'038':	'FE',
'071':	'FG',
'110':	'BT',
'048':	'FI',
'040':	'FC',
'060':	'FR',
'010':	'GE',
'030':	'UD',
'031':	'GO',
'053':	'GR',
'008':	'IM',
'094':	'IS',
'101':	'KR',
'016':	'BG',
'097':	'LC',
'075':	'LE',
'049':	'LI',
'098':	'LO',
'059':	'LT',
'046':	'LU',
'043':	'MC',
'083':	'ME',
'015':	'MI',
'108':	'MB',
'020':	'MN',
'036':	'MO',
'045':	'MS',
'077':	'MT',
'063':	'NAP',
'003':	'NO',
'114':	'NU',
'115':	'OR',
'116':	'OG',
'118':	'CA',
'115':	'OR',
'082':	'PA',
'033':	'PC',
'028':	'PD',
'068':	'PE',
'054':	'PG',
'050':	'PI',
'093':	'PN',
'100':	'PO',
'034':	'PR',
'041':	'PU',
'099':	'RN',
'042':	'AN',
'047':	'PT',
'018':	'PV',
'076':	'PZ',
'039':	'RA',
'080':	'RC',
'035':	'RE',
'088':	'RG',
'057':	'RI',
'058':	'RM',
'099':	'RN',
'029':	'RO',
'065':	'SA',
'052':	'SI',
'014':	'SO',
'011':	'SP',
'089':	'SR',
'112':	'SS',
'113':	'OT',
'009':	'SV',
'073':	'TA',
'067':	'TE',
'016':	'BG',
'022':	'TN',
'001':	'TO',
'081':	'TP',
'055':	'TR',
'032':	'TS',
'026':	'TV',
'030':	'UD',
'012':	'VA',
'103':	'VB',
'002':	'VC',
'027':	'VE',
'024':	'VI',
'023':	'VR',
'056':	'VT',
'102':	'VV'
})

Merge unemployment_prov on [prov, year]

In [139]:
df = pd.merge(df, df_unemployment_prov, on = ['prov','year'], how = 'left')

Merge reg_age on [region, year]

In [140]:
df = pd.merge(df, df_reg_age, on = ['region','year'], how = 'left')

Check

In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548371 entries, 0 to 548370
Data columns (total 52 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   mun_istat_updated        548371 non-null  object  
 1   year                     548371 non-null  int64   
 2   sector                   548371 non-null  category
 3   type                     548371 non-null  object  
 4   condition                548371 non-null  category
 5   mun_key                  548371 non-null  object  
 6   mun_name                 548371 non-null  object  
 7   mun_name_norm            548371 non-null  object  
 8   region                   548371 non-null  object  
 9   prov                     548371 non-null  object  
 10  log_buy_min              548371 non-null  float64 
 11  log_buy_max              548371 non-null  float64 
 12  log_buy_avg              548371 non-null  float64 
 13  prov_istat               548371 non-null  ob

In [142]:
print('Number of data points lost from from df_house:', len(df_house_updated) - len(df))
print('Percentage: ', ((len(df_house_updated) - len(df))/len(df_house)) * 100)

Number of data points lost from from df_house: 1181
Percentage:  0.0867681189979568


In [143]:
df = df.rename(columns = {
    'mun_istat_updated' : 'mun_istat'
})

In [144]:
# Count the number of duplicate listings
duplicates = df.value_counts(subset=['mun_istat', 'sector', 'condition', 'type', 'year'])

duplicates = duplicates[duplicates > 1]

print("Number of duplicate listings for the same semester:", duplicates.sum())

Number of duplicate listings for the same semester: 0


Save dataset

In [145]:
df.to_parquet('datasets/pre_processing.parquet', index = False)