In [7]:
import pandas as pd
import glob
import os
import hashlib

Collecting CSV files

In [8]:
folder = "C:/Users/HP/Desktop/Traineeship/data_istat/values"

# Grab all CSV files
all_files = glob.glob(os.path.join(folder, "*.csv"))

Definition of unique ID

In [9]:
# Define columns that should uniquely identify a building (!!!!!!!!!!!!!!!!!)
id_columns = [
    'Regione', 'Prov', 'Comune_ISTAT', 'Comune_cat', 'Sez', 
    'Comune_amm', 'Comune_descrizione', 'Fascia', 'Zona', 
    'Sup_NL_compr', 'Sup_NL_loc', 'Descr_Tipologia'
]

# Function to create a unique building ID
def make_building_id(row):
    s = "_".join([str(row.get(col, '')) for col in id_columns])
    return hashlib.md5(s.encode()).hexdigest()

Extracting and cleaning file names (to define semesters) and assigning a unique ID to each building

In [10]:
dfs = []

for f in all_files:
    # Extract filename without extension
    filename = os.path.splitext(os.path.basename(f))[0]
    
    # Extract semester code
    parts = filename.split("_")
    semester_code = parts[-2]  # second to last part
    year = semester_code[:4]
    sem = semester_code[4]
    semester = f"{year}_S{sem}"
    
    # Read CSV, skip first title line
    df = pd.read_csv(f, sep=';', skiprows=1)
    
    # Strip whitespace and remove BOM from column names
    df.columns = df.columns.str.strip().str.replace('\ufeff','')
    
    # Keep only columns that exist in this CSV
    existing_id_cols = [col for col in id_columns if col in df.columns]
    if not existing_id_cols:
        raise ValueError(f"No ID columns found in {f}! Columns in file: {df.columns.tolist()}")
    
    # Concatenate static columns for building_id
    df['building_id'] = (
        df[existing_id_cols]
        .fillna('')
        .astype(str)
        .agg('_'.join, axis=1)
    )
    
    # Hash concatenated string
    df['building_id'] = df['building_id'].apply(lambda x: hashlib.md5(x.encode()).hexdigest())
    
    # Add semester column
    df['semester'] = semester
    
    # Convert numeric columns safely
    numeric_cols = ['Loc_min', 'Loc_max', 'Compr_min', 'Compr_max', 'Sup_NL_compr', 'Sup_NL_loc']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = (
                df[col]
                .astype(str)
                .str.replace(',', '.', regex=False)
            )
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    dfs.append(df)



Merging datasets

In [11]:
# Combine all data into one long-format DataFrame
long_df = pd.concat(dfs, ignore_index=True)

print(long_df.head())

  Area_territoriale   Regione Prov  Comune_ISTAT Comune_cat Sez Comune_amm  \
0        NORD-OVEST  PIEMONTE   AL     1006003.0       A2AA           A182   
1        NORD-OVEST  PIEMONTE   AL     1006003.0       A2AA           A182   
2        NORD-OVEST  PIEMONTE   AL     1006003.0       A2AA           A182   
3        NORD-OVEST  PIEMONTE   AL     1006003.0       A2AA           A182   
4        NORD-OVEST  PIEMONTE   AL     1006003.0       A2AA           A182   

  Comune_descrizione Fascia Zona  ... Stato_prev  Compr_min Compr_max  \
0        ALESSANDRIA      B   B1  ...                1110.0    1670.0   
1        ALESSANDRIA      B   B1  ...                1010.0    1520.0   
2        ALESSANDRIA      B   B1  ...                 610.0     840.0   
3        ALESSANDRIA      B   B1  ...                 490.0     660.0   
4        ALESSANDRIA      B   B1  ...                1180.0    1410.0   

  Sup_NL_compr Loc_min  Loc_max  Sup_NL_loc  Unnamed: 21  \
0          NaN     3.7      5.6 

Deleting duplicates and saving long_df as merged_real_estate.csv

In [12]:
# Check the number of rows in the combined dataset
num_rows = len(long_df)
print(f"Total number of rows in the combined dataset: {num_rows}")

# Check for duplicates on building_id and semester
duplicates_count = long_df.duplicated(subset=['building_id', 'semester']).sum()
print(f"Number of duplicate entries: {duplicates_count}")

exp_final_dimension = num_rows - duplicates_count
print(f"Expected final dimension after removing duplicates: {exp_final_dimension}")

# Remove duplicates
long_df = long_df.drop_duplicates(subset=['building_id', 'semester'])
final_dimension = len(long_df)
print(f"Final dimension after removing duplicates: {final_dimension}")

Total number of rows in the combined dataset: 7201300
Number of duplicate entries: 499664
Expected final dimension after removing duplicates: 6701636
Final dimension after removing duplicates: 6701636


Saving the dataset

In [13]:
long_df.to_csv('C:/Users/HP/Desktop/Traineeship/Code/merged_real_estate.csv', sep=',', index=False, encoding='utf-8-sig')