In [2]:
import pandas as pd
import numpy as np
import glob
import os
import hashlib
import matplotlib.pyplot as plt
import seaborn as sns

Collecting CSV files

In [3]:
# Folder containing the CSV files to load.
folder = "C:/Users/HP/Desktop/Traineeship/data_OMI/values"

# Grab all CSV files. glob returns paths in arbitrary, filesystem-dependent
# order; sorting makes the load order -- and therefore which row survives the
# later keep='first' de-duplication -- reproducible across runs and machines.
all_files = sorted(glob.glob(os.path.join(folder, "*.csv")))

Assigning a unique ID for each listing

In [4]:
# Define columns that should uniquely identify a listing
id_columns = [
    'Prov', 'Comune_ISTAT', 'Comune_descrizione', 'Fascia', 'Zona', 'Descr_Tipologia', 'Stato'
]

# Value columns that may be written with a decimal comma and must become floats
numeric_cols = ['Compr_min', 'Compr_max', 'Loc_min', 'Loc_max']


def _semester_from_path(path):
    """Return the semester label (formatted '<year>_S<n>') encoded in a file name.

    The second-to-last '_'-separated token of the file stem is read as the
    semester code: its first four characters are the year and the fifth
    character is the semester number.

    Raises:
        ValueError: if the file name has no such token.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    parts = stem.split("_")
    if len(parts) < 2 or len(parts[-2]) < 5:
        raise ValueError(f"Cannot extract a semester code from file name: {path!r}")
    semester_code = parts[-2]
    return f"{semester_code[:4]}_S{semester_code[4]}"


def _hashable_id_frame(frame, columns):
    """Return ``frame[columns]`` as canonical text, ready to be hashed.

    read_csv infers dtypes file by file, so the same code can arrive as int64
    in one file and as float64 (or mixed object) in another that contains
    blanks. Hashing the raw values would then give one listing different IDs
    in different semesters. Converting every key column to text -- with
    whole-number floats rendered as integers and missing values as '' --
    makes the hash depend only on the cell contents.

    Raises:
        KeyError: if one of ``columns`` is missing from ``frame``.
    """
    key = pd.DataFrame(index=frame.index)
    for col in columns:
        values = frame[col]
        # Floats that are all whole numbers only exist because NaN forced an
        # upcast; go back to nullable integers so 1001.0 is written as '1001'.
        if pd.api.types.is_float_dtype(values) and (values.dropna() % 1 == 0).all():
            values = values.astype('Int64')
        key[col] = values.astype('string').fillna('').astype(object)
    return key


dfs = []

for f in all_files:
    # Semester label derived from the file name
    semester = _semester_from_path(f)

    # Read CSV, skip first title line
    df = pd.read_csv(f, sep=';', skiprows=1)

    # Remove BOM first, then strip whitespace, so whitespace following a BOM
    # is removed as well
    df.columns = df.columns.str.replace('\ufeff', '', regex=False).str.strip()

    # Row-wise hash of the normalised identifying columns (index excluded)
    df['listing_id'] = pd.util.hash_pandas_object(
        _hashable_id_frame(df, id_columns), index=False
    ).astype(str)

    # Add semester column
    df['semester'] = semester

    # Convert numeric columns to numeric type; unparseable values become NaN
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', '.', regex=False), errors='coerce')

    dfs.append(df)

Merging datasets

In [5]:
# Combine all data into one long-format DataFrame
if not dfs:
    # pd.concat would raise an opaque "No objects to concatenate" here
    raise ValueError("No CSV files were loaded; check the input folder before merging.")
long_df = pd.concat(dfs, ignore_index=True)

# Only the last expression of a cell is rendered, so a bare `long_df.head()`
# in the middle of the cell is silently discarded; display it explicitly.
display(long_df.head())

long_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7201300 entries, 0 to 7201299
Data columns (total 24 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Area_territoriale   object 
 1   Regione             object 
 2   Prov                object 
 3   Comune_ISTAT        float64
 4   Comune_cat          object 
 5   Sez                 object 
 6   Comune_amm          object 
 7   Comune_descrizione  object 
 8   Fascia              object 
 9   Zona                object 
 10  LinkZona            object 
 11  Cod_Tip             int64  
 12  Descr_Tipologia     object 
 13  Stato               object 
 14  Stato_prev          object 
 15  Compr_min           float64
 16  Compr_max           float64
 17  Sup_NL_compr        object 
 18  Loc_min             float64
 19  Loc_max             float64
 20  Sup_NL_loc          object 
 21  Unnamed: 21         float64
 22  listing_id          object 
 23  semester            object 
dtypes: float64(6), int64(1),

Translating column names and category values into English

In [6]:
# Italian source column name -> English column name
column_renames = dict(
    Area_territoriale="area",
    Regione="region",
    Prov="prov",
    Comune_ISTAT="mun_istat",
    Comune_cat="mun_cat",
    Comune_amm="mun_cad",
    Comune_descrizione="mun_name",
    Fascia="sector",
    Zona="zone",
    LinkZona="zone_link",
    Descr_Tipologia="type",
    Stato="condition",
    Compr_min="buy_min",
    Compr_max="buy_max",
    Loc_min="lease_min",
    Loc_max="lease_max",
)

long_df = long_df.rename(columns=column_renames)

# Per-column value translations, keyed by the (already renamed) column.
# Values without an entry are left untouched by .replace().
value_translations = {
    "area": {
        "NORD-OVEST": "NW",
        "NORD-EST": "NE",
        "CENTRO": "C",
        "ISOLE": "I",
        "SUD": "S",
    },
    "type": {
        "Abitazioni civili": "residential housing",
        "Box": "garage",
        "Ville e Villini": "independent houses and villas",
        "Negozi": "shops",
        "Abitazioni di tipo economico": "lowcost housing",
        "Magazzini": "warehouses",
        "Uffici": "offices",
        "Laboratori": "laboratories",
        "Capannoni tipici": "typical industrial buildings",
        "Capannoni industriali": "industrial buildings",
        "Autorimesse": "garages",
        "Posti auto scoperti": "uncovered parking spaces",
        "Posti auto coperti": "covered parking spaces",
        "Centri commerciali": "shopping centers",
        "Uffici strutturati": "structured offices",
        "Abitazioni tipiche dei luoghi": "typical local housing",
        "Abitazioni signorili": "luxury housing",
        "Pensioni e assimilati": "guesthouses and similar",
        "Fabbricati e locali per esercizi sportivi": "sports facilities",
    },
    "condition": {
        "NORMALE": "normal",
        "OTTIMO": "excellent",
        "SCADENTE": "poor",
    },
}

# Apply value mappings
for column_name, translation in value_translations.items():
    long_df[column_name] = long_df[column_name].replace(translation)

long_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7201300 entries, 0 to 7201299
Data columns (total 24 columns):
 #   Column        Dtype  
---  ------        -----  
 0   area          object 
 1   region        object 
 2   prov          object 
 3   mun_istat     float64
 4   mun_cat       object 
 5   Sez           object 
 6   mun_cad       object 
 7   mun_name      object 
 8   sector        object 
 9   zone          object 
 10  zone_link     object 
 11  Cod_Tip       int64  
 12  type          object 
 13  condition     object 
 14  Stato_prev    object 
 15  buy_min       float64
 16  buy_max       float64
 17  Sup_NL_compr  object 
 18  lease_min     float64
 19  lease_max     float64
 20  Sup_NL_loc    object 
 21  Unnamed: 21   float64
 22  listing_id    object 
 23  semester      object 
dtypes: float64(6), int64(1), object(17)
memory usage: 1.3+ GB


Dropping columns that are not needed

In [7]:
# Columns that are not needed downstream. 'Unnamed: N' is the name pandas
# assigns to a column whose header cell is empty, so 'Unnamed: 21' may not be
# present in every load.
columns = ['Sez', 'Stato_prev', 'Sup_NL_compr', 'Sup_NL_loc', 'Unnamed: 21']

# errors='ignore' skips columns that are already gone, so the cell can be
# re-run without raising KeyError.
long_df = long_df.drop(columns=columns, errors='ignore')

Ordering variables

In [12]:
# Desired column order. Any column not listed here is dropped by the
# selection below.
new_order = [
    "listing_id", "semester", "zone_link", "area", "region", "prov", "mun_istat", "mun_cat", "mun_cad",
    "mun_name", "sector", "zone", "type", "condition", "buy_min", "buy_max", "lease_min", "lease_max"
]

# 'area' is published as 'location'. Renaming first and selecting by the final
# names keeps the cell idempotent: on a re-run the rename is a no-op and the
# selection still succeeds, whereas selecting 'area' again would raise
# KeyError. This also avoids an in-place rename on a derived frame.
final_order = ['location' if name == 'area' else name for name in new_order]

long_df = long_df.rename(columns={'area': 'location'})[final_order]

long_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7200542 entries, 0 to 7201299
Data columns (total 18 columns):
 #   Column      Dtype  
---  ------      -----  
 0   listing_id  object 
 1   semester    object 
 2   zone_link   object 
 3   location    object 
 4   region      object 
 5   prov        object 
 6   mun_istat   float64
 7   mun_cat     object 
 8   mun_cad     object 
 9   mun_name    object 
 10  sector      object 
 11  zone        object 
 12  type        object 
 13  condition   object 
 14  buy_min     float64
 15  buy_max     float64
 16  lease_min   float64
 17  lease_max   float64
dtypes: float64(5), object(13)
memory usage: 1.0+ GB


Checking IDs

In [9]:
# A listing is expected to appear at most once per semester
dedup_key = ['listing_id', 'semester']

# Number of unique listings
unique_listing_count = long_df['listing_id'].nunique()
print("Number of unique listings:", unique_listing_count)

# Number of duplicate listings for the same semester
# (keep=False flags every row of a duplicated group, not only the extras)
duplicates = long_df.duplicated(subset=dedup_key, keep=False)
print("Number of duplicate listings for the same semester:", duplicates.sum())

# Delete duplicate listings for the same semester, keeping the first occurrence
is_repeat = long_df.duplicated(subset=dedup_key, keep='first')
long_df = long_df[~is_repeat]

print('Number of rows after removing duplicates:', len(long_df))

Number of unique listings: 522240
Number of duplicate listings for the same semester: 1430
Number of rows after removing duplicates: 7200542


In [None]:
# Destination of the cleaned long-format dataset
output_path = 'C:/Users/HP/Desktop/Traineeship/Code/datasets/italian_real_estate_data.csv'

# Comma-separated, without the index; the utf-8-sig codec prefixes the file
# with a UTF-8 byte-order mark.
long_df.to_csv(
    output_path,
    sep=',',
    index=False,
    encoding='utf-8-sig',
)