### Setup and imports

In [4]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

In [5]:
random.seed(42)
np.random.seed(42)

### Base data

In [14]:
# Example values
locations = ['Milan', 'Rome', 'Naples', 'Florence', 'Turin', 'Bologna', 'Palermo', 'Genoa']
energy_classes = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

N_ROWS = 150
ASSET_TYPE = "property"
TODAY = datetime.utcnow()

### Support functions

In [15]:
def simulate_condition_score(humidity, temperature, energy_class):
    # Base score
    score = 0.85
    # Penalizza umidità alta
    if humidity > 65: score -= 0.15
    elif humidity > 55: score -= 0.05
    # Penalizza temperature estreme
    if temperature < 14 or temperature > 24: score -= 0.07
    # Classe energetica influisce
    class_penalty = {'A': +0.03, 'B': +0.02, 'C': 0.0, 'D': -0.02, 'E': -0.04, 'F': -0.06, 'G': -0.10}
    score += class_penalty.get(energy_class, 0.0)
    # Clip tra 0 e 1
    return round(max(0.0, min(1.0, score + np.random.normal(0, 0.02))), 3)

def random_recent_timestamp(days_back=60):
    delta = random.randint(0, days_back)
    dt = TODAY - timedelta(days=delta, hours=random.randint(0,23), minutes=random.randint(0,59))
    return dt.isoformat(timespec='seconds') + "Z"

### Generate data rows function

In [16]:
def generate_property(index):
    size = np.random.randint(40, 200)
    rooms = np.random.randint(2, 7)
    bathrooms = np.random.randint(1, 4)
    year_built = np.random.randint(1950, 2023)
    floor = np.random.randint(0, 5)
    building_floors = np.random.randint(floor + 1, 10)

    has_elevator = int(building_floors >= 4)
    has_garden = int(random.random() < 0.3)
    has_balcony = int(random.random() < 0.6)
    garage = int(random.random() < 0.5)

    energy_class = random.choice(energy_classes)
    humidity = round(np.random.uniform(30, 70), 1)
    temperature = round(np.random.uniform(12, 25), 1)
    noise = np.random.randint(20, 80)
    aqi = np.random.randint(30, 150)
    location = random.choice(locations)

    current_year = datetime.utcnow().year
    if "year_build" in df.columns and "year_built" not in df.columns:
        df = df.rename(columns={"year_build": "year_built"})

    if "age_years" not in df.columns:
        df["age_years"] = current_year - df["year_built"]

    age_years = current_year - year_built

    # Valore base in migliaia (eur/m² * m² / 1000)
    base_price = size * np.random.uniform(1200, 3500)  # euro
    if energy_class in ['A', 'B']:
        base_price *= 1.05
    if has_garden:
        base_price += 10000
    if has_balcony:
        base_price += 5000
    if garage:
        base_price += 7000

    valuation_k = round(base_price / 1000, 2)

    condition_score = simulate_condition_score(humidity, temperature, energy_class)
    # risk_score = 1 - condition (con rumore piccolo)
    risk_score = round(max(0.0, min(1.0, (1 - condition_score) + np.random.normal(0, 0.02))), 3)

    return {
        "asset_id": f"asset_{index:04}",
        "asset_type": ASSET_TYPE,
        "location": location,
        "size_m2": size,
        "rooms": rooms,
        "bathrooms": bathrooms,
        "year_built": year_built,
        "age_years": age_years,
        "floor": floor,
        "building_floors": building_floors,
        "has_elevator": has_elevator,
        "has_garden": has_garden,
        "has_balcony": has_balcony,
        "garage": garage,
        "energy_class": energy_class,
        "humidity_level": humidity,
        "temperature_avg": temperature,
        "noise_level": noise,
        "air_quality_index": aqi,
        "valuation_k": valuation_k,
        "condition_score": condition_score,
        "risk_score": risk_score,
        "last_verified_ts": random_recent_timestamp()
    }

### Generate DataFrame

In [17]:
preferred_order = [
    "asset_id","asset_type","location","size_m2","rooms","bathrooms",
    "year_built","age_years","floor","building_floors",
    "has_elevator","has_garden","has_balcony","garage",
    "energy_class","humidity_level","temperature_avg",
    "noise_level","air_quality_index","valuation_k",
    "condition_score","risk_score","last_verified_ts"
]

df = df[[c for c in preferred_order if c in df.columns]]
data = [generate_property(i) for i in range(N_ROWS)]
df = pd.DataFrame(data)
df.head()

Unnamed: 0,asset_id,asset_type,location,size_m2,rooms,bathrooms,year_built,floor,building_floors,has_elevator,...,garage,energy_class,humidity_level,temperature_avg,noise_level,air_quality_index,valuation_k,condition_score,risk_score,last_verified_ts
0,asset_0000,property,Palermo,170,6,2,1979,1,9,1,...,0,A,69.7,20.0,77,51,217.1,0.73,0.261,2025-07-17T15:32:04Z
1,asset_0001,property,Palermo,54,4,3,2013,0,3,0,...,1,B,64.4,20.8,28,68,91.77,0.822,0.191,2025-07-05T01:00:04Z
2,asset_0002,property,Palermo,48,3,1,1951,3,7,1,...,0,B,47.6,13.6,27,76,90.58,0.776,0.216,2025-06-28T07:28:04Z
3,asset_0003,property,Rome,171,3,2,1955,1,5,1,...,1,D,37.4,24.6,45,73,591.7,0.764,0.254,2025-06-27T03:59:04Z
4,asset_0004,property,Palermo,87,5,1,2011,4,9,1,...,1,A,40.9,22.8,45,118,190.65,0.857,0.146,2025-07-13T22:19:04Z


### Export CSV

In [18]:
output_path = "../data/property_dataset_mvp_multi_rwa.csv"
df.to_csv(output_path, index=False)
print(f"Dataset saved to: {output_path}")
df.describe(include='all').T.head(20)

Dataset saved to: ../data/property_dataset_mvp_multi_rwa.csv


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
asset_id,150.0,150.0,asset_0000,1.0,,,,,,,
asset_type,150.0,1.0,property,150.0,,,,,,,
location,150.0,8.0,Florence,30.0,,,,,,,
size_m2,150.0,,,,121.5,47.655186,40.0,78.25,120.5,165.25,199.0
rooms,150.0,,,,3.973333,1.446805,2.0,3.0,4.0,5.0,6.0
bathrooms,150.0,,,,1.9,0.841459,1.0,1.0,2.0,3.0,3.0
year_built,150.0,,,,1983.86,21.564024,1950.0,1965.0,1983.5,2002.75,2021.0
floor,150.0,,,,1.806667,1.422243,0.0,0.25,2.0,3.0,4.0
building_floors,150.0,,,,5.986667,2.192074,1.0,5.0,6.0,8.0,9.0
has_elevator,150.0,,,,0.84,0.367834,0.0,1.0,1.0,1.0,1.0
