### Setup and imports

In [10]:
import random
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd

In [18]:
# Seed both pseudo-random streams used in this notebook (NumPy's legacy global
# state and the stdlib `random` module) so repeated runs draw the same values.
np.random.seed(42)
random.seed(42)

# Centralised configuration
CONFIG = dict(
    PRICE_PER_SQM_MIN=1200,   # lower bound of the per-square-metre price
    PRICE_PER_SQM_MAX=3500,   # upper bound of the per-square-metre price
    GARDEN_BONUS=10000,       # flat amount associated with a garden
    BALCONY_BONUS=5000,       # flat amount associated with a balcony
    GARAGE_BONUS=7000,        # flat amount associated with a garage
    ELEVATOR_MIN_FLOORS=4,    # floor-count threshold associated with an elevator
)

### Base data

In [12]:
# Number of synthetic records to generate.
N_ROWS = 150
# Asset type stamped on every generated record.
ASSET_TYPE = "property"
# Reference "now" as a *naive* UTC datetime. `datetime.utcnow()` is deprecated
# since Python 3.12; taking an aware UTC "now" and dropping tzinfo yields the
# same naive value, so `isoformat() + "Z"` downstream keeps its exact shape.
TODAY = datetime.now(timezone.utc).replace(tzinfo=None)

# Static vocabularies
LOCATIONS = ['Milan', 'Rome', 'Naples', 'Florence', 'Turin', 'Bologna', 'Palermo', 'Genoa']
ENERGY_CLASSES = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

### Support functions

In [13]:
def simulate_condition_score(humidity: float, temperature: float, energy_class: str) -> float:
    """
    Derive a synthetic condition score, clamped to [0, 1] and rounded to 3 decimals.

    Starting from a baseline of 0.85 the score is lowered for high humidity,
    lowered for temperatures below 14 or above 24, shifted by a per-energy-class
    offset (unknown classes shift by 0.0) and finally perturbed with Gaussian
    noise drawn from NumPy's global random state (mean 0, std 0.02).
    """
    # Humidity: above 65 costs 0.15, above 55 costs 0.05, otherwise nothing.
    if humidity > 65:
        humidity_penalty = 0.15
    elif humidity > 55:
        humidity_penalty = 0.05
    else:
        humidity_penalty = 0.0

    # Temperature: a flat penalty outside the mild comfort band.
    outside_comfort_band = temperature < 14 or temperature > 24
    temperature_penalty = 0.07 if outside_comfort_band else 0.0

    # Energy class: better classes add to the score, worse ones subtract.
    offset_by_class = {
        'A': +0.03,
        'B': +0.02,
        'C': 0.00,
        'D': -0.02,
        'E': -0.04,
        'F': -0.06,
        'G': -0.10,
    }
    class_offset = offset_by_class.get(energy_class, 0.0)

    # Small random jitter so identical inputs do not yield identical scores.
    jitter = np.random.normal(0, 0.02)

    raw_score = 0.85 - humidity_penalty
    raw_score -= temperature_penalty
    raw_score += class_offset
    raw_score += jitter

    clamped = min(1.0, max(0.0, raw_score))
    return round(clamped, 3)

def random_recent_timestamp(days_back: int = 60) -> str:
    """
    Return a random past timestamp as an ISO string with a trailing "Z".

    The timestamp is the module-level TODAY minus a random offset made of a
    whole number of days in [0, days_back], an hour count in [0, 23] and a
    minute count in [0, 59], all drawn from the stdlib `random` module.
    The result is formatted with second precision.
    """
    days_ago = random.randint(0, days_back)
    hours_ago = random.randint(0, 23)
    minutes_ago = random.randint(0, 59)
    offset = timedelta(days=days_ago, hours=hours_ago, minutes=minutes_ago)
    stamp = TODAY - offset
    return f"{stamp.isoformat(timespec='seconds')}Z"

### Generate data rows function

In [14]:
def generate_property(index: int) -> dict:
    """
    Build one synthetic property record.

    Pricing parameters and the elevator threshold are read from the
    module-level CONFIG instead of being repeated as literals, and the
    building age is computed against the module-level TODAY so it shares the
    same reference date as `last_verified_ts`. Integer fields are converted to
    plain Python ints so every record uses the same scalar types.

    The sequence of random draws (NumPy global state and stdlib `random`) is
    kept in a fixed order so seeded runs stay reproducible.

    Args:
        index: Sequential number used to build the zero-padded `asset_id`.

    Returns:
        A dict with one entry per dataset column (asset_id, asset_type,
        location, physical attributes, environmental readings, valuation_k,
        condition_score, risk_score, last_verified_ts).
    """
    # Physical attributes. np.random.randint excludes its upper bound.
    size_m2 = int(np.random.randint(40, 200))
    rooms = int(np.random.randint(2, 7))
    bathrooms = int(np.random.randint(1, 4))
    year_built = int(np.random.randint(1950, 2023))

    floor = int(np.random.randint(0, 5))
    building_floors = int(np.random.randint(floor + 1, 10))  # ensure floor < building_floors

    # Amenity flags stored as 0/1 integers.
    has_elevator = int(building_floors >= CONFIG['ELEVATOR_MIN_FLOORS'])
    has_garden = int(random.random() < 0.30)
    has_balcony = int(random.random() < 0.60)
    garage = int(random.random() < 0.50)

    # Environmental readings and categorical attributes.
    energy_class = random.choice(ENERGY_CLASSES)
    humidity = round(np.random.uniform(30, 70), 1)
    temperature = round(np.random.uniform(12, 25), 1)
    noise_level = int(np.random.randint(20, 80))
    air_quality_index = int(np.random.randint(30, 150))
    location = random.choice(LOCATIONS)

    # Age relative to the notebook's reference date.
    age_years = TODAY.year - year_built

    # Synthetic valuation, computed in EUR and converted to thousands below.
    price_per_sqm = np.random.uniform(CONFIG['PRICE_PER_SQM_MIN'], CONFIG['PRICE_PER_SQM_MAX'])
    base_price_eur = size_m2 * price_per_sqm
    if energy_class in ['A', 'B']:
        base_price_eur *= 1.05  # premium for the two best energy classes
    if has_garden:
        base_price_eur += CONFIG['GARDEN_BONUS']
    if has_balcony:
        base_price_eur += CONFIG['BALCONY_BONUS']
    if garage:
        base_price_eur += CONFIG['GARAGE_BONUS']

    valuation_k = float(round(base_price_eur / 1000, 2))

    # Risk is the complement of the condition score plus a little noise,
    # clamped to [0, 1].
    condition_score = simulate_condition_score(humidity, temperature, energy_class)
    risk_score = round(min(1.0, max(0.0, (1 - condition_score) + np.random.normal(0, 0.02))), 3)

    return {
        "asset_id": f"asset_{index:04}",
        "asset_type": ASSET_TYPE,
        "location": location,
        "size_m2": size_m2,
        "rooms": rooms,
        "bathrooms": bathrooms,
        "year_built": year_built,
        "age_years": age_years,
        "floor": floor,
        "building_floors": building_floors,
        "has_elevator": has_elevator,
        "has_garden": has_garden,
        "has_balcony": has_balcony,
        "garage": garage,
        "energy_class": energy_class,
        "humidity_level": humidity,
        "temperature_avg": temperature,
        "noise_level": noise_level,
        "air_quality_index": air_quality_index,
        "valuation_k": valuation_k,
        "condition_score": condition_score,
        "risk_score": risk_score,
        "last_verified_ts": random_recent_timestamp()
    }

In [24]:
# Multi-RWA asset factory
def generate_asset(asset_type, index):
    """
    Dispatch record generation by asset type.

    Only "property" is handled for now; any other value raises ValueError.
    """
    if asset_type != "property":
        # Future support:
        # "art" -> generate_art(index)
        raise ValueError(f"Unsupported asset_type: {asset_type}")
    return generate_property(index)

### Data Validation

In [19]:
# Single-record validation
def validate_property(prop_data):
    """
    Check the invariants of one generated property record.

    Args:
        prop_data: Mapping with at least the keys 'floor', 'building_floors',
            'condition_score', 'risk_score', 'valuation_k' and 'size_m2'.

    Returns:
        The same `prop_data` object, so the call can wrap a generator call.

    Raises:
        AssertionError: If any invariant is violated.
        KeyError: If `prop_data` is a dict and an inspected key is missing.
    """
    floor = prop_data['floor']
    building_floors = prop_data['building_floors']
    # The generator draws floor from 0 upward and building_floors >= floor + 1,
    # so the unit must sit strictly below the floor count; accepting equality
    # would let a record reference a floor the building does not have.
    assert floor < building_floors, (
        f"floor ({floor}) must be < building_floors ({building_floors})"
    )
    assert 0 <= prop_data['condition_score'] <= 1, "condition_score must be in [0, 1]"
    assert 0 <= prop_data['risk_score'] <= 1, "risk_score must be in [0, 1]"
    assert prop_data['valuation_k'] > 0, "valuation_k must be positive"
    assert prop_data['size_m2'] > 0, "size_m2 must be positive"
    return prop_data

### Generate DataFrame

In [25]:
# Generate and validate one record per row.
data = [validate_property(generate_asset(ASSET_TYPE, i)) for i in range(N_ROWS)]

# Materialise the records as a DataFrame. Without this assignment `df` is
# undefined on a fresh kernel and the schema check below raises NameError.
df = pd.DataFrame(data)

# DataFrame schema validation
REQUIRED_FIELDS = [
    'asset_id', 'asset_type', 'location', 'size_m2', 'rooms', 
    'bathrooms', 'year_built', 'age_years', 'floor', 'building_floors',
    'has_elevator', 'has_garden', 'has_balcony', 'garage', 'energy_class',
    'humidity_level', 'temperature_avg', 'noise_level', 'air_quality_index',
    'valuation_k', 'condition_score', 'risk_score', 'last_verified_ts'
]

def validate_schema(df):
    """
    Assert that `df` has every column named in the module-level REQUIRED_FIELDS.

    Raises AssertionError listing the absent names when any are missing;
    otherwise prints a confirmation line with the number of required fields.
    """
    absent = set(REQUIRED_FIELDS).difference(df.columns)
    assert len(absent) == 0, f"Missing required fields: {absent}"
    n_required = len(REQUIRED_FIELDS)
    print(f"✅ Schema validation passed - all {n_required} fields present")

# Fail fast if any required column is missing.
validate_schema(df)

# Reorder columns to the canonical schema order.
preferred_order = REQUIRED_FIELDS
df = df.loc[:, preferred_order]

df.head()

✅ Schema validation passed - all 23 fields present


Unnamed: 0,asset_id,asset_type,location,size_m2,rooms,bathrooms,year_built,age_years,floor,building_floors,...,garage,energy_class,humidity_level,temperature_avg,noise_level,air_quality_index,valuation_k,condition_score,risk_score,last_verified_ts
0,asset_0000,property,Naples,142,5,1,1964,61,2,7,...,1,B,53.9,17.8,42,104,348.41,0.852,0.14,2025-06-04T14:31:13Z
1,asset_0001,property,Milan,170,6,2,1979,46,1,9,...,0,A,69.7,20.0,77,51,222.1,0.73,0.261,2025-07-16T12:00:13Z
2,asset_0002,property,Palermo,54,4,3,2013,12,0,3,...,1,F,64.4,20.8,28,68,78.45,0.742,0.271,2025-07-07T03:37:13Z
3,asset_0003,property,Palermo,48,3,1,1951,74,3,7,...,0,B,47.6,13.6,27,76,90.58,0.776,0.216,2025-06-30T10:05:13Z
4,asset_0004,property,Rome,171,3,2,1955,70,1,5,...,1,D,37.4,24.6,45,73,591.7,0.764,0.254,2025-06-29T06:36:13Z


### Export CSV

In [30]:
# Write the dataset without the index column and report its dimensions.
out_path = "../data/property_dataset_v1.csv"
df.to_csv(out_path, index=False)
print("Saved:", out_path, "rows:", df.shape[0], "cols:", df.shape[1])

# Summary statistics, one row per column (first 20 columns shown).
df.describe(include='all').transpose().head(20)

Saved: ../data/property_dataset_v1.csv rows: 150 cols: 23


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
asset_id,150.0,150.0,asset_0000,1.0,,,,,,,
asset_type,150.0,1.0,property,150.0,,,,,,,
location,150.0,8.0,Florence,30.0,,,,,,,
size_m2,150.0,,,,121.746667,47.664964,40.0,78.25,122.0,165.25,199.0
rooms,150.0,,,,3.993333,1.440063,2.0,3.0,4.0,5.0,6.0
bathrooms,150.0,,,,1.9,0.841459,1.0,1.0,2.0,3.0,3.0
year_built,150.0,,,,1983.946667,21.456929,1950.0,1965.0,1983.5,2002.75,2021.0
age_years,150.0,,,,41.053333,21.456929,4.0,22.25,41.5,60.0,75.0
floor,150.0,,,,1.806667,1.422243,0.0,0.25,2.0,3.0,4.0
building_floors,150.0,,,,6.013333,2.179793,1.0,5.0,6.0,8.0,9.0
