In [3]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# --- Project layout ---------------------------------------------------------
# Paths are relative: ROOT is the parent of the current working directory.
ROOT = Path('..')
DATA = ROOT.joinpath('data', 'listings.csv')
OUT = ROOT.joinpath('results')
# Make sure the output directory exists; an existing directory is not an error.
OUT.mkdir(parents=True, exist_ok=True)

# --- Global settings --------------------------------------------------------
RNG = 42       # seed shared by the train/val/test split and the iterative imputer
FIG_DPI = 300  # not referenced in this cell; presumably a figure resolution used elsewhere

# Load the raw listings table. low_memory=False makes pandas infer column
# dtypes from the whole file instead of chunk by chunk.
df = pd.read_csv(DATA, low_memory=False)

def clean_price(s: pd.Series) -> pd.Series:
    """Convert currency-like strings such as ``$1,234.00`` to floats.

    Dollar signs and thousands separators are stripped before conversion.
    Missing entries (``NaN``, ``None``, ``pd.NA``) stay ``NaN`` in the result.
    They are masked out *before* the string conversion, so they never turn
    into the unparseable texts ``'None'`` / ``'<NA>'``.

    Parameters
    ----------
    s : pd.Series
        Raw price column; may hold strings, numbers and missing values.

    Returns
    -------
    pd.Series
        Float series with the same index and name as ``s``.

    Raises
    ------
    ValueError
        If a non-missing entry cannot be parsed as a number after stripping
        ``$`` and ``,``.
    """
    present = s.notna()
    # float() already accepts a trailing ".00" and the literal text "nan",
    # so no further string normalisation is required.
    parsed = (s[present].astype(str)
                        .str.replace(r'[\$,]', '', regex=True)
                        .astype(float))

    # Fill by position so duplicate index labels cannot break alignment.
    values = np.full(len(s), np.nan, dtype=float)
    values[present.to_numpy()] = parsed.to_numpy()
    return pd.Series(values, index=s.index, name=s.name)

# Name of the column the model will predict.
target = 'price'

# Fail fast when the price column is absent; otherwise convert it to floats
# in place with clean_price.
if 'price' not in df.columns:
    raise ValueError("Column 'price' not found in listings.csv")
df['price'] = clean_price(df['price'])

# Candidate predictors, listed in the order they should appear in the table.
_candidate_cols = (
    'room_type',
    'neighbourhood_cleansed',
    'latitude', 'longitude',
    'minimum_nights',
    'number_of_reviews', 'reviews_per_month',
    'availability_365',
    'calculated_host_listings_count',
)
# Keep only the candidates that actually exist in the loaded file.
base_cols = [name for name in _candidate_cols if name in df.columns]

# Working frame: target first, then the predictors. It is copied so later
# edits do not touch df.
work = df[[target, *base_cols]].copy()

def haversine_km(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Return the great-circle distance between two points in kilometres.

    Uses the haversine formula on a sphere. All coordinate arguments are in
    decimal degrees and may be scalars, NumPy arrays or pandas Series; they
    are combined with ordinary NumPy broadcasting.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius_km : float, default 6371.0
        Sphere radius in kilometres (the default is the value originally
        hard-coded here).

    Returns
    -------
    float or array-like
        Distance(s) in kilometres, shaped like the broadcast inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Round-off can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt/arcsin return NaN. Clamp it first.
    a = np.clip(a, 0.0, 1.0)
    return radius_km * (2.0 * np.arcsin(np.sqrt(a)))

# Engineer a distance-to-centre feature when both coordinates are available.
if {'latitude', 'longitude'} <= set(work.columns):
    # Reference point for the distance (labelled "Boston center" below).
    lat0, lon0 = 42.3601, -71.0589
    work['dist_to_center_km'] = haversine_km(work['latitude'], work['longitude'], lat0, lon0)
    engineered = ["dist_to_center_km (Haversine distance to Boston center)"]
else:
    # Keep a placeholder so `work` has the same columns either way.
    work['dist_to_center_km'] = np.nan
    engineered = []

# Use the distance as a predictor only when it was actually computed.
# Testing `'dist_to_center_km' in work.columns` is always true because of the
# placeholder above. An all-NaN column carries no information, and
# IterativeImputer discards all-missing features by default, which would leave
# the hand-built feature-name list out of step with the transformed matrix.
use_cols = base_cols + (['dist_to_center_km'] if engineered else [])

# Record the frame size, drop rows that have no target value, record it again.
before_shape = tuple(work.shape)
work = work.dropna(subset=[target]).copy()
after_drop_target_only = tuple(work.shape)

X_all, y_all = work[use_cols], work[target]

# 60/20/20 split done on row positions: 40% is held out first, then that
# hold-out is halved into validation and test. Both calls share the same seed.
idx_all = np.arange(len(X_all))
idx_train, idx_temp, y_train, y_temp = train_test_split(
    idx_all, y_all.values, test_size=0.40, random_state=RNG
)
idx_val, idx_test, y_val, y_test = train_test_split(
    idx_temp, y_temp, test_size=0.50, random_state=RNG
)

# Materialise the three feature frames from the positional indices.
X_train, X_val, X_test = (
    X_all.iloc[rows] for rows in (idx_train, idx_val, idx_test)
)

# Partition the predictors by dtype: numeric columns versus everything else.
num_cols = list(X_all.select_dtypes(include=np.number).columns)
cat_cols = list(X_all.select_dtypes(exclude=np.number).columns)

# Summary of the selected columns and the row counts at each stage.
print(f'Use cols: {use_cols}')
print(f'Num: {len(num_cols)}  Cat: {len(cat_cols)}')
for label, shape in (
    ('Shape before (all rows):', before_shape),
    ('After drop target-only :', after_drop_target_only),
):
    print(label, shape)
print('Train/Val/Test (raw X):', X_train.shape, X_val.shape, X_test.shape)

# Numeric branch: model-based imputation followed by standardisation.
num_pipe = Pipeline([
    ('imputer', IterativeImputer(
        random_state=RNG,
        max_iter=10,
        sample_posterior=False,
        initial_strategy='mean'
    )),
    ('scaler', StandardScaler())
])

# Categorical branch: dense one-hot encoding. Categories not seen during fit
# are ignored at transform time instead of raising.
cat_pipe = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False
)

# Output column order is num -> num_miss -> cat; the feature-name
# reconstruction further down relies on this order.
pre = ColumnTransformer([
    ('num',      num_pipe,   num_cols),
    # Indicator columns are created only for features that have gaps in the
    # training split. error_on_new=False stops transform() from raising when
    # validation/test rows have gaps in a feature that was complete in
    # training; the imputer still fills those values, and the indicator
    # layout stays fixed to what was learned during fit.
    ('num_miss', MissingIndicator(features='missing-only', error_on_new=False), num_cols),
    ('cat',      cat_pipe,   cat_cols),
], remainder='drop')

pipe = Pipeline([('pre', pre)])

# Fit on the training split only, then apply the fitted transform to val/test.
X_train_t = pipe.fit_transform(X_train)
X_val_t   = pipe.transform(X_val)
X_test_t  = pipe.transform(X_test)

# Rebuild human-readable column names in the ColumnTransformer's output
# order: numeric features, missing-value indicators, one-hot levels.
fitted_pre = pipe.named_steps['pre']

feature_names: list[str] = list(num_cols)

miss_block = fitted_pre.named_transformers_['num_miss']
if hasattr(miss_block, 'features_'):
    # features_ holds the positions (within num_cols) that got an indicator.
    miss_names = [f'{num_cols[i]}_missing' for i in miss_block.features_]
else:
    # No fitted indicator available: fall back to one name per numeric column.
    miss_names = [f'{c}_missing' for c in num_cols]
feature_names.extend(miss_names)

if cat_cols:
    ohe = fitted_pre.named_transformers_['cat']
    for col, cats in zip(cat_cols, ohe.categories_):
        for level in cats:
            # A NaN category is reported under the fixed label "NaN".
            level_str = 'NaN' if (isinstance(level, float) and np.isnan(level)) else str(level)
            feature_names.append(f'{col}={level_str}')

# The names are assembled by hand, so check them against the matrix width.
# A silent mismatch (e.g. a transformer dropping an all-missing column) would
# mislabel every downstream feature.
if len(feature_names) != X_train_t.shape[1]:
    raise ValueError(
        f'Built {len(feature_names)} feature names but the transformed '
        f'matrix has {X_train_t.shape[1]} columns'
    )

print('After (features):', X_train_t.shape, X_val_t.shape, X_test_t.shape)

# Persist the transformed feature matrices and the matching targets as .npy.
_arrays_to_save = (
    ('X_train.npy', X_train_t),
    ('X_val.npy', X_val_t),
    ('X_test.npy', X_test_t),
    ('y_train.npy', y_train),
    ('y_val.npy', y_val),
    ('y_test.npy', y_test),
)
for file_name, array in _arrays_to_save:
    np.save(OUT / file_name, array)

# Store the column labels next to the matrices; non-ASCII text is written as is.
with (OUT / 'feature_names.json').open('w', encoding='utf-8') as f:
    json.dump(feature_names, f, ensure_ascii=False, indent=2)

# Free-text description of how missing values are handled in each branch.
_imputation_notes = {
    "numeric": "IterativeImputer(MICE, max_iter=10, initial_strategy='mean') + MissingIndicator(features='missing-only')",
    "categorical": "OneHotEncoder(handle_unknown='ignore'); NaN kept as its own level",
}

# Everything needed to interpret the saved arrays later: chosen columns,
# preprocessing summary, seed and shapes (tuples are written as JSON arrays).
meta = {
    "use_cols": use_cols,
    "engineered": engineered,
    "num_cols": num_cols,
    "cat_cols": cat_cols,
    "imputation": _imputation_notes,
    "scaling": "StandardScaler on numeric after imputation",
    "split_random_state": RNG,
    "before_shape": before_shape,
    "after_drop_target_only": after_drop_target_only,
    "train_shape": tuple(X_train_t.shape),
    "val_shape": tuple(X_val_t.shape),
    "test_shape": tuple(X_test_t.shape),
}

_meta_path = OUT / 'preprocess_meta.json'
with _meta_path.open('w') as f:
    json.dump(meta, f, indent=2)

# Final report: indicator columns created, one-hot widths, output location.
if miss_names:
    print('Numeric columns with missing (train):', miss_names)
if cat_cols:
    _fitted_ohe = pipe.named_steps['pre'].named_transformers_['cat']
    # Number of levels learned for each categorical column, in cat_cols order.
    ohe_sizes = list(map(len, _fitted_ohe.categories_))
    print('One-Hot sizes:', list(zip(cat_cols, ohe_sizes)))
print('Saved to:', OUT)

Use cols: ['room_type', 'neighbourhood_cleansed', 'latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'availability_365', 'calculated_host_listings_count', 'dist_to_center_km']
Num: 8  Cat: 2
Shape before (all rows): (3585, 11)
After drop target-only : (3585, 11)
Train/Val/Test (raw X): (2151, 10) (717, 10) (717, 10)
After (features): (2151, 37) (717, 37) (717, 37)
Numeric columns with missing (train): ['reviews_per_month_missing']
One-Hot sizes: [('room_type', 3), ('neighbourhood_cleansed', 25)]
Saved to: ..\results
