### Importing Libraries

# In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# In [None]:
def _as_text(values: pd.Series) -> pd.Series:
    """Return *values* as strings, with missing entries as '' instead of 'nan'."""
    return values.astype(str).where(values.notna(), '')


def _flag_keywords(text: pd.Series, keywords: list) -> pd.Series:
    """Return 1 where *text* contains any keyword as a whole word/phrase, else 0.

    Keywords are inserted into the pattern as-is, so they must be plain text
    without regex metacharacters. The surrounding word boundaries stop short
    keywords from firing inside longer words (e.g. 'spa' in 'spacious',
    'new' in 'renewed').
    """
    pattern = r'\b(?:' + '|'.join(kw.lower() for kw in keywords) + r')\b'
    return text.str.contains(pattern, case=False, regex=True).astype(int)


def preprocess_miami_mls(filepath: str, reference_year: int = 2025) -> pd.DataFrame:
    """Load an MLS CSV export and return a cleaned, feature-enriched DataFrame.

    Every step is applied only when the column it needs is present:

    * drop 'ResidentialLease' rows and rows whose ClosePrice is outside
      10,000 - 50,000,000 (inclusive);
    * drop unwanted PropertySubType values and collapse the rest into
      'PropertyCategory' (rows with an unmapped sub-type are dropped);
    * normalise ZIP to a 5-character digit string;
    * add 'IsAttached' and zero the lot size for the 'Condominium' category;
    * default GarageSpaces to 1 where GarageYN is True but the count is
      missing, then drop GarageYN;
    * replace YearBuilt with PropertyAge = reference_year - YearBuilt;
    * lower-case Description and derive HasPrivatePool, IsRemodeled,
      HasNewRoof and HasUpgradedKitchen keyword flags from it.

    Args:
        filepath: Path of the CSV file to read.
        reference_year: Year that PropertyAge is measured from.

    Returns:
        The processed DataFrame, or an empty DataFrame when *filepath* does
        not exist (a message is printed instead of raising).
    """
    print(f"Starting preprocessing for: {filepath}")
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return pd.DataFrame()
    print(f"Successfully loaded data. Initial shape: {df.shape}")

    # Initial cleaning
    if 'PropertyType' in df.columns:
        df = df[df['PropertyType'] != 'ResidentialLease']
    if 'ClosePrice' in df.columns:
        df = df[df['ClosePrice'].between(10_000, 50_000_000)]
    # Detach from the raw frame so the column assignments below never write
    # into a filtered view (avoids SettingWithCopyWarning).
    df = df.copy()

    # Group similarly named property sub-types into broad categories
    if 'PropertySubType' in df.columns:
        print("Consolidating PropertySubType categories...")

        condo_group = ['Condominium', 'Townhouse', 'Apartment', 'Villa', 'StockCooperative']
        sfr_group = ['SingleFamilyResidence', 'MobileHome']
        multi_family_group = ['MultiFamily', 'Duplex', 'Residential']
        remove_group = ['Timeshare', 'HotelMotel', 'BoatSlip', 'Other', 'Office', 'Industrial']

        # Remove rows with unwanted PropertySubType
        df = df[~df['PropertySubType'].isin(remove_group)].copy()

        mapping = {sub_type: 'Condominium' for sub_type in condo_group}
        mapping.update({sub_type: 'Single Family Residence' for sub_type in sfr_group})
        mapping.update({sub_type: 'Multi Family' for sub_type in multi_family_group})
        df['PropertyCategory'] = df['PropertySubType'].map(mapping)

        # Sub-types outside the mapping come back as NaN and are discarded
        df = df.dropna(subset=['PropertyCategory']).copy()

    # Clean and standardize the ZIP code column
    if 'ZIP' in df.columns:
        zip_text = _as_text(df['ZIP'])
        # Cut a float artefact ('33101.0') or ZIP+4 suffix ('33101-1234')
        # before stripping non-digits; otherwise those digits leak into the
        # code and a 4-digit ZIP cannot be left-padded correctly.
        zip_text = zip_text.str.replace(r'[.\-].*$', '', regex=True)
        df['ZIP'] = (
            zip_text.str.replace(r'[^0-9]', '', regex=True).str.zfill(5).str.slice(0, 5)
        )

    # Condo-specific logic
    if 'PropertyCategory' in df.columns:
        print("Applying logic for attached properties (condos, townhouses)...")
        is_condo = df['PropertyCategory'] == 'Condominium'
        df['IsAttached'] = np.where(is_condo, 1, 0)
        if 'PropertyLot_Square_footage' in df.columns:
            df.loc[is_condo, 'PropertyLot_Square_footage'] = 0

    # Handle missing values and create new features
    if 'GarageYN' in df.columns and 'GarageSpaces' in df.columns:
        garage_without_count = df['GarageYN'].eq(True) & df['GarageSpaces'].isnull()
        df.loc[garage_without_count, 'GarageSpaces'] = 1
        df = df.drop(columns=['GarageYN'])
    if 'YearBuilt' in df.columns:
        df['PropertyAge'] = reference_year - df['YearBuilt']
        df = df.drop(columns=['YearBuilt'])

    # Description-based feature extraction
    if 'Description' in df.columns:
        print("Extracting new features from description...")
        df['Description'] = _as_text(df['Description']).str.lower()
        pool_keywords = ['pool', 'swimming', 'poolside', 'Private Pool', 'in-ground pool', 'heated pool',
                         'pool area', 'pool deck', 'spa', 'jacuzzi', 'hot tub']
        remodel_keywords = ['remodeled', 'renovated', 'updated', 'newly done', 'fully upgraded', 'modernized',
                            'recently renovated', 'new finishes', 'newly remodeled',
                            'newly renovated', 'New', 'New Construction', 'Newly Built']
        roof_keywords = ['new roof', 'roof replaced', 'recent roof']
        kitchen_keywords = ['granite', 'quartz', 'stainless steel', 'new kitchen', 'updated kitchen',
                            'gourmet kitchen', 'Chef\'s kitchen', 'modern kitchen', 'luxury kitchen',
                            'kitchen remodel', 'kitchen renovation', 'kitchen upgrade']
        df['HasPrivatePool'] = _flag_keywords(df['Description'], pool_keywords)
        df['IsRemodeled'] = _flag_keywords(df['Description'], remodel_keywords)
        df['HasNewRoof'] = _flag_keywords(df['Description'], roof_keywords)
        df['HasUpgradedKitchen'] = _flag_keywords(df['Description'], kitchen_keywords)

    print(f"Preprocessing complete. Final data shape: {df.shape}")
    return df

def _with_ratio_features(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *frame* with BathBedRatio / HouseLotRatio appended.

    A ratio is skipped when either of its source columns is absent. Division
    by zero and missing values both yield 0.
    """
    ratio_sources = {
        'BathBedRatio': ('Baths', 'Beds'),
        'HouseLotRatio': ('SquareFootage', 'PropertyLot_Square_footage'),
    }
    ratios = {}
    for ratio_name, (numerator, denominator) in ratio_sources.items():
        if numerator in frame.columns and denominator in frame.columns:
            quotient = frame[numerator] / frame[denominator]
            ratios[ratio_name] = quotient.replace([np.inf, -np.inf], 0).fillna(0)
    # assign() builds a new frame, so the caller's object is left untouched.
    return frame.assign(**ratios)


def engineer_features(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame):
    """Add ratio, ZIP-level and City-level features to the train and test frames.

    All group statistics are computed from the training rows only and then
    looked up for both frames. Test rows whose ZIP / City was not seen in
    training receive the mean of the corresponding lookup column.

    Args:
        X_train: Training features.
        y_train: Training target, sharing its index with ``X_train``.
        X_test: Test features.

    Returns:
        ``(X_train, X_test, zip_stats, city_stats)``. The two feature frames
        are new objects that keep the row index of the inputs. ``zip_stats``
        and ``city_stats`` are the lookup tables indexed by ZIP / City, or
        ``None`` when the key column (or every source column) is missing.
    """
    print("\nStarting Advanced Feature Engineering...")
    X_train = _with_ratio_features(X_train)
    X_test = _with_ratio_features(X_test)

    # ZIP features
    zip_stats = None
    if 'ZIP' in X_train.columns:
        zip_sources = [col for col in ('SquareFootage', 'PropertyAge') if col in X_train.columns]
        if zip_sources:
            zip_stats = X_train.groupby('ZIP')[zip_sources].agg(['mean', 'median'])
            zip_stats.columns = ['ZIP_' + '_'.join(col).strip() for col in zip_stats.columns.values]
            # join(on=...) looks values up against the stats index and, unlike
            # merge(on=...), keeps the left frame's row index so X stays
            # label-aligned with y.
            X_train = X_train.join(zip_stats, on='ZIP')
            X_test = X_test.join(zip_stats, on='ZIP')
            zip_columns = list(zip_stats.columns)
            X_test[zip_columns] = X_test[zip_columns].fillna(zip_stats.mean())
            print("Created neighborhood features for ZIP code.")

    # City features
    city_stats = None
    if 'City' in X_train.columns:
        by_city = X_train.groupby('City')
        city_columns = {}
        median_sqft = None
        if 'SquareFootage' in X_train.columns:
            median_sqft = by_city['SquareFootage'].median()
            city_columns['MedianSqFt_by_City'] = median_sqft
        if 'Beds' in X_train.columns:
            city_columns['MedianBeds_by_City'] = by_city['Beds'].median()
        if median_sqft is not None:
            median_price = y_train.groupby(X_train['City']).median()
            # A city whose median square footage is 0 would give +/-inf here;
            # store NaN instead so it cannot turn the fallback mean into inf.
            city_columns['PricePerSqFt_by_City'] = (
                (median_price / median_sqft).replace([np.inf, -np.inf], np.nan)
            )
        if city_columns:
            city_stats = pd.DataFrame(city_columns)
            city_stats.index.name = 'City'
            X_train = X_train.join(city_stats, on='City')
            X_test = X_test.join(city_stats, on='City')
            stat_columns = list(city_stats.columns)
            X_test[stat_columns] = X_test[stat_columns].fillna(city_stats.mean())
            print("Created market features for City.")

    print("Feature Engineering complete.")
    return X_train, X_test, zip_stats, city_stats

# Load and preprocess the raw MLS export.
df = preprocess_miami_mls('miami_mls4.csv')

if not df.empty:
    # Feature selection and train/test split. Only the listed features that
    # actually exist in the preprocessed frame are used.
    features = [
        'Beds', 'Baths', 'HalfBaths', 'SquareFootage', 'PropertyLot_Square_footage',
        'GarageSpaces', 'PropertyAge', 'IsLuxury', 'IsRemodeled',
        'HasPrivatePool', 'City', 'PropertyCategory', 'ZIP',  # Using new PropertyCategory
        'IsAttached', 'HasNewRoof', 'HasUpgradedKitchen',  # Using new IsAttached
    ]
    target = 'ClosePrice'
    available_features = [f for f in features if f in df.columns]
    X = df[available_features]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature engineering (group statistics are derived from the training split)
    X_train, X_test, zip_stats_to_save, city_stats_to_save = engineer_features(
        X_train.copy(), y_train, X_test.copy()
    )

    # Handle categorical variables: ZIP and City were only needed as lookup
    # keys for the engineered features, so they are dropped before modelling.
    for col in ['ZIP', 'City']:
        if col in X_train.columns:
            X_train = X_train.drop(columns=[col])
        if col in X_test.columns:
            X_test = X_test.drop(columns=[col])

    if 'PropertyCategory' in X_train.columns:
        # Pin the category set (and therefore the dropped baseline level) to
        # what the training data contains. Encoding each frame independently
        # with drop_first=True can drop a different level in the test frame
        # when a category is absent there.
        category_dtype = pd.CategoricalDtype(sorted(X_train['PropertyCategory'].dropna().unique()))
        X_train = X_train.assign(PropertyCategory=X_train['PropertyCategory'].astype(category_dtype))
        X_test = X_test.assign(PropertyCategory=X_test['PropertyCategory'].astype(category_dtype))
        X_train = pd.get_dummies(X_train, columns=['PropertyCategory'], drop_first=True)
        X_test = pd.get_dummies(X_test, columns=['PropertyCategory'], drop_first=True)
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    # Log-transform the target
    y_train_log = np.log1p(y_train)

    # Hyperparameter tuning
    print("\n--- Starting Hyperparameter Tuning with RandomizedSearchCV ---")
    param_grid = {
        'n_estimators': [500, 1000, 1500], 'learning_rate': [0.02, 0.05, 0.1],
        'max_depth': [5, 7, 9], 'subsample': [0.7, 0.8], 'colsample_bytree': [0.7, 0.8],
    }
    xgb_tuner = xgb.XGBRegressor(objective='reg:quantileerror', quantile_alpha=0.5, random_state=42, n_jobs=-1)
    random_search = RandomizedSearchCV(
        estimator=xgb_tuner, param_distributions=param_grid, n_iter=6,
        scoring='neg_mean_absolute_error', cv=3, verbose=1, random_state=42, n_jobs=-1
    )
    random_search.fit(X_train, y_train_log)
    best_params = random_search.best_params_
    print(f"Best parameters found: {best_params}")

    # Early stopping needs an evaluation set. Carve it out of the training
    # data so the held-out test set is never used to choose the number of
    # boosting rounds and the metrics reported below stay unbiased.
    X_fit, X_val, y_fit_log, y_val_log = train_test_split(
        X_train, y_train_log, test_size=0.1, random_state=42
    )

    # Train final models for the low / mid / high quantiles
    quantiles = {'low': 0.10, 'mid': 0.50, 'high': 0.90}
    models = {}
    for name, q in quantiles.items():
        print(f"Training final XGBoost model for {name} quantile ({q:.2f})...")
        model = xgb.XGBRegressor(
            objective='reg:quantileerror', quantile_alpha=q,
            random_state=42, n_jobs=-1, early_stopping_rounds=50, **best_params
        )
        model.fit(X_fit, y_fit_log, eval_set=[(X_val, y_val_log)], verbose=False)
        models[name] = model
        print(f"{name.capitalize()} model training complete. ✅")

    # Predict and evaluate on the untouched test set (back on the price scale)
    y_pred_median = np.expm1(models['mid'].predict(X_test))
    mae = mean_absolute_error(y_test, y_pred_median)
    r2 = r2_score(y_test, y_pred_median)
    print("\n--- Model Evaluation (based on Median Prediction) ---")
    print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
    print(f"R-squared ($R^2$): {r2:.4f}")

    # Save artifacts
    print("\n--- Saving all necessary artifacts... ---")
    for name, model in models.items():
        model.save_model(f"xgb_model_{name}.json")
        print(f"Saved model to xgb_model_{name}.json")

    if zip_stats_to_save is not None:
        joblib.dump(zip_stats_to_save, 'zip_stats.joblib')
        print("Saved zip_stats lookup data to zip_stats.joblib")
    if city_stats_to_save is not None:
        joblib.dump(city_stats_to_save, 'city_stats.joblib')
        print("Saved city_stats lookup data to city_stats.joblib")

    # Column order the models were trained on
    model_columns = X_train.columns
    joblib.dump(model_columns, 'model_columns.joblib')
    print("Saved model columns to model_columns.joblib")
    print("\nArtifacts saved successfully. You are ready to run the app.")

else:
    print("\nCould not run model training because the DataFrame is empty.")