In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import os

# --- Configuration ---
DATA_FILE = 'pune_parking_hyper_realistic.csv' 
MODEL_FILE = 'parking_model_v2.json'
MODEL_DATA_FILE = 'parking_model_data_v2.joblib' 

def preprocess_and_train():
    
    # --- Step 1: Load Data ---
    print(f"Loading data from {DATA_FILE}...")
    df = pd.read_csv(DATA_FILE)

    # --- Step 2: Feature Engineering & Preprocessing ---
    print("Engineering features (cyclical time, NaNs)...")
    
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24.0)
    df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7.0)
    df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7.0)

    feature_cols = [
        'slot_type', 'weather', 'event_type',
        'poi_office_count', 'poi_restaurant_count', 'poi_store_count',
        'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos'
    ]
    
    target_col = 'is_occupied'
    X = df[feature_cols].copy() 
    y = df[target_col]

    print("Encoding categorical features...")
    categorical_cols = ['slot_type', 'weather', 'event_type']
    encoders = {} 
    
    for col in categorical_cols:
        X[col] = X[col].fillna('missing')
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        encoders[col] = le

    numerical_cols = ['poi_office_count', 'poi_restaurant_count', 'poi_store_count']
    for col in numerical_cols:
        X[col] = X[col].fillna(0) 

    # --- Step 3: Save Encoders and Column List ---
    model_data = {
        'encoders': encoders,
        'model_columns': X.columns.tolist()
    }
    joblib.dump(model_data, MODEL_DATA_FILE)
    print(f"Model data (encoders, columns) saved to {MODEL_DATA_FILE}")

    # --- Step 4: Split Data ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # --- Step 5: Train the XGBoost Model (Tuned) ---
    print("Training XGBoost model...")
    
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=8,
        early_stopping_rounds=20,
        enable_categorical=True,
        # --- ðŸš¨ FIX 1: More aggressive re-balancing ---
        scale_pos_weight=2.0 
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )
    
    print("Model training complete.")

    # --- Step 6: Evaluate the Model ---
    print("\n--- Model Evaluation ---")
    y_pred = model.predict(X_test)
    
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Free (0)', 'Occupied (1)']))

    # --- Step 7: Save the Trained Model ---
    model.save_model(MODEL_FILE)
    print(f"\nModel saved to {MODEL_FILE}")


def predict_availability(sample_data):
    """Loads the saved model and data to predict availability."""
    print("\n--- New Prediction ---")
    
    if not os.path.exists(MODEL_FILE) or not os.path.exists(MODEL_DATA_FILE):
        print("Error: Model or model data not found. Please run the training first.")
        return

    model = xgb.XGBClassifier()
    model.load_model(MODEL_FILE)
    model_data = joblib.load(MODEL_DATA_FILE)
    encoders = model_data['encoders']
    model_columns = model_data['model_columns']
    
    print(f"Predicting for: {sample_data}")

    df_sample = pd.DataFrame([sample_data])

    if 'hour' in df_sample.columns:
        df_sample['hour_sin'] = np.sin(2 * np.pi * df_sample['hour'] / 24.0)
        df_sample['hour_cos'] = np.cos(2 * np.pi * df_sample['hour'] / 24.0)
    if 'weekday' in df_sample.columns:
        df_sample['weekday_sin'] = np.sin(2 * np.pi * df_sample['weekday'] / 7.0)
        df_sample['weekday_cos'] = np.cos(2 * np.pi * df_sample['weekday'] / 7.0)

    try:
        for col, le in encoders.items():
            if col in df_sample.columns:
                df_sample[col] = df_sample[col].fillna('missing')
                df_sample[col] = le.transform(df_sample[col])
    except Exception as e:
        print(f"Error transforming categorical data: {e} (Maybe an unknown label?)")
        return

    numerical_cols = ['poi_office_count', 'poi_restaurant_count', 'poi_store_count']
    for col in numerical_cols:
        if col in df_sample.columns:
            df_sample[col] = df_sample[col].fillna(0)
            
    df_pred = df_sample.reindex(columns=model_columns, fill_value=0)

    probabilities = model.predict_proba(df_pred)
    
    prob_free = probabilities[0][0]
    prob_occupied = probabilities[0][1]
    
    print(f"  -> Probability FREE:     {prob_free * 100:.2f}%")
    print(f"  -> Probability OCCUPIED: {prob_occupied * 100:.2f}%")
    
    return prob_occupied

# --- Main execution ---
if __name__ == "__main__":
    
    preprocess_and_train()
    
    # --- ðŸš¨ FIX 2: Sample data now EXACTLY matches simulation data ---
    
    sample_1 = {
        'slot_type': 'car', 'hour': 9, 'weekday': 1, 'weather': 'sunny',
        'event_type': 'none', 'poi_office_count': 30, 
        'poi_restaurant_count': 5, 'poi_store_count': 2
    }
    predict_availability(sample_1)

    sample_2 = {
        'slot_type': 'bike', 'hour': 22, 'weekday': 5, 'weather': 'sunny',
        'event_type': 'none', 'poi_office_count': 1, 
        'poi_restaurant_count': 2, 'poi_store_count': 2
    }
    predict_availability(sample_2)

    sample_3 = {
        'slot_type': 'car', 'hour': 18, 'weekday': 6, 'weather': 'rainy',
        'event_type': 'public_holiday', 'poi_office_count': 5,
        'poi_restaurant_count': 30, 'poi_store_count': 25
    }
    predict_availability(sample_3)

Loading data from pune_parking_hyper_realistic.csv...
Engineering features (cyclical time, NaNs)...
Encoding categorical features...
Model data (encoders, columns) saved to parking_model_data_v2.joblib
Training XGBoost model...
Model training complete.

--- Model Evaluation ---
Accuracy: 0.7730

Classification Report:
              precision    recall  f1-score   support

    Free (0)       0.80      0.80      0.80    101644
Occupied (1)       0.74      0.74      0.74     76916

    accuracy                           0.77    178560
   macro avg       0.77      0.77      0.77    178560
weighted avg       0.77      0.77      0.77    178560


Model saved to parking_model_v2.json

--- New Prediction ---
Predicting for: {'slot_type': 'car', 'hour': 9, 'weekday': 1, 'weather': 'sunny', 'event_type': 'none', 'poi_office_count': 30, 'poi_restaurant_count': 5, 'poi_store_count': 2}
  -> Probability FREE:     0.05%
  -> Probability OCCUPIED: 99.95%

--- New Prediction ---
Predicting for: {'slot_