In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import os

# --- NEW: Imports for plotting ---
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# --- Configuration ---
# CSV read by preprocess_and_train(); if the file is absent that function
# writes random dummy rows to this same path and trains on them.
DATA_FILE = 'pune_parking_hyper_realistic.csv'
# Path from which predict_availability() loads the XGBoost model.
MODEL_FILE = 'parking_model_v2.json'
# joblib file holding the fitted label encoders and the feature column order.
MODEL_DATA_FILE = 'parking_model_data_v2.joblib'

def preprocess_and_train():
    """Train the parking-occupancy classifier and persist everything needed to predict.

    Steps, in order:
      1. Load ``DATA_FILE``. If it does not exist, generate 1000 rows of random
         dummy data, write them to ``DATA_FILE`` and continue with those rows.
      2. Add sine/cosine encodings of ``hour`` (period 24) and ``weekday``
         (period 7), label-encode the categorical columns, fill NaNs.
      3. Save the fitted encoders and the column order to ``MODEL_DATA_FILE``.
      4. Make a stratified 80/20 train/test split (``random_state=42``).
      5. Fit an ``XGBClassifier`` with early stopping.
      6. Print accuracy and a classification report for the test split.
      7. Write ``confusion_matrix.png`` and ``feature_importance.png``
         (plotting errors are printed, not raised).
      8. Save the trained model to ``MODEL_FILE`` so that
         ``predict_availability`` can load it.

    Returns:
        None. Results are reported via ``print`` and written to disk.

    Raises:
        Errors from loading, preprocessing, training or saving propagate to the
        caller; only a missing data file and plotting failures are handled here.
    """
    # --- Step 1: Load Data ---
    print(f"Loading data from {DATA_FILE}...")
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: Data file '{DATA_FILE}' not found.")
        print("Please make sure the CSV file is in the same directory as the script.")
        # Create a dummy CSV file to allow the script to run for demonstration
        print("Creating a dummy 'pune_parking_hyper_realistic.csv' to proceed...")
        dummy_data = {
            'slot_type': np.random.choice(['car', 'bike', 'ev'], 1000),
            'hour': np.random.randint(0, 24, 1000),
            'weekday': np.random.randint(0, 7, 1000),
            'weather': np.random.choice(['sunny', 'rainy', 'cloudy'], 1000),
            'event_type': np.random.choice(['none', 'festival', 'public_holiday'], 1000),
            'poi_office_count': np.random.randint(0, 30, 1000),
            'poi_restaurant_count': np.random.randint(0, 30, 1000),
            'poi_store_count': np.random.randint(0, 30, 1000),
            'is_occupied': np.random.randint(0, 2, 1000)
        }
        df = pd.DataFrame(dummy_data)
        df.to_csv(DATA_FILE, index=False)

    # --- Step 2: Feature Engineering & Preprocessing ---
    print("Engineering features (cyclical time, NaNs)...")

    # Map hour and weekday onto a circle so that e.g. hour 23 and hour 0 end up
    # close together instead of at opposite ends of a numeric range.
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24.0)
    df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7.0)
    df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7.0)

    feature_cols = [
        'slot_type', 'weather', 'event_type',
        'poi_office_count', 'poi_restaurant_count', 'poi_store_count',
        'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos'
    ]

    target_col = 'is_occupied'

    # Ensure all feature_cols exist; add a default column for any that are missing.
    for col in feature_cols:
        if col not in df.columns:
            if 'poi' in col:
                df[col] = np.random.randint(0, 30, len(df))
            else:
                df[col] = 'missing'

    X = df[feature_cols].copy()
    y = df[target_col]

    print("Encoding categorical features...")
    categorical_cols = ['slot_type', 'weather', 'event_type']
    encoders = {}

    for col in categorical_cols:
        X[col] = X[col].fillna('missing')
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        # Keep the fitted encoder so prediction can apply the same mapping.
        encoders[col] = le

    numerical_cols = ['poi_office_count', 'poi_restaurant_count', 'poi_store_count']
    for col in numerical_cols:
        X[col] = X[col].fillna(0)

    # --- Step 3: Save Encoders and Column List ---
    model_data = {
        'encoders': encoders,
        'model_columns': X.columns.tolist()
    }
    joblib.dump(model_data, MODEL_DATA_FILE)
    print(f"Model data (encoders, columns) saved to {MODEL_DATA_FILE}")

    # --- Step 4: Split Data ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # --- Step 5: Train the XGBoost Model (Tuned) ---
    print("Training XGBoost model...")

    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=8,
        early_stopping_rounds=20,
        enable_categorical=True,
        scale_pos_weight=2.0,
        importance_type='gain'
    )

    # NOTE(review): the test split doubles as the early-stopping validation set,
    # so the metrics printed in Step 6 are measured on data that influenced the
    # stopping point. Consider a separate validation split.
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )

    print("Model training complete.")

    # --- Step 6: Evaluate the Model ---
    print("\n--- Model Evaluation ---")
    y_pred = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Free (0)', 'Occupied (1)']))

    # --- Step 7: Generate Minimalistic Graphs ---
    print("Generating evaluation plots...")

    # 1. Confusion Matrix
    try:
        fig, ax = plt.subplots(figsize=(6, 5))
        ConfusionMatrixDisplay.from_predictions(
            y_test,
            y_pred,
            display_labels=['Free (0)', 'Occupied (1)'],
            cmap='Blues',
            values_format='d',  # 'd' for integer format
            ax=ax
        )
        ax.set_title('Model Performance (Confusion Matrix)')
        ax.grid(False)  # Remove gridlines for a cleaner look
        plt.tight_layout()
        plt.savefig('confusion_matrix.png')
        print("Saved confusion_matrix.png")
        plt.close(fig)
    except Exception as e:
        # Plots are a best-effort extra; a failure must not abort training.
        print(f"Error generating confusion matrix: {e}")

    # 2. Feature Importance
    try:
        # importance_type='gain' was set on the classifier above.
        importances = model.feature_importances_
        feature_names = model.feature_names_in_

        f_imp = pd.Series(data=importances, index=feature_names)

        # Take the 10 largest, then sort ascending so that barh() draws the
        # most important feature at the top of the chart.
        f_imp_top10 = f_imp.sort_values(ascending=False).head(10).sort_values(ascending=True)

        fig, ax = plt.subplots(figsize=(10, 6))

        bars = ax.barh(
            f_imp_top10.index,
            f_imp_top10.values,
            color='steelblue',
            label='Feature Contribution (Gain)'  # Label for the legend
        )

        ax.set_title('Key Factors in Parking Prediction')
        ax.set_xlabel('Importance Score (Contribution)')

        # Remove chart junk (spines, grid, ticks)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.grid(False)
        ax.tick_params(axis='y', length=0)

        ax.legend(loc='lower right', frameon=False)

        # Add data labels (values) to the end of each bar
        for bar in bars:
            width = bar.get_width()
            ax.text(
                width * 1.01,  # Position slightly after the bar
                bar.get_y() + bar.get_height() / 2,
                f'{width:.3f}',
                va='center',
                ha='left',
                fontsize=9
            )

        plt.tight_layout()
        plt.savefig('feature_importance.png')
        print("Saved simplified feature_importance.png")
        plt.close(fig)

    except Exception as e:
        print(f"Error generating simplified feature importance: {e}")

    # --- Step 8: Save the Trained Model ---
    # predict_availability() loads the model from MODEL_FILE; without this step
    # it would either find no model or pick up a stale one from an earlier run.
    model.save_model(MODEL_FILE)
    print(f"\nModel saved to {MODEL_FILE}")


def predict_availability(sample_data):
    """Predict the occupancy probability for one sample using the saved artifacts.

    Args:
        sample_data: mapping of raw feature name to value for a single sample
            (it is wrapped into a one-row DataFrame).

    Returns:
        The probability of the "occupied" class (column 1 of ``predict_proba``),
        or ``None`` when the model files are missing or categorical encoding fails.
    """
    print("\n--- New Prediction ---")

    artifacts_present = os.path.exists(MODEL_FILE) and os.path.exists(MODEL_DATA_FILE)
    if not artifacts_present:
        print("Error: Model or model data not found. Please run the training first.")
        return

    model = xgb.XGBClassifier()
    model.load_model(MODEL_FILE)
    saved = joblib.load(MODEL_DATA_FILE)
    encoders = saved['encoders']
    model_columns = saved['model_columns']

    print(f"Predicting for: {sample_data}")

    frame = pd.DataFrame([sample_data])

    # Cyclical sine/cosine encoding, added only for the time fields present.
    for field, period in (('hour', 24.0), ('weekday', 7.0)):
        if field in frame.columns:
            angle = 2 * np.pi * frame[field] / period
            frame[f'{field}_sin'] = np.sin(angle)
            frame[f'{field}_cos'] = np.cos(angle)

    try:
        for col, le in encoders.items():
            if col not in frame.columns:
                continue

            frame[col] = frame[col].fillna('missing')

            # A label the encoder was not fitted on is mapped to 'missing'.
            label = frame[col].iloc[0]
            if label not in le.classes_:
                print(f"Warning: Unknown label '{label}' for '{col}'. Treating as 'missing'.")
                # Extend the in-memory encoder when 'missing' is not a known class.
                if 'missing' not in le.classes_:
                    le.classes_ = np.append(le.classes_, 'missing')
                frame[col] = 'missing'

            frame[col] = le.transform(frame[col])

    except Exception as e:
        print(f"Error transforming categorical data: {e} (Maybe an unknown label?)")
        return

    for col in ('poi_office_count', 'poi_restaurant_count', 'poi_store_count'):
        if col in frame.columns:
            frame[col] = frame[col].fillna(0)

    # Align to the saved column order; columns absent from the sample become 0.
    model_input = frame.reindex(columns=model_columns, fill_value=0)

    first_row = model.predict_proba(model_input)[0]
    prob_free = first_row[0]
    prob_occupied = first_row[1]

    print(f"  -> Probability FREE:      {prob_free * 100:.2f}%")
    print(f"  -> Probability OCCUPIED: {prob_occupied * 100:.2f}%")

    return prob_occupied

# --- Main execution ---
if __name__ == "__main__":
    # Train, then run three sample predictions.
    preprocess_and_train()

    sample_1 = dict(
        slot_type='car', hour=9, weekday=1, weather='sunny',
        event_type='none', poi_office_count=30,
        poi_restaurant_count=5, poi_store_count=2,
    )
    sample_2 = dict(
        slot_type='bike', hour=22, weekday=5, weather='sunny',
        event_type='none', poi_office_count=1,
        poi_restaurant_count=2, poi_store_count=2,
    )
    sample_3 = dict(
        slot_type='car', hour=18, weekday=6, weather='rainy',
        event_type='public_holiday', poi_office_count=5,
        poi_restaurant_count=30, poi_store_count=25,
    )

    for sample in (sample_1, sample_2, sample_3):
        predict_availability(sample)

Loading data from pune_parking_hyper_realistic.csv...
Engineering features (cyclical time, NaNs)...
Encoding categorical features...
Model data (encoders, columns) saved to parking_model_data_v2.joblib
Training XGBoost model...
Model training complete.

--- Model Evaluation ---
Accuracy: 0.7730

Classification Report:
              precision    recall  f1-score   support

    Free (0)       0.80      0.80      0.80    101644
Occupied (1)       0.74      0.74      0.74     76916

    accuracy                           0.77    178560
   macro avg       0.77      0.77      0.77    178560
weighted avg       0.77      0.77      0.77    178560

Generating evaluation plots...
Saved confusion_matrix.png
Saved feature_importance.png

Model saved to parking_model_v2.json

--- New Prediction ---
Predicting for: {'slot_type': 'car', 'hour': 9, 'weekday': 1, 'weather': 'sunny', 'event_type': 'none', 'poi_office_count': 30, 'poi_restaurant_count': 5, 'poi_store_count': 2}
  -> Probability FREE:     