In [15]:
#Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
)
from catboost import CatBoostRegressor, Pool
from rapidfuzz import process
import shap
import matplotlib.pyplot as plt
from scipy.stats import linregress
import seaborn as sns
import os
import time


class Preprocessing:
    def __init__(self, income_path, zipcode_path, property_path):
        self.income_path = income_path
        self.zipcode_path = zipcode_path
        self.property_path = property_path
        """
        Preprocessing class:
        This is where I match the Bpost location data and municipalities with the Statbel
        Median income information.        

        """
    def read_merge_external(self):
        # Read income and zipcode data
        income_data = pd.read_csv(self.income_path)
        zipcode_data = pd.read_excel(self.zipcode_path)

        # DEBUG: Print the columns of the input dataframes
        print("Income Data Columns:", income_data.columns)
        print("Zipcode Data Columns:", zipcode_data.columns)
        #Clean Income Data
        income_data_new_header = ["Locality", "median_income", "unnamed", "GPS", "locality"]
        income_data.columns = income_data_new_header
        income_data = income_data.drop(columns=["Locality", "unnamed"])

        # Ensure all column names are lowercase for consistent renaming
        zipcode_data.columns = zipcode_data.columns.str.lower()

        # Rename columns in the zipcode_data dataframe
        if "main municipality" in zipcode_data.columns:
            zipcode_data.rename(
                columns={"postcode": "postal_code", "provincie": "province", "name": "locality", "main municipality": "municipality"},
                inplace=True,
            )
        else:
            raise KeyError("The column 'MAIN MUNICIPALITY' is missing from the input file. Please check the file structure.")

        # Normalize text for merging
        zipcode_data.province = zipcode_data["province"].astype(str)
        zipcode_data.locality = zipcode_data["locality"].astype(str)
        zipcode_data.municipality = zipcode_data["municipality"].astype(str)
        zipcode_data.province = zipcode_data.province.apply(lambda x: x.strip().lower())
        zipcode_data.locality = zipcode_data.locality.apply(lambda x: x.strip().lower())
        zipcode_data.municipality = zipcode_data.municipality.apply(lambda x: x.strip().lower())
        income_data.locality = income_data.locality.apply(lambda x: x.strip().lower())
        income_data.locality = income_data["locality"].astype(str)

        # Merge postal code & province data from Bpost to income data
        for index, row in income_data.iterrows():
            id_locality = row["locality"]
            match = zipcode_data[zipcode_data["locality"] == id_locality]
            if not match.empty:
                income_data.at[index, "postal_code"] = match["postal_code"].values[0]
                income_data.at[index, "province"] = match["province"].values[0]

        # RapidFuzz matching: For unmatched rows, use fuzzy matching
        for index, row in income_data.iterrows():
            if pd.isnull(row["postal_code"]) or pd.isnull(row["province"]):
                locality = row["locality"]
                match = process.extractOne(locality, zipcode_data["locality"], score_cutoff=75)
                if match:
                    match_row = zipcode_data[zipcode_data["locality"] == match[0]]
                    income_data.at[index, "postal_code"] = match_row["postal_code"].values[0]
                    income_data.at[index, "province"] = match_row["province"].values[0]

        return income_data, zipcode_data


    def match_income(self, row, income_data, log_file="match_results.csv", unmatched_file="unmatched_results.csv"):
        """
        Match income based on postal code or locality.
        
        Priority:
        1. Exact match on postal code.
        2. Fuzzy match on locality.
        
        If no match is found, return None.
        Log each iteration to a CSV for double-checking.
        """
        # Create a dictionary to log results for this row
        log_data = {
            "postal_code": row['postal_code'],
            "locality": row['locality'],
            "province": row['province'],
            "match_type": "None",
            "matched_value": None,
            "median_income": None,
            "GPS": None
        }

        # Attempt exact match on postal code
        postal_matches = income_data[income_data['postal_code'] == row['postal_code']]
        if not postal_matches.empty:
            matched_income = postal_matches['median_income'].values[0]
            matched_gps = postal_matches['GPS'].values[0]
            log_data.update({
                "match_type": "Postal Code",
                "matched_value": row['postal_code'],
                "median_income": matched_income,
                "GPS": matched_gps
            })
            self._append_to_log(log_data, log_file)
            return matched_income, matched_gps

        
        # Fuzzy match on locality
        best_match = process.extractOne(row['locality'], income_data['locality'])
        if best_match and best_match[1] > 75:  # Ensure the match score is above a threshold
            matched_locality = best_match[0]
            matched_income = income_data[income_data['locality'] == matched_locality]['median_income'].values[0]
            matched_gps = income_data[income_data['locality'] == matched_locality]['GPS'].values[0]
            log_data.update({
                "match_type": "Locality",
                "matched_value": matched_locality,
                "median_income": matched_income,
                "GPS": matched_gps
            })
            self._append_to_log(log_data, log_file)
            return matched_income, matched_gps

        # No match found; log to the unmatched file
        self._append_to_log(log_data, unmatched_file)
        return None

    def _append_to_log(self, log_data, log_file):
        """
        Append log data to the specified CSV file.
        """
        # Convert the log data dictionary to a DataFrame
        log_df = pd.DataFrame([log_data])

        # Append to the CSV file
        try:
            # If the file exists, append without writing the header
            log_df.to_csv(log_file, mode='a', index=False, header=False)
        except FileNotFoundError:
            # If the file does not exist, write with the header
            log_df.to_csv(log_file, mode='w', index=False, header=True)

    def properties_dataset_cleaning(self, income_data):
        # Read and clean property data
        property_data = pd.read_csv(self.property_path)
        #Clear outliers above 5 million
        property_data = property_data[
            (property_data["price"] <= 5000000) & (property_data["price"] >= 40000)
        ]
        property_data = property_data[property_data["bedrooms"] <= 9]
        property_data["buildingState"] = property_data["buildingState"].replace(
            {
                "AS_NEW": 1,
                "JUST_RENOVATED": 2,
                "GOOD": 3,
                "TO_RESTORE": 4,
                "TO_RENOVATE": 4,
                "TO_BE_DONE_UP": 4,
            }
        )
        property_data["province"] = property_data["province"].replace(
            {
                "flemish_brabant_extended": "flemish_brabant",
                "hainaut_extended": "hainaut_province",
                "flemish_brabant": "Flemish Brabant",
                "hainaut_province": "Hainaut",
            }
        )
        property_data.drop(columns=["buildingStateLabel"], inplace=True)

        # Fill and clean missing values
        property_data["terraceSurface"] = property_data["terraceSurface"].fillna(0)
        property_data.dropna(subset=["livingArea", "energy_certificate"], inplace=True)
        # Normalize locality column
        property_data["locality"] = property_data["locality"].apply(lambda x: x.strip().lower())

         # Add median_income and GPS columns
        property_data[['median_income', 'GPS']] = property_data.apply(
        lambda row: pd.Series(self.match_income(row, income_data)), 
        axis=1
        )
    
         # Export updated property data
        self.export_property_data(property_data)

        return property_data

    def export_property_data(self, property_data, base_filename='properties_data_cleaned'):
        """
        Export the processed property data to a CSV file.
        
        """
        # Create exports directory if it doesn't exist
        exports_dir = os.path.join(os.path.dirname(self.property_path), 'exports')
        os.makedirs(exports_dir, exist_ok=True)

        # Generate filename with timestamp
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"{base_filename}_{timestamp}.csv"
        full_path = os.path.join(exports_dir, filename)

        # Export to CSV
        property_data.to_csv(full_path, index=False)
        
        print(f"Property data exported to: {full_path}")
        
        return full_path

class FeatureEngineering:
    @staticmethod
    def add_region_column(df):
        flanders_provinces = [
            "Antwerp",
            "East Flanders",
            "Flemish Brabant",
            "Limburg",
            "West Flanders",
        ]
        wallonia_provinces = [
            "Liège",
            "Luxembourg",
            "Walloon Brabant",
            "Namur",
            "Hainaut",
        ]
        df["region"] = df["province"].apply(
            lambda province: "Flanders"
            if province in flanders_provinces
            else "Wallonia"
            if province in wallonia_provinces
            else "Brussels"
        )
        return df

    @staticmethod
    def split_gps_coordinates(df):
        """
        Splits the GPS coordinates column into latitude and longitude.
        """
        df[['latitude', 'longitude']] = df['GPS'].str.split(' ', expand=True).astype(float)
        df.drop(columns=["GPS"], inplace=True)
        return df
    
    @staticmethod
    def scale_median_income(df):
        df["median_income"]=df["median_income"].apply(lambda x: float(x) * 1000)
        return df

    @staticmethod
    def remove_outliers_iqr(df):
        # Interquartile Range (IQR) Method
        q1 = df["price"].quantile(0.25)
        q3 = df["price"].quantile(0.75)
        iqr = q3 - q1
        df = df[(df["price"] >= q1 - 1.5 * iqr) & (df["price"] <= q3 + 1.5 * iqr)]
        return df


class ModelApply:
    @staticmethod
    def train_model(df):
        # Prepare data for training
        X = df.drop(
            columns=[
                "price",
                "kitchen",
                "postal_code",
                "furnished",
                "fireplace",
                "province",
                "property_type",
                "terraceSurface",
            ]
        )
        y = df["price"]

        # Categorical features
        categorical_features = ["locality", "energy_certificate", "region"]
        
        
        # KFold cross-validation
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        feature_importances = []

        params = {
            "random_strength": 5,
            "learning_rate": 0.07,
            "l2_leaf_reg": 7,
            "iterations": 1500,
            "depth": 6,
            "eval_metric": "RMSE",
            "verbose": 100,
            "random_seed": 42,
            "cat_features": categorical_features,
        }

        # Train with KFold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model = CatBoostRegressor(**params)
            model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50)
            feature_importances.append(model.get_feature_importance())
            model.save_model("catboost_model_2.0.cbm")

        return model, X_train, X_test, y_train, y_test, feature_importances


class ModelEvaluation:
    
    @staticmethod
    def evaluate_model(model, X_train, y_train, X_test, y_test):
        # Predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Functions for MAPE and sMAPE
        def mape(y_true, y_pred):
            """Mean Absolute Percentage Error"""
            return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

        def smape(y_true, y_pred):
            """Symmetric Mean Absolute Percentage Error"""
            return 100 * np.mean(2 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred)))
        
        # Metrics
        metrics = {
            "MAE_train": mean_absolute_error(y_train, y_train_pred),
            "MAE_test": mean_absolute_error(y_test, y_test_pred),
            "RMSE_train": np.sqrt(mean_squared_error(y_train, y_train_pred)),
            "RMSE_test": np.sqrt(mean_squared_error(y_test, y_test_pred)),
            "R2_train": r2_score(y_train, y_train_pred),
            "R2_test": r2_score(y_test, y_test_pred),
            "MAPE_train": mape(y_train, y_train_pred),
            "MAPE_test": mape(y_test, y_test_pred),
            "sMAPE_train": smape(y_train, y_train_pred),
            "sMAPE_test": smape(y_test, y_test_pred),
        }

        return metrics

    @staticmethod
    def shap_analysis(model, X_test, y_test, categorical_features):
        """
        Performs SHAP analysis on the model to understand feature contributions and feature interactions.

        Includes both gain-based and mean split importance analyses.

        Outputs:
            SHAP summary plot, dependence plots, and interaction plots (both gain-based and mean split) are saved as images.
        """
        # Create a test Pool with categorical features
        test_pool = Pool(X_test, y_test, cat_features=categorical_features)

        # Initialize SHAP TreeExplainer
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(test_pool)

        # Create directory to store SHAP outputs
        shap_output_dir = "shap_outputs"
        os.makedirs(shap_output_dir, exist_ok=True)

        # Plotting to prevent cutoff
        plt.rcParams.update({
            'figure.autolayout': True,
            'figure.figsize': (12, 8),  # Larger figure size
            'figure.constrained_layout.use': True,
        })

        # SHAP Summary Plot (global feature importance, mean split)
        plt.figure(figsize=(14, 10))
        shap.summary_plot(shap_values, X_test, show=False, plot_type='bar')
        plt.title("SHAP Summary Plot (Mean Split)")
        plt.tight_layout(pad=3.0)  # Add extra padding
        plt.savefig(f"{shap_output_dir}/shap_summary_plot_mean.png", bbox_inches='tight', dpi=300)
        plt.close()
        print(f"SHAP Summary Plot (mean split) saved to {shap_output_dir}/shap_summary_plot_mean.png")

        # Gain-Based Feature Importances
        gain_importances = model.get_feature_importance(type='PredictionValuesChange')
        gain_importance_indices = np.argsort(gain_importances)[::-1]  # Sort in descending order
        gain_top_features = [X_test.columns[i] for i in gain_importance_indices[:5]]  # Top 5 features

        # SHAP Summary Plot for Gain-Based Importance
        plt.figure(figsize=(14, 10))
        shap.summary_plot(shap_values, X_test, show=False, plot_type="bar")
        plt.title("SHAP Summary Plot (Gain-Based)")
        plt.tight_layout(pad=3.0)
        plt.savefig(f"{shap_output_dir}/shap_summary_plot_gain.png", bbox_inches='tight', dpi=300)
        plt.close()
        print(f"SHAP Summary Plot (gain-based split) saved to {shap_output_dir}/shap_summary_plot_gain.png")

        # Generate SHAP Dependence Plots (Mean Split)
        for feature in X_test.columns:
            plt.figure(figsize=(12, 8))
            shap.dependence_plot(feature, shap_values, X_test, show=False)
            plt.title(f"SHAP Dependence Plot: {feature} (Mean Split)")
            plt.tight_layout(pad=3.0)
            plt.savefig(f"{shap_output_dir}/shap_dependence_mean_{feature}.png", bbox_inches='tight', dpi=300)
            plt.close()
            print(f"SHAP Dependence Plot (mean split) for {feature} saved to {shap_output_dir}/shap_dependence_mean_{feature}.png")

        # Generate SHAP Dependence Plots (Gain-Based)
        for feature in gain_top_features:
            plt.figure(figsize=(12, 8))
            shap.dependence_plot(feature, shap_values, X_test, show=False)
            plt.title(f"SHAP Dependence Plot: {feature} (Gain-Based)")
            plt.tight_layout(pad=3.0)
            plt.savefig(f"{shap_output_dir}/shap_dependence_gain_{feature}.png", bbox_inches='tight', dpi=300)
            plt.close()
            print(f"SHAP Dependence Plot (gain-based split) for {feature} saved to {shap_output_dir}/shap_dependence_gain_{feature}.png")

        # Generate SHAP Interaction Plots (Mean Split)
        mean_split_top_features_indices = np.argsort(np.abs(shap_values).mean(0))[-5:]  # Top 5 features by mean split
        mean_split_top_features = [X_test.columns[i] for i in mean_split_top_features_indices]

        for i, feature_x in enumerate(mean_split_top_features):
            for j, feature_y in enumerate(mean_split_top_features):
                if i != j:  # Avoid self-interactions
                    plt.figure(figsize=(12, 8))
                    shap.dependence_plot(feature_x, shap_values, X_test, show=False, interaction_index=feature_y)
                    plt.title(f"SHAP Interaction Plot: {feature_x} vs {feature_y} (Mean Split)")
                    plt.tight_layout(pad=3.0)
                    plt.savefig(f"{shap_output_dir}/shap_interaction_mean_{feature_x}_vs_{feature_y}.png", bbox_inches='tight', dpi=300)
                    plt.close()
                    print(f"SHAP Interaction Plot (mean split) for {feature_x} vs {feature_y} saved to {shap_output_dir}/shap_interaction_mean_{feature_x}_vs_{feature_y}.png")

        # Generate SHAP Interaction Plots (Gain-Based)
        for i, feature_x in enumerate(gain_top_features):
            for j, feature_y in enumerate(gain_top_features):
                if i != j:  # Avoid self-interactions
                    plt.figure(figsize=(12, 8))
                    shap.dependence_plot(feature_x, shap_values, X_test, show=False, interaction_index=feature_y)
                    plt.title(f"SHAP Interaction Plot: {feature_x} vs {feature_y} (Gain-Based)")
                    plt.tight_layout(pad=3.0)
                    plt.savefig(f"{shap_output_dir}/shap_interaction_gain_{feature_x}_vs_{feature_y}.png", bbox_inches='tight', dpi=300)
                    plt.close()
                    print(f"SHAP Interaction Plot (gain-based split) for {feature_x} vs {feature_y} saved to {shap_output_dir}/shap_interaction_gain_{feature_x}_vs_{feature_y}.png")




    
    
    @staticmethod
    def plot_training_validation_loss(model, output_dir):
        """
        Plot training and validation loss across iterations
        
        Args:
            model (CatBoostRegressor): Trained CatBoost model
            output_dir (str): Directory to save the plot
        """
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        
        # Extract learning curves
        train_loss = model.get_evals_result()['learn']['RMSE']
        validation_loss = model.get_evals_result()['validation']['RMSE']
        
        # Create plot
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, len(train_loss) + 1), train_loss, label='Training Loss (RMSE)', color='blue')
        plt.plot(range(1, len(validation_loss) + 1), validation_loss, label='Validation Loss (RMSE)', color='red')
        
        plt.title('Training vs Validation Loss')
        plt.xlabel('Iterations')
        plt.ylabel('RMSE')
        plt.legend()
        plt.tight_layout()
        
        # Save the plot
        plt.savefig(os.path.join(output_dir, 'model2.0_training_validation_loss.png'))
        plt.close()

    @staticmethod
    def plot_prediction_scatter(y_test, y_pred, output_dir):
        """
        Create a scatter plot of actual vs predicted values with linear regression line
        
        """
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        
        # Create scatter plot
        plt.figure(figsize=(10, 6))
        plt.scatter(y_test, y_pred, alpha=0.5, color='blue', label='Predictions')
        
        # Compute linear regression
        slope, intercept, r_value, p_value, std_err = linregress(y_test, y_pred)
        line = slope * y_test + intercept
        
        # Plot regression line
        plt.plot(y_test, line, color='red', label=f'Regression Line (R²: {r_value**2:.4f})')
        
        plt.title('Actual vs Predicted Values')
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.legend()
        plt.tight_layout()
        
        # Save the plot
        plt.savefig(os.path.join(output_dir, 'model2.0_actual_vs_predicted_scatter.png'))
        plt.close()

    @staticmethod
    def export_evaluation_results(metrics, feature_importances, X_train, file_path, model=None, y_test=None, y_pred=None, export_type="csv"):
        """
        Extended export method to include additional visualizations
        
    
        """
        # Create evaluation metrics directory
        output_dir = os.path.dirname(file_path)
        evaluation_metrics_dir = os.path.join(output_dir, 'model2.0__evaluation_metrics')
        os.makedirs(evaluation_metrics_dir, exist_ok=True)

        # Create SHAP outputs directory
        shap_output_dir = os.path.join(evaluation_metrics_dir, 'model2.0__shap_outputs')
        os.makedirs(shap_output_dir, exist_ok=True)

        # Save Metrics
        metrics_file_path = os.path.join(evaluation_metrics_dir, 'model2.0__evaluation_results.csv')
        if export_type == "csv":
            pd.DataFrame([metrics]).to_csv(metrics_file_path, index=False)
        elif export_type in ("txt", "md"):
            with open(metrics_file_path, "w") as f:
                f.write("# Model Evaluation Results\n\n" if export_type == "md" else "")
                for key, value in metrics.items():
                    f.write(f"- **{key}**: {value}\n" if export_type == "md" else f"{key}: {value}\n")

        # Dynamically retrieve feature names
        feature_names = X_train.columns.tolist()

        # Aggregate feature importances by taking the mean across folds
        mean_feature_importances = (
            np.mean(feature_importances, axis=0) if isinstance(feature_importances, list) else feature_importances
        )

        # Create a DataFrame for feature importances
        feature_importance_df = pd.DataFrame(
            {"Feature": feature_names, "Importance": mean_feature_importances}
        ).sort_values(by="Importance", ascending=False)

        # Save Feature Importances Plot
        plt.figure(figsize=(10, 6))
        sns.barplot(x="Importance", y="Feature", data=feature_importance_df, palette="viridis")
        plt.title("Feature Importances")
        plt.xlabel("Importance")
        plt.ylabel("Features")
        plt.tight_layout()
        plt.savefig(os.path.join(evaluation_metrics_dir, 'model2.0__feature_importances.png'))
        plt.close()

        # Additional Visualizations (if model and predictions are provided)
        if model is not None:
            # Plot Training vs Validation Loss
            ModelEvaluation.plot_training_validation_loss(model, evaluation_metrics_dir)
        
        if y_test is not None and y_pred is not None:
            # Plot Actual vs Predicted Scatter
            ModelEvaluation.plot_prediction_scatter(y_test, y_pred, evaluation_metrics_dir)

        print(f"Evaluation metrics and visualizations saved to {evaluation_metrics_dir}")


# Main Function
def main():
    # Paths
    script_start_time = time.time()
    print("Script started.")
    income_path = "/Users/irisvirus/Desktop/Becode/Python/Projects/Deployment/Immo_deployment/utils/INCOME DATA 2022.csv"
    zipcode_path = "/Users/irisvirus/Desktop/Becode/Python/Projects/Deployment/Immo_deployment/utils/zipcodes_num_nl_new_Tumi.xls"
    property_path = "/Users/irisvirus/Desktop/Becode/Python/Projects/Deployment/Immo_deployment/utils/properties_data_cleaned_05_12_14H30.csv"

    # Preprocessing
    preprocessing = Preprocessing(income_path, zipcode_path, property_path)
    income_data, zipcode_data = preprocessing.read_merge_external()
    property_data = preprocessing.properties_dataset_cleaning(income_data)

    # Feature Engineering
    feature_engineering = FeatureEngineering()
    property_data = feature_engineering.split_gps_coordinates(property_data)
    property_data = feature_engineering.scale_median_income(property_data)
    property_data = feature_engineering.add_region_column(property_data)
    property_data = feature_engineering.remove_outliers_iqr(property_data)
    preprocessing.export_property_data(property_data)


    # Model Training
    training_start_time = time.time()
    print("Model training started...")
    model_apply = ModelApply()
    model, X_train, X_test, y_train, y_test, feature_importances = model_apply.train_model(property_data)
    
    training_end_time = time.time()
    training_duration = training_end_time - training_start_time
    print(f"Model training completed in {training_duration:.2f} seconds.")
    # Model Evaluation
    evaluation_start_time = time.time()
    print("Evaluation started...")
    evaluation = ModelEvaluation()
    metrics = ModelEvaluation.evaluate_model(model, X_train, y_train, X_test, y_test)
    # SHAP Analysis
    ModelEvaluation.shap_analysis(model, X_test, y_test, ["locality", "energy_certificate", "region"])
    
    #Gain based importance
    #gain_importances = evaluation.calculate_gain_importance(model)
    # y_test_pred for scatter plot and loss curves 
    y_test_pred = model.predict(X_test)

    # Export Evaluation Results
    print("Exporting evaluation results...")
    file_path = "evaluation_results"  # Replace with your desired output directory
    evaluation.export_evaluation_results(
        metrics=metrics,
        feature_importances=feature_importances,
        X_train=X_train,
        file_path=file_path,
        model=model,
        y_test=y_test,
        y_pred=y_test_pred,
        export_type="csv",
    )
    
    time.sleep(2)  
    evaluation_end_time = time.time()
    # Evaluation duration
    evaluation_duration = evaluation_end_time - evaluation_start_time
    print(f"Evaluation completed in {evaluation_duration:.2f} seconds.")

    # Capture the script end time
    script_end_time = time.time()

    # Total script execution time
    script_execution_time = script_end_time - script_start_time
    print(f"Total script execution time: {script_execution_time:.2f} seconds.")


if __name__ == "__main__":
    main()

Script started.
Income Data Columns: Index(['AARTSELAAR', '30.562', 'Unnamed: 2', '51.1319 4.3827', 'Aartselaar'], dtype='object')
Zipcode Data Columns: Index(['Postcode', 'NAME', 'SUBMUNICIPALITY', 'MAIN MUNICIPALITY',
       'Provincie'],
      dtype='object')


  property_data["buildingState"] = property_data["buildingState"].replace(


Property data exported to: /Users/irisvirus/Desktop/Becode/Python/Projects/Deployment/Immo_deployment/utils/exports/properties_data_cleaned_20241213_141135.csv
Property data exported to: /Users/irisvirus/Desktop/Becode/Python/Projects/Deployment/Immo_deployment/utils/exports/properties_data_cleaned_20241213_141135.csv
Model training started...
0:	learn: 161013.4038385	test: 158585.4976450	best: 158585.4976450 (0)	total: 8.15ms	remaining: 12.2s
100:	learn: 94750.8279059	test: 91115.8783529	best: 91115.8783529 (100)	total: 535ms	remaining: 7.42s
200:	learn: 84719.5118196	test: 83582.2299521	best: 83582.2299521 (200)	total: 1.07s	remaining: 6.91s
300:	learn: 80269.3291566	test: 81224.3839525	best: 81222.0678531 (299)	total: 1.78s	remaining: 7.1s
400:	learn: 77710.9304618	test: 80318.0250214	best: 80309.3983294 (391)	total: 2.34s	remaining: 6.42s
500:	learn: 75753.8137354	test: 79907.9051594	best: 79907.9051594 (500)	total: 2.88s	remaining: 5.75s
600:	learn: 74045.6397364	test: 79608.58338

  fig = pl.figure(figsize=figsize)


SHAP Interaction Plot (mean split) for region vs surfaceOfThePlot saved to shap_outputs/shap_interaction_mean_region_vs_surfaceOfThePlot.png


  plt.figure(figsize=(12, 8))


SHAP Interaction Plot (mean split) for region vs energy_certificate saved to shap_outputs/shap_interaction_mean_region_vs_energy_certificate.png
SHAP Interaction Plot (mean split) for region vs livingArea saved to shap_outputs/shap_interaction_mean_region_vs_livingArea.png
SHAP Interaction Plot (mean split) for longitude vs region saved to shap_outputs/shap_interaction_mean_longitude_vs_region.png
SHAP Interaction Plot (mean split) for longitude vs surfaceOfThePlot saved to shap_outputs/shap_interaction_mean_longitude_vs_surfaceOfThePlot.png
SHAP Interaction Plot (mean split) for longitude vs energy_certificate saved to shap_outputs/shap_interaction_mean_longitude_vs_energy_certificate.png
SHAP Interaction Plot (mean split) for longitude vs livingArea saved to shap_outputs/shap_interaction_mean_longitude_vs_livingArea.png
SHAP Interaction Plot (mean split) for surfaceOfThePlot vs region saved to shap_outputs/shap_interaction_mean_surfaceOfThePlot_vs_region.png
SHAP Interaction Plot (me


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Importance", y="Feature", data=feature_importance_df, palette="viridis")


Evaluation metrics and visualizations saved to model2.0__evaluation_metrics
Evaluation completed in 129.54 seconds.
Total script execution time: 203.62 seconds.


<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>