### Spark Session & Data Loading

In [0]:
# Initialize pyspark session
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create (or reuse) a local Spark session named "Partitioning" that runs on
# all available cores and gives the driver 12 GB of memory.
spark = (
    SparkSession.builder.appName("Partitioning")
    .master("local[*]")
    .config("spark.driver.memory", "12g")
    .getOrCreate()
)

# Keep a handle to the underlying SparkContext.
sc = spark.sparkContext

In [0]:
# Read "features.csv" (first line is the header, column types are inferred)
# and mark the resulting DataFrame for caching.
df = spark.read.csv("features.csv", header=True, inferSchema=True)
df.cache()

# Move the "outcome" column to the end while keeping the order of all other columns
columns = df.columns
columns_reordered = [*filter(lambda name: name != "outcome", columns), "outcome"]
df = df.select(columns_reordered)

In [0]:
# Print the column names and inferred types of the loaded DataFrame
df.printSchema()

In [0]:
# Preview the first 5 rows
df.show(5)

### Time analysis functions


##### Time analysis

In [0]:
from time import time
from pyspark.sql import DataFrame


def time_analysis(df: DataFrame, func) -> dict:
    """
    Apply the provided function to growing fractions (10 %, 20 %, ..., 100 %) of the DataFrame and measure
    the wall-clock time of each run.

    Spark transformations are lazy: a function that only returns a DataFrame has not done any work yet.
    To time the real computation, a DataFrame returned by func is executed inside the timed region by
    writing it to Spark's "noop" sink (runs the whole plan, stores nothing; requires Spark 3.0 or newer).
    Results of any other type are assumed to have been computed eagerly by func.

    Parameters:
    df : DataFrame
        The DataFrame to apply the function to.
    func : function
        The function to apply to the DataFrame. The function should take a DataFrame as its only argument.

    Returns:
    dict
        A dictionary mapping the fraction of the DataFrame (0.1 ... 1.0) to the time in seconds taken by
        the function for that fraction.
    """
    results = {}
    df_length = df.count()

    for percentage in range(10, 110, 10):
        fraction = percentage / 100

        # Take the number of rows corresponding to the current fraction
        partial_df = df.limit(int(df_length * fraction))

        # Measure the time taken to apply the function
        start_time = time()
        outcome = func(partial_df)
        if isinstance(outcome, DataFrame):
            # Force the lazy plan to run; otherwise only plan construction would be timed
            outcome.write.format("noop").mode("overwrite").save()
        end_time = time()

        # Store the results
        results[fraction] = end_time - start_time

    return results

##### Plot time analysis

In [0]:
import matplotlib.pyplot as plt

# Use a larger default font for all matplotlib figures
plt.rcParams["font.size"] = 16


def plot_time_analysis(times_partitioning: dict, plot_title: str):
    """
    Draw a line plot of run time against the fraction of the DataFrame, save it as a PDF and show it.

    The title itself is not drawn on the figure; it is only used to derive the output file name
    (the title with every character that is not alphanumeric or "_" removed, plus ".pdf").

    Parameters:
    times_partitioning : dict
        A dictionary containing the fraction of DataFrame and the corresponding time taken for the function to run.
    plot_title : str
        The title of the plot, used to build the file name.
    """
    fractions = list(times_partitioning.keys())
    durations = list(times_partitioning.values())
    plt.plot(fractions, durations)

    plt.xlabel("Fraction of DataFrame")
    plt.ylabel("Time taken (s)")
    # plt.title(plot_title)

    # Keep only file-name friendly characters of the title
    kept_chars = [char for char in plot_title if char == "_" or char.isalnum()]
    plot_filename = "".join(kept_chars)

    plt.tight_layout()
    plt.savefig(f"{plot_filename}.pdf")
    plt.show()

### Preprocessing

In [0]:
# Remove the original identifier columns ...
df = df.drop("bidder_id", "payment_account", "address")
# ... and use the "_c0" column (the line number) as the bidder id instead
df = df.withColumnRenamed("_c0", "bidder_id")

In [0]:
# Drop rows with nulls
df = df.na.drop()

In [0]:
# Preview the first 5 rows after preprocessing
df.show(5)

In [0]:
def print_sample_counts(df: DataFrame):
    """
    Print sample statistics of the DataFrame: total number of samples, number of samples with
    outcome = 1, number of samples with outcome = 0, the percentage of samples with outcome = 1
    and the number of Spark partitions.

    An empty DataFrame is reported with a percentage of 0.00% instead of failing on a division by zero.

    Parameters:
    df : DataFrame - The DataFrame to count samples from. It must have an "outcome" column.
    """
    # Number of samples in df
    total_samples = df.count()
    print("Total samples:", total_samples)

    # Number of samples where outcome = 1
    outcome_1_samples = df.filter(df.outcome == 1).count()
    print("Samples where outcome = 1:", outcome_1_samples)

    # Number of samples where outcome = 0
    outcome_0_samples = df.filter(df.outcome == 0).count()
    print("Samples where outcome = 0:", outcome_0_samples)

    # Percentage of samples where outcome = 1 (guarded against an empty DataFrame)
    if total_samples:
        outcome_1_percentage = (outcome_1_samples / total_samples) * 100
    else:
        outcome_1_percentage = 0.0
    print(f"Percentage of samples where outcome = 1: {outcome_1_percentage:.2f}%")

    # Print the number of partitions of df
    num_partitions = df.rdd.getNumPartitions()
    print("Number of partitions:", num_partitions)

In [0]:
# Class balance and partition count of the preprocessed data
print_sample_counts(df)

### Oversampling & Test/Train split

In [0]:
# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=21)

"""
Over-sampling the minority class (outcome = 1) in the training set by a factor of 4.
This will help to balance the classes and improve the model's performance.
"""
# The minority rows are sampled with replacement (fraction 4.0) and appended to the
# original training rows; the test set is left untouched.
outcome_1_samples = train_df.where(train_df.outcome == 1)
train_df = train_df.union(
    outcome_1_samples.sample(withReplacement=True, fraction=4.0, seed=42)
)

# Report the class balance of both splits
print_sample_counts(train_df)
print_sample_counts(test_df)

### Partitioning

##### Partitioning function

In [0]:
import random


def rf_partition_dataframe(
    df: DataFrame,
    num_partitions: int = 10,
    partition_size: float = 0.8,
    seed: int = 2137,
) -> DataFrame:
    """
    Draw num_partitions independent random samples (without replacement) from the original dataframe, each
    containing roughly a partition_size fraction of its rows, tag every sample with its index in a
    "partition" column and range-partition the union of all samples by that column.
    The resulting dataframe is approximately (num_partitions * partition_size) times as large as the
    original dataframe. This is intended for use with Random Forests utilizing local approach.

    Parameters:
        df : DataFrame
            The original dataframe.
        num_partitions : int, optional
            The number of samples ("virtual" partitions) to create. Default is 10.
        partition_size : float, optional
            The size of each sample as a fraction of the original dataframe. Default is 0.8.
        seed : int, optional
            The seed from which the per-sample seeds are derived. Default is 2137. A private random
            number generator is used, so the state of the global `random` module is never modified,
            even if sampling raises an exception.

    Returns:
        DataFrame
            The union of all samples with an additional "partition" column, range-partitioned by that
            column into num_partitions + 1 Spark partitions (see the note in the code below).

    Raises:
        ValueError
            If num_partitions is less than 1 or partition_size is not in the range (0, 1].
    """

    # Check if "num_partitions" is valid
    if num_partitions < 1:
        raise ValueError(
            "rf_partition_dataframe: num_partitions must be greater than or equal to 1."
        )

    # Check if "partition_size" is valid
    if partition_size <= 0 or partition_size > 1:
        raise ValueError(
            "rf_partition_dataframe: partition_size must be in the range (0, 1]."
        )

    # Private generator seeded like the global one used to be: it yields the same
    # sequence of per-sample seeds without touching (or having to restore) global state.
    rng = random.Random(seed)

    partitioned_df = None

    for partition_num in range(num_partitions):
        # Randomly sample a "partition_size" fraction of the rows and tag them with the sample index
        partition_df = df.sample(
            withReplacement=False,
            fraction=partition_size,
            seed=rng.randint(100, 1000),
        ).withColumn("partition", F.lit(partition_num))

        # Accumulate all samples into a single dataframe
        partitioned_df = (
            partition_df
            if partitioned_df is None
            else partitioned_df.union(partition_df)
        )

    # NOTE: repartitionByRange estimates its range boundaries from a sample of the data. With exactly
    # num_partitions ranges, two sample indices were observed to end up in the same Spark partition,
    # so one extra range is requested. Presumably this can leave one Spark partition empty, so
    # consumers should tolerate empty partitions.
    return partitioned_df.repartitionByRange(num_partitions + 1, "partition")

##### Check correctness of partitioning function

In [0]:
# Sanity check of rf_partition_dataframe on 6 samples of 80 % each

check_df = rf_partition_dataframe(train_df, num_partitions=6, partition_size=0.8)

# Print the number of Spark partitions of the result
print("Number of partitions for check_df:", check_df.rdd.getNumPartitions())

# Compare the Spark partition each row actually lives in ("actual_partition") with the
# sample index assigned by rf_partition_dataframe ("partition"): row counts per pair
check_df = check_df.withColumn("actual_partition", F.spark_partition_id())
check_df.groupBy("actual_partition", "partition").agg(
    F.count("*").alias("total_per_virtual_partition")
).show()

# Number of rows in each actual Spark partition
check_df.groupBy("actual_partition").agg(
    F.count("*").alias("total_per_partition")
).show()

##### Time analysis - Partitioning

In [0]:
# Run rf_partition_dataframe (10 samples of 80 % each) on growing fractions of the training set
times_partitioning = time_analysis(
    train_df,
    lambda partial_df: rf_partition_dataframe(
        df=partial_df, num_partitions=10, partition_size=0.8, seed=2137
    ),
)


print(times_partitioning)

In [0]:
# Plot the measured partitioning times; the title determines the PDF file name
plot_time_analysis(times_partitioning, "Time Analysis of Partitioning")

### Prepare RDDs for Train and Test

In [0]:
# Train: build 10 random samples for the local approach RF, drop the helper "partition"
# column and keep the rows as a cached RDD
print("Train:")
df_partitioned = rf_partition_dataframe(train_df, num_partitions=10).drop("partition")
rdd_train = df_partitioned.rdd.cache()
# Number of Spark partitions and the number of rows held by each of them
print(rdd_train.getNumPartitions())
print(rdd_train.glom().map(len).collect())

# Test: no sampling, the rows are only redistributed over 3 partitions and cached
print("Test:")
rdd_test = test_df.rdd.repartition(3).cache()
# Number of Spark partitions and the number of rows held by each of them
print(rdd_test.getNumPartitions())
print(rdd_test.glom().map(len).collect())

### Local approach Random Forest

##### Local model - build function

In [0]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# Column names of the partitioned training data, including the "outcome" target; they label the
# columns of the pandas DataFrame built for each partition. Every column except "outcome" is used
# as a feature, as feature selection revealed that all columns are important.
selected_features = list(df_partitioned.columns)


def build_local_tree(partition_iter, parameters: dict):
    """
    Train one Decision Tree Classifier on the rows of a single partition.

    Parameters:
        partition_iter : iterator
            An iterator over the rows of the partition; each row must provide the columns listed in
            selected_features, including "outcome".
        parameters : dict
            The hyperparameters passed to the Decision Tree Classifier constructor.

    Returns:
        list
            A one-element list holding the fitted classifier, or an empty list if the partition
            contains no rows.
    """
    # Materialize the partition as a pandas DataFrame
    pd_partition = pd.DataFrame(partition_iter, columns=selected_features)

    # Nothing to train on: contribute no model for this partition
    if pd_partition.empty:
        return []

    # Separate the target column from the feature columns
    feature_matrix = pd_partition.drop(columns="outcome").values
    labels = pd_partition["outcome"]

    # Fit a fresh tree with the requested hyperparameters
    return [DecisionTreeClassifier(**parameters).fit(feature_matrix, labels)]

##### Training

In [0]:
# Example hyperparameters for the Decision Tree Classifier
example_parameters = dict(
    criterion="gini",
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    class_weight="balanced",
)


def build_wrapper(partition_iter):
    """Train one tree on a partition using the example hyperparameters."""
    return build_local_tree(partition_iter, example_parameters)


# Train one tree per non-empty partition and gather the fitted trees on the driver
models = rdd_train.mapPartitions(build_wrapper).collect()
models

##### Predictions

In [0]:
def predict(instance, models):
    """
    Collect the prediction of every trained model for a single instance.
    Requires a wrapper to be used with the rdd_test.map() function.

    Parameters:
        instance : sequence
            The values of one sample; the last element is treated as the target and is not
            passed to the models.
        models : list
            A list containing the trained models. Each model must offer predict(rows).

    Returns:
        list
            One prediction per model, in the order of models. The votes are not aggregated here.
    """
    # Everything but the last element is an input feature
    features = instance[:-1]

    votes = []
    for model in models:
        # Models predict on a batch, so wrap the single sample and unwrap the single result
        votes.append(model.predict([features])[0])

    return votes

In [0]:
def agg_predictions(predictions):
    """
    Combine the individual predictions for one instance by majority vote.

    Parameters:
        predictions : list
            A list containing the predictions (0 or 1) for the instance.

    Returns:
        float
            1.0 if strictly more predictions equal 1 than equal 0, otherwise 0.0
            (ties and an empty list therefore give 0.0).
    """
    votes_for_1 = predictions.count(1)
    votes_for_0 = predictions.count(0)

    # Class 1 wins only with a strict majority
    if votes_for_1 > votes_for_0:
        return 1.0
    return 0.0

In [0]:
def predict_wrapper(instance):
    """Predict one test instance with every tree in models."""
    return predict(instance, models)


# Per-tree predictions for every test instance (a single Spark job)
preds = rdd_test.map(predict_wrapper).collect()
print(preds[:10])

# Majority vote per instance, computed from the predictions that were already collected
# instead of running every tree over the test set a second time
preds_agg = [agg_predictions(votes) for votes in preds]
print(preds_agg[:10])

In [0]:
from pyspark.sql import Row


def transform(instance, models):
    """
    Extend an instance with the majority-vote prediction of the trained models.
    Requires a wrapper to be used with the rdd_test.map() function.

    Parameters:
        instance : Row
            The sample to predict; its last field is treated as the target by predict().
        models : list
            A list containing the trained models.

    Returns:
        Row
            A new Row with all fields of the instance plus a "raw_prediction" field holding
            the aggregated prediction.
    """
    # Majority vote over the predictions of all models
    vote = agg_predictions(predict(instance, models))

    # Copy the original fields and append the prediction as the last field
    fields = instance.asDict()
    return Row(**fields, raw_prediction=vote)

In [0]:
def transform_wrapper(instance):
    """Append the ensemble prediction of models to one test instance."""
    return transform(instance, models)


# Test set with an added "raw_prediction" column
df_pred = rdd_test.map(transform_wrapper).toDF()

##### Evaluation

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# Tip: BinaryClassificationEvaluator().explainParams() lists the available parameters

# Evaluator computing the area under the ROC curve from the "raw_prediction" column
# against the "outcome" label
evaluator_localRF = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="outcome",
    rawPredictionCol="raw_prediction",
)

In [0]:
# Area under the ROC curve of the example local-approach forest on the test set
evaluator_localRF.evaluate(df_pred)

##### Parameter Search CV

In [0]:
# Search space for the DecisionTreeClassifier used by the local approach:
# 3 * 1 * 3 * 3 * 3 * 3 * 2 = 486 combinations in total
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],
    "splitter": ["best"],  # "random" is the other option, it's generally worse
    "max_depth": [
        5,
        10,
        15,
    ],  # Might need to increase, or try None. Will see when fitting.
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 4],
    "max_features": [
        None,
        "sqrt",
        "log2",
    ],  # None: all features, "sqrt": square root of the number of features, "log2": log2 of it. Float proportions are also possible.
    "class_weight": [
        None,
        "balanced",
    ],  # "balanced" weights classes inversely proportional to their frequency; probably useful here because of the class imbalance.
}

from sklearn.model_selection import ParameterGrid

# Expand the grid into a list of dicts, one per hyperparameter combination
hyperparameter_combinations = list(ParameterGrid(param_grid))

print("Number of hyperparameter combinations:", len(hyperparameter_combinations))
print(hyperparameter_combinations[0])

In [0]:
from tqdm import tqdm


def parameter_search_CV(
    df: DataFrame,
    param_grid: list,
    num_partitions: int = 10,
    folds: int = 4,
    verbose: bool = False,
) -> tuple:
    """
    Perform hyperparameter search using cross-validation on the given DataFrame.

    The DataFrame is split randomly into `folds` parts. For each fold, the remaining parts are
    combined, partitioned with rf_partition_dataframe (partition_size=0.9) and used to train one
    tree per partition; the fold itself is used for validation. The train/validation RDDs are
    cached for the duration of the search and released again before the function returns.

    Parameters:
        df: DataFrame
            The DataFrame to perform hyperparameter search on.
        param_grid: list
            A list containing dictionaries of hyperparameters.
        num_partitions: int, optional
            The number of partitions to create, this will also serve as number of trees in the Random Forest. Default is 10.
        folds: int, optional
            The number of folds to use in cross-validation. Must be at least 2. Default is 4.
        verbose: bool, optional
            If set to True print the results of each hyperparameter combination. Default is False.

    Returns:
        tuple
            A tuple containing the best hyperparameters dict and the corresponding mean AUC value.
            If param_grid is empty, (None, -1.0) is returned.

    Raises:
        ValueError
            If folds is less than 2 (at least one fold is needed for training and one for validation).
    """
    if folds < 2:
        raise ValueError(
            "parameter_search_CV: folds must be greater than or equal to 2."
        )

    # Best hyperparameters and corresponding AUC
    best_hyperparameters = None
    best_auc = -1.0

    # Divide the dataframe into k folds
    folds_dfs = df.randomSplit([1 / folds] * folds, seed=37)

    # The evaluator does not depend on the hyperparameters, so it is created only once
    evaluator = BinaryClassificationEvaluator(
        labelCol="outcome",
        rawPredictionCol="raw_prediction",
        metricName="areaUnderROC",
    )

    # Prepared (train RDD, validation RDD) pairs, one per fold
    train_validation_splits = []

    try:
        for fold in range(folds):
            # Validation data: the current fold
            rdd_validation = folds_dfs[fold].rdd.repartition(2).cache()

            # Training data: union of all other folds
            train_dfs = [folds_dfs[i] for i in range(folds) if i != fold]
            df_train = train_dfs[0]
            for other_df in train_dfs[1:]:
                df_train = df_train.union(other_df)

            # Partition the training dataframe (one partition per tree)
            train_partitioned = rf_partition_dataframe(
                df_train, num_partitions=num_partitions, partition_size=0.9
            ).drop("partition")

            rdd_train = train_partitioned.rdd.cache()

            train_validation_splits.append((rdd_train, rdd_validation))

        # Evaluate every hyperparameter combination on all folds
        for param_dict in tqdm(param_grid):
            new_best = False

            # AUC values of the current combination, one per fold
            auc_list = []

            # Prepare build_wrapper with current hyperparameters
            build_wrapper = lambda partition_iter: build_local_tree(
                partition_iter, param_dict
            )

            # Start time
            time_start = time()

            for rdd_train, rdd_validation in train_validation_splits:
                # Train the models
                models = rdd_train.mapPartitions(build_wrapper).collect()

                # Transform the validation data (predictions)
                transform_wrapper = lambda instance: transform(instance, models)
                df_pred = rdd_validation.map(transform_wrapper).toDF()

                # Evaluate the predictions and store the AUC value
                auc_list.append(evaluator.evaluate(df_pred))

            # Calculate the mean AUC value for the current hyperparameters
            mean_auc = sum(auc_list) / len(auc_list)

            # Update the best hyperparameters and AUC value if the current hyperparameters are better
            if mean_auc > best_auc:
                best_auc = mean_auc
                best_hyperparameters = param_dict
                new_best = True

            # Print the results if verbose is enabled
            if verbose:
                print(f"Hyperparameters: {param_dict}")
                print(f"Mean AUC: {mean_auc}")
                print(f"New best: {new_best}")
                print(f"Time taken: {time() - time_start:.2f} seconds")
                print()
    finally:
        # Release the cached folds so repeated searches do not accumulate cached data
        for cached_train, cached_validation in train_validation_splits:
            cached_train.unpersist()
            cached_validation.unpersist()

    print("Number of hyperparameter combinations searched through:", len(param_grid))
    print("Best hyperparameters:", best_hyperparameters)
    print("Best mean AUC:", best_auc)

    return best_hyperparameters, best_auc

In [0]:
# Find the best hyperparameters and corresponding AUC value

# Quick alternative (disabled): search only 3 randomly chosen combinations with verbose output
# chosen_hyperparameters_combinations = random.sample(hyperparameter_combinations, 3)
# best_params_localRF, best_auc_localRF = parameter_search_CV(
#     train_df,
#     chosen_hyperparameters_combinations,
#     num_partitions=10,
#     folds=4,
#     verbose=True,
# )

# Full search over every combination with 4-fold cross-validation and 10 trees per forest
best_params_localRF, best_auc_localRF = parameter_search_CV(
    train_df,
    hyperparameter_combinations,
    num_partitions=10,
    folds=4,
    verbose=False,
)
# Result recorded from a previous run:
# Best hyperparameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 4, 'splitter': 'best'}
# Best mean AUC: 0.9880130419272364

In [0]:
# Report the outcome of the hyperparameter search
print(f"Best hyperparameters: {best_params_localRF}")
print(f"Best mean AUC: {best_auc_localRF}")

##### Time analysis - local approach RF

In [0]:
def time_analysis_train_and_predict(train_df: DataFrame) -> dict:
    """
    Time the training and prediction of the local-approach Random Forest model for growing fractions
    (10 %, 20 %, ..., 100 %) of the training DataFrame. Prediction is always done on rdd_test.

    Only training and prediction are timed. The partitioning of the training data is executed and
    cached before the timer starts, and the predictions are computed for every test row before the
    timer stops.

    Parameters:
    train_df : DataFrame
        The training DataFrame to train the Random Forest model on.

    Returns:
    dict
        A dictionary containing the fraction of DataFrame and the corresponding time taken for the training and prediction.
    """
    results = {}
    df_length = train_df.count()

    for percentage in range(10, 110, 10):
        fraction = percentage / 100

        # Take the number of rows corresponding to the current fraction
        partial_df = train_df.limit(int(df_length * fraction))

        # Prepare partitions for Random Forest (not taken into account for timing)
        rdd_partial = (
            rf_partition_dataframe(partial_df, num_partitions=10)
            .drop("partition")
            .rdd.cache()
        )
        # cache() is lazy: run an action now so that the partitioning work happens
        # before the timer starts instead of during training
        rdd_partial.count()

        try:
            # Time train and prediction
            start_time = time()

            # Train
            build_wrapper = lambda partition_iter: build_local_tree(
                partition_iter, example_parameters
            )
            models = rdd_partial.mapPartitions(build_wrapper).collect()

            # Predict
            transform_wrapper = lambda instance: transform(instance, models)
            df_pred = rdd_test.map(transform_wrapper).toDF()
            # toDF() does not process the whole RDD; count() forces the prediction of every test row
            df_pred.count()

            end_time = time()
        finally:
            # Free the cached partitions of this fraction before the next iteration
            rdd_partial.unpersist()

        # Store the results
        results[fraction] = end_time - start_time

    return results

In [0]:
# Run the train-and-predict timing of the local approach on growing fractions of train_df
times_train_and_predict_localRF = time_analysis_train_and_predict(train_df)

In [0]:
# Plot the local-approach timings; the title determines the PDF file name
plot_time_analysis(
    times_train_and_predict_localRF,
    "Time Analysis of Training and Prediction with Local approach Random Forest",
)

##### Build the best model based on parameter search CV

In [0]:
# Best hyperparameters found by the parameter search, hard-coded so that the search
# does not have to be repeated
best_params_localRF = dict(
    class_weight="balanced",
    criterion="entropy",
    max_depth=15,
    max_features="log2",
    min_samples_leaf=2,
    min_samples_split=4,
    splitter="best",
)


def build_wrapper(partition_iter):
    """Train one tree on a partition using the best hyperparameters."""
    return build_local_tree(partition_iter, best_params_localRF)


# Build the model based on the best hyperparameters found
best_models_localRF = rdd_train.mapPartitions(build_wrapper).collect()


def transform_wrapper(instance):
    """Append the prediction of the best local forest to one test instance."""
    return transform(instance, best_models_localRF)


# Predict
df_pred = rdd_test.map(transform_wrapper).toDF()

# Evaluate
evaluation_best_localRF = evaluator_localRF.evaluate(df_pred)

print(
    f"Evaluation of the best model with Local approach Random Forest: {evaluation_best_localRF:.6f}"
)

In [0]:
# Preview the first 5 test rows together with their "raw_prediction"
df_pred.show(5)

In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics

# Select the prediction and the true label, named the way MulticlassMetrics expects them
predictionAndLabels_localRF = df_pred.select(
    F.col("raw_prediction").alias("prediction"),
    F.col("outcome").alias("label"),
)

eval_lrf = MulticlassMetrics(predictionAndLabels_localRF.rdd)

# Overall accuracy plus precision / recall / F1 of the positive class (label 1.0)
accuracy_lrf = eval_lrf.accuracy
precision_lrf = eval_lrf.precision(1.0)
recall_lrf = eval_lrf.recall(1.0)
f1_lrf = eval_lrf.fMeasure(1.0, 1.0)
confusion_matrix_lrf = eval_lrf.confusionMatrix().toArray()


print(f"Accuracy: {accuracy_lrf:.6f}")
print(f"Precision: {precision_lrf:.6f}")
print(f"Recall: {recall_lrf:.6f}")
print(f"F1 Score: {f1_lrf:.6f}")
print("Confusion Matrix:")
print(confusion_matrix_lrf)

### Global Approach Random Forest

In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator


def build_global_rf(
    train_df: DataFrame, parameters: dict = None, param_grid: dict = None
) -> RandomForestClassificationModel:
    """
    Fit a pipeline consisting of a feature assembler and a random forest on the given training data.

    The forest is either trained directly (optionally with fixed hyperparameters) or, when a
    parameter grid is given, wrapped in a 4-fold cross-validation over that grid which is scored
    by the area under the ROC curve.

    Parameters:
        train_df (Dataframe): The training data as a DataFrame. Every column except "outcome" is
            used as a feature; "outcome" is the label.
        parameters (dict, optional): Keyword arguments for the random forest classifier. Defaults to None.
        param_grid (dict, optional): Mapping from parameter name to the list of values to try
            during cross-validation. Defaults to None.

    Returns:
        The fitted pipeline. Its stages are the assembler followed by either the trained random
        forest model or, if param_grid was given, the fitted cross-validator.

    Raises:
        ValueError: If both 'parameters' and 'param_grid' are provided.
    """
    both_given = (parameters is not None) and (param_grid is not None)
    if both_given:
        raise ValueError(
            "Only one of 'parameters' and 'param_grid' should be provided."
        )

    rf_kwargs = {} if parameters is None else parameters

    # Combine all feature columns into a single vector column
    feature_cols = [name for name in train_df.columns if name != "outcome"]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

    # Random forest classifier operating on the assembled features
    rf = RandomForestClassifier(
        labelCol="outcome", featuresCol="features", **rf_kwargs
    )

    if param_grid is None:
        # No grid: the forest itself is the final pipeline stage
        final_stage = rf
    else:
        # Translate the plain dict into Spark's parameter grid
        grid_builder = ParamGridBuilder()
        for param_name, param_values in param_grid.items():
            grid_builder = grid_builder.addGrid(rf.getParam(param_name), param_values)

        final_stage = CrossValidator(
            estimator=rf,
            estimatorParamMaps=grid_builder.build(),
            evaluator=BinaryClassificationEvaluator(
                labelCol="outcome",
                metricName="areaUnderROC",
            ),
            numFolds=4,
            seed=42,  # Seed for reproducibility, can be changed/removed
        )

    # Fit the pipeline on the train set
    return Pipeline(stages=[assembler, final_stage]).fit(train_df)

In [0]:
# Search space for the Spark RandomForestClassifier of the global approach
# (1 * 1 * 1 * 2 * 3 * 3 * 3 * 3 * 3 = 486 combinations, each cross-validated with 4 folds).
# Note: this rebinds the name param_grid.
param_grid = {
    "bootstrap": [
        True
    ],  # Good for our dataset to stop overfitting, reduces tree correlation
    "featureSubsetStrategy": [
        "auto"
    ],  # Number of features to consider for splitting at each node
    "subsamplingRate": [
        1.0
    ],  # Fraction of training data used for learning each decision tree
    "impurity": ["gini", "entropy"],
    "maxBins": [16, 32, 64],
    "maxDepth": [5, 10, 15],
    "minInfoGain": [0.0, 0.1, 0.2],
    "minInstancesPerNode": [1, 2, 4],
    "numTrees": [10, 20, 30],
}

# Fit the pipeline with cross-validation; stage 0 is the assembler, stage 1 the fitted
# cross-validator, whose bestModel is the winning random forest
global_rf_model = build_global_rf(train_df, param_grid=param_grid)
best_global_rf_model = global_rf_model.stages[1]
best_global_rf_model = best_global_rf_model.bestModel

In [0]:
# Read the hyperparameters of the winning random forest back from the model.
# NOTE(review): getNumTrees is read without parentheses, unlike the other getters —
# presumably it is exposed as a property on the model; confirm.
best_global_rf_params = {
    "bootstrap": best_global_rf_model.getBootstrap(),
    "featureSubsetStrategy": best_global_rf_model.getFeatureSubsetStrategy(),
    "subsamplingRate": best_global_rf_model.getSubsamplingRate(),
    "impurity": best_global_rf_model.getImpurity(),
    "maxBins": best_global_rf_model.getMaxBins(),
    "maxDepth": best_global_rf_model.getMaxDepth(),
    "minInfoGain": best_global_rf_model.getMinInfoGain(),
    "minInstancesPerNode": best_global_rf_model.getMinInstancesPerNode(),
    "numTrees": best_global_rf_model.getNumTrees,
}

print(best_global_rf_params)
# Output recorded from a previous run:
# {
#     "bootstrap": True,
#     "featureSubsetStrategy": "auto",
#     "subsamplingRate": 1.0,
#     "impurity": "gini",
#     "maxBins": 64,
#     "maxDepth": 15,
#     "minInfoGain": 0.0,
#     "minInstancesPerNode": 1,
#     "numTrees": 30,
# }

In [0]:
# Best hyperparameters of the global approach, hard-coded so that the cross-validation
# does not have to be repeated
best_global_rf_params = {
    "bootstrap": True,
    "featureSubsetStrategy": "auto",
    "subsamplingRate": 1.0,
    "impurity": "gini",
    "maxBins": 64,
    "maxDepth": 15,
    "minInfoGain": 0.0,
    "minInstancesPerNode": 1,
    "numTrees": 30,
}
# Train the final pipeline (assembler + random forest) with these fixed hyperparameters
global_rf = build_global_rf(train_df, parameters=best_global_rf_params)
# Alternative: reuse the pipeline fitted during cross-validation
# global_rf = global_rf_model

In [0]:
# Predict the test set with the global random forest pipeline
predictions_global_rf = global_rf.transform(test_df)
# Area under the ROC curve against the "outcome" label
evaluator_global_rf = BinaryClassificationEvaluator(
    labelCol="outcome", metricName="areaUnderROC"
)

evaluation_best_global_rf = evaluator_global_rf.evaluate(predictions_global_rf)
print(
    f"Evaluation of the best model with Global approach Random Forest: {evaluation_best_global_rf:.6f}"
)

In [0]:
# Classification metrics of the global approach from (prediction, outcome) pairs
eval_grf = MulticlassMetrics(predictions_global_rf.select("prediction", "outcome").rdd)

# Overall accuracy plus precision / recall / F1 of the positive class (label 1.0)
accuracy_grf = eval_grf.accuracy
precision_grf = eval_grf.precision(1.0)
recall_grf = eval_grf.recall(1.0)
f1_grf = eval_grf.fMeasure(1.0, 1.0)
confusion_matrix_grf = eval_grf.confusionMatrix().toArray()

print(f"Accuracy: {accuracy_grf:.6f}")
print(f"Precision: {precision_grf:.6f}")
print(f"Recall: {recall_grf:.6f}")
print(f"F1 Score: {f1_grf:.6f}")
print("Confusion Matrix:")
print(confusion_matrix_grf)

In [0]:
import pandas as pd


# Side-by-side comparison of both approaches: one column per approach, one row per metric
metrics = pd.DataFrame(
    {
        "Local RF": [
            evaluation_best_localRF,
            accuracy_lrf,
            precision_lrf,
            recall_lrf,
            f1_lrf,
        ],
        "Global RF": [
            evaluation_best_global_rf,
            accuracy_grf,
            precision_grf,
            recall_grf,
            f1_grf,
        ],
    },
    index=["Area Under ROC", "Accuracy", "Precision", "Recall", "F1"],
)

display(metrics)

# Output recorded from a previous run:
# 	Local RF	Global RF
# Area Under ROC	0.804701	0.932821
# Accuracy	0.964200	0.964200
# Precision	0.388889	0.357143
# Recall	0.636364	0.454545
# F1	0.482759	0.400000

In [0]:
import seaborn as sns

# Increase font size
sns.set(font_scale=1.5)


def plot_confusion_matrix(confusion_matrix, title, axis=None):
    """
    Draw the confusion matrix as an annotated heatmap without a colour bar.

    Parameters:
    confusion_matrix : np.ndarray
        The confusion matrix to plot.

    title : str
        The title of the plot. It is currently not drawn on the figure.

    axis : matplotlib.axes.Axes, optional
        The axis on which to plot the confusion matrix. If not provided, a new 6x6 figure will be created.
    """
    if axis is None:
        # No axis supplied: open a new figure and draw on its axes
        axis = plt.figure(figsize=(6, 6)).gca()

    heatmap_options = dict(
        annot=True,
        fmt="g",
        cmap="Blues",
        cbar=False,
        annot_kws={"size": 15},
        ax=axis,
    )
    sns.heatmap(confusion_matrix, **heatmap_options)

    axis.set_xlabel("Predicted")
    axis.set_ylabel("Actual")
    # axis.set_title(title)


# Plot and save the confusion matrix of each approach
plot_confusion_matrix(confusion_matrix_lrf, "Local RF Confusion Matrix")
plt.savefig("Local_RF_Confusion_Matrix_notitle.pdf")
plot_confusion_matrix(confusion_matrix_grf, "Global RF Confusion Matrix")
plt.savefig("Global_RF_Confusion_Matrix_notitle.pdf")

In [0]:
# Run build_global_rf(...).transform(test_df) with the best hyperparameters on growing
# fractions of the training set and record the time of each run
times_train_and_predict_global_rf = time_analysis(
    train_df,
    lambda partial_train_df: build_global_rf(
        partial_train_df, parameters=best_global_rf_params
    ).transform(test_df),
)

In [0]:
# Undo the seaborn styling applied for the confusion matrices before plotting the timings
sns.reset_orig()
# Plot the global-approach timings; the title determines the PDF file name
plot_time_analysis(
    times_train_and_predict_global_rf,
    "Time Analysis of Training and Prediction with Global approach Random Forest",
)