I am performing **bagging on XGBoost**. Although XGBoost is typically associated with **boosting**, it is flexible enough to be used in a bagging ensemble as well.

### Implementing Bagging with XGBoost
1. Train multiple XGBoost models on **random subsets** of my training data.
2. Average the predictions of these models (for regression).

This approach is also referred to as **"Randomized XGBoost"** or **"Bootstrap Aggregating with XGBoost"**.

In [None]:
import importlib
import subprocess
import sys
import gc

def check_and_install_package(package_name, version=None):
    try:
        importlib.import_module(package_name)
        print(f"\n{package_name} is already installed.")
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
        if version:
            subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package_name}=={version}"])
        else:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        print(f"{package_name} installation completed.")

# List of packages to check along with specific versions if necessary
packages = [
    {"name": "tqdm", "version": None},
    {"name": "pyspark", "version": "3.1.1"},
    {"name": "gdown", "version": None},
    {"name": "numpy", "version": "1.22.4"},
    {"name": "xgboost", "version": None},
    {"name": "sparkxgb", "version": None},
]

# Checking and installing packages
for package in packages:
    check_and_install_package(package["name"], package["version"])


tqdm is already installed.

pyspark is already installed.

gdown is already installed.

numpy is already installed.

xgboost is already installed.

sparkxgb is already installed.


In [None]:
!pip install numpy==1.22.4



In [None]:
import numpy
print(numpy.__version__)

1.22.4


In [None]:
!pip install sparkxgb



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil

# Defining local resources directory
local_resources_path = "/resources"
os.makedirs(local_resources_path, exist_ok=True)

# Defining the source paths from mounted Google Drive
xgboost4j_source = "/content/drive/MyDrive/Big Data Analytics - Project/resources/xgboost4j_2.12-1.7.6.jar"
xgboost4j_spark_source = "/content/drive/MyDrive/Big Data Analytics - Project/resources/xgboost4j-spark_2.12-1.7.6.jar"

# Defining the destination paths in the instance's local file system
xgboost4j_dest = os.path.join(local_resources_path, "xgboost4j_2.12-1.7.6.jar")
xgboost4j_spark_dest = os.path.join(local_resources_path, "xgboost4j-spark_2.12-1.7.6.jar")

# Copying the files from Google Drive to the local instance
shutil.copyfile(xgboost4j_source, xgboost4j_dest)
shutil.copyfile(xgboost4j_spark_source, xgboost4j_spark_dest)

# Verifying that the files are copied
print(f"Jar Files copied to: {local_resources_path}")
print(os.listdir(local_resources_path))


Jar Files copied to: /resources
['xgboost4j-spark_2.12-1.7.6.jar', 'xgboost4j_2.12-1.7.6.jar']


In [None]:
from pyspark.sql import SparkSession

# Defining the path to the copied jar files in the local instance
jar_files = "/resources/xgboost4j_2.12-1.7.6.jar,/resources/xgboost4j-spark_2.12-1.7.6.jar"

# Initializing Spark session with the JAR files
spark = SparkSession.builder \
    .appName("XGBoostRegressor") \
    .config("spark.driver.memory", "120g") \
    .config("spark.executor.memory", "120g") \
    .config("spark.driver.maxResultSize", "40g") \
    .config("spark.executor.memoryOverhead", "40g") \
    .config("spark.executor.cores", "5") \
    .config("spark.kryoserializer.buffer.max", "2047m") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.sql.shuffle.partitions", "400") \
    .config("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem") \
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4 -XX:ParallelGCThreads=4") \
    .config("spark.jars", jar_files) \
    .getOrCreate()

# Verifying Spark session creation
print(f"Spark session started with version: {spark.version}")

Spark session started with version: 3.1.1


In [None]:
# Testing if sparkxgb is loaded properly
try:
    from sparkxgb import XGBoostRegressor

    model = XGBoostRegressor()
    print("sparkxgb loaded successfully!")
except Exception as e:
    print(f"Error loading sparkxgb: {e}")


sparkxgb loaded successfully!


In [None]:
!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

output_path = '/content/Feature_Engineered_DF.parquet'
df = spark.read.parquet(output_path)
print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [None]:
# Printing the shape of the DataFrame
total_rows = df.count()
total_columns = len(df.columns)

print(f"The shape of the loaded DataFrame is: ({total_rows}, {total_columns})")

The shape of the loaded DataFrame is: (3000040, 47)


In [None]:
# Calculating the average price
avg_price = df.agg({"price": "avg"}).collect()[0][0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Converting the Spark DataFrame to a Pandas DataFrame and displaying 5 random rows with all columns
pd.set_option('display.max_columns', None)
pandas_df = df.orderBy(F.rand()).limit(5).toPandas()
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,manufactured_year,combined_fuel_economy,legroom,log_mileage,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,SUV / Crossover,Ellicott City,15.0,98,21043,3600.0,V6,Red,False,22.0,69.9,22.0,288.0,Black,False,39.2831,203.7,RED,-76.810097,Chevrolet,7.0,Traverse,26498.0,2096,1.0,Hi Lo Auto Sales - Ellicott City,270.0,A,6-Speed Automatic,All-Wheel Drive,118.9,78.5,2017,18.5,78.1,10.61,14,0.23,0.02021,4,6,2020,3,21,38,37
1,Gasoline,Sedan,Cincinnati,25.0,33,45251,1400.0,I4,Gray,True,12.2,59.7,35.0,138.0,Black,False,39.230701,173.1,GRAY,-84.590103,Chevrolet,5.0,Sonic,10995.0,0,4.551724,Joseph Chevrolet,148.0,A,Automatic,Front-Wheel Drive,99.4,68.3,2012,30.0,76.4,11.06,4,1.46,1.34409,9,8,2020,8,19,29,29
2,Flex Fuel Vehicle,Pickup Truck,Arlington,22.690001,210,2476,6000.0,V8,White,True,36.0,77.9,29.469999,360.0,Black,False,42.422298,224.4,WHITE,-71.173203,GMC,3.0,Sierra 3500HD,29904.0,4927,4.0,Mirak Chevrolet,765.0,A,6-Speed Automatic,Four-Wheel Drive,133.6,80.1,2016,26.08,80.16,10.99,2,2.98,5.87783,12,2,2020,4,18,42,28
3,Gasoline,Sedan,Orem,27.0,44,84058,2500.0,I4,Red,True,13.2,57.3,36.0,184.0,Black,False,40.273602,180.3,RED,-111.694,Mazda,5.0,MAZDA3,19650.0,80,4.625,Orem Mazda,185.0,A,6-Speed Automatic,Front-Wheel Drive,106.3,70.7,2018,31.5,78.0,9.93,9,0.24,0.53375,29,7,2020,2,24,37,31
4,Gasoline,Pickup Truck,Anaheim,16.0,6,92806,6200.0,V8,White,True,26.0,74.2,22.0,420.0,Black,False,33.815701,229.3,WHITE,-117.874001,GMC,6.0,Sierra 1500,44486.0,2022,4.961538,Hardin Buick GMC,383.0,A,8-Speed Automatic,4X2,143.5,80.0,2018,19.0,86.2,10.04,12,4.87,2.12459,6,9,2020,2,31,46,38




---



# **Bagging**

## **100k records**

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from sparkxgb import XGBoostRegressor
import pyspark.sql.functions as F
from tqdm import tqdm
import time
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

# Parameters
num_models = 3  # Number of XGBoost models to train
sample_fraction = 0.8  # Fraction of data for each subset
seed = 42  # Seed for reproducibility

start_time = time.time()

with tqdm(total=6, desc="Processing and Training") as pbar:
    df_sample = df.sample(fraction=0.033, seed=seed)  # Randomly sample ~100k records of the data
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric if needed
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    transformed_df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Splitting the data
    train_df, test_df = transformed_df_sample.randomSplit([0.8, 0.2], seed=seed)
    pbar.update(1)

    # Caching the test data for faster access
    test_df = test_df.cache()
    pbar.update(1)


# Initializing an empty DataFrame for storing combined predictions
all_predictions = None
print("\n")

# Training multiple XGBoost Regressor models on different subsets of training data
for i in range(num_models):
    print(f"Training XGB model {i + 1}...")

    # Sampling a random subset of training data
    train_subset = train_df.sample(fraction=sample_fraction, seed=seed + i)

    # Training XGBoost Regressor model
    xgb = XGBoostRegressor(
        featuresCol="scaled_features",
        labelCol="price",
        maxDepth=6,                         # Maximum depth of each tree
        numRound=100,                       # Number of boosting rounds
        seed=seed + i,
        objective="reg:squarederror"        # Regression objective
    )

    model = xgb.fit(train_subset)

    # Getting predictions for the current model
    current_predictions = model.transform(test_df).select(
        "price",
        F.col("prediction").alias(f"pred_{i}")
    )

    # Combining predictions: joining depending on iteration
    if all_predictions is None:
        all_predictions = current_predictions
    else:
        all_predictions = all_predictions.join(current_predictions, on="price")

    # Unpersisting train_subset after training the model
    train_subset.unpersist()

# Calculating average prediction across models
pred_cols = [f"pred_{i}" for i in range(num_models)]
all_predictions = all_predictions.withColumn(
    "final_prediction",
    sum(F.col(col) for col in pred_cols) / len(pred_cols)
)

all_predictions.cache()

# Evaluating using the cached DataFrame
evaluator = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="r2")
r2 = evaluator.evaluate(all_predictions)

print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculating other metrics using the same cached DataFrame
mae = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="mae").evaluate(all_predictions)
rmse = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="rmse").evaluate(all_predictions)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

train_size = train_df.count()
print(f"\nTraining data size: {train_size} samples")

# Calculating the total runtime
end_time = time.time()
print(f"\nOverall runtime: {(end_time - start_time) / 60:.2f} minutes")


Processing and Training: 100%|██████████| 6/6 [00:12<00:00,  2.07s/it]




Training XGB model 1...
Training XGB model 2...
Training XGB model 3...

R-Squared Score (Accuracy): 93.59%
MAE: 1343.58
RMSE: 1888.64

Training data size: 78736 samples

Overall runtime: 35.65 minutes


In [None]:
from pyspark.sql import functions as F

# function to calculate MAPE
def calculate_mape(df, label_col="price", prediction_col="final_prediction"):
    mape_df = df.withColumn("abs_percentage_error",
                            F.abs((F.col(label_col) - F.col(prediction_col)) / F.col(label_col)))
    mape = mape_df.select(F.mean("abs_percentage_error")).collect()[0][0]
    return mape * 100  # MAPE as a percentage

# function to calculate SMAPE
def calculate_smape(df, label_col="price", prediction_col="final_prediction"):
    smape_df = df.withColumn("symmetric_absolute_percentage_error",
                             2 * F.abs(F.col(label_col) - F.col(prediction_col)) /
                             (F.abs(F.col(label_col)) + F.abs(F.col(prediction_col)))
                            )
    smape = smape_df.select(F.mean("symmetric_absolute_percentage_error")).collect()[0][0]
    return smape * 100  # SMAPE as a percentage

# Calculating MAPE
mape_value = calculate_mape(all_predictions)
print(f"Mean Absolute Percentage Error (MAPE): {mape_value:.2f}%")

# Calculating SMAPE
smape_value = calculate_smape(all_predictions)
print(f"Symmetric Mean Absolute Percentage Error (SMAPE): {smape_value:.2f}%")


Mean Absolute Percentage Error (MAPE): 10.39%
Symmetric Mean Absolute Percentage Error (SMAPE): 9.74%


In [None]:
from pyspark.sql import functions as F

# Defining error ranges for the distribution table up to 100%
error_ranges = {
    "0-10%": (0.0, 0.10),
    "10-20%": (0.10, 0.20),
    "20-30%": (0.20, 0.30),
    "30-40%": (0.30, 0.40),
    "40-50%": (0.40, 0.50),
    "50-60%": (0.50, 0.60),
    "60-70%": (0.60, 0.70),
    "70-80%": (0.70, 0.80),
    "80-90%": (0.80, 0.90),
    "90-100%": (0.90, 1.0)
}

distribution_results = {}

# Calculating the distribution for each error range
for label, (lower, upper) in error_ranges.items():
    within_range = all_predictions.withColumn(
        "in_range",
        F.when(
            (F.abs((F.col("price") - F.col("final_prediction")) / F.col("price")) >= lower) &
            (F.abs((F.col("price") - F.col("final_prediction")) / F.col("price")) < upper),
            1
        ).otherwise(0)
    )

    percentage_in_range = within_range.agg(F.mean("in_range")).collect()[0][0] * 100
    distribution_results[label] = percentage_in_range

# Printing the distribution table
print("\nError Range Distribution Table for XGBoost Model:")
print("\n")
print(f"{'Error Range':<15} | {'Percentage of Total Predictions (%)':<10}")
print("-" * 50)
for error_range, percentage in distribution_results.items():
    print(f"{error_range:<15} | {percentage:<10.2f}")



Error Range Distribution Table for XGBoost Model:


Error Range     | Percentage of Total Predictions (%)
--------------------------------------------------
0-10%           | 63.43     
10-20%          | 24.45     
20-30%          | 7.00      
30-40%          | 2.59      
40-50%          | 1.12      
50-60%          | 0.50      
60-70%          | 0.29      
70-80%          | 0.22      
80-90%          | 0.15      
90-100%         | 0.08      




---



## **300k**

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from sparkxgb import XGBoostRegressor
import pyspark.sql.functions as F
from tqdm import tqdm
import time
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

# Parameters
num_models = 3  # Number of XGBoost models to train
sample_fraction = 0.8  # Fraction of data for each subset
seed = 42  # Seed for reproducibility

start_time = time.time()

with tqdm(total=6, desc="Processing and Training") as pbar:
    df_sample = df.sample(fraction=0.1, seed=seed)  # Randomly sample 300k records of the data
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric if needed
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    transformed_df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Splitting the data
    train_df, test_df = transformed_df_sample.randomSplit([0.8, 0.2], seed=seed)
    pbar.update(1)

    # Caching the test data for faster access
    test_df = test_df.cache()
    pbar.update(1)


# Initializing an empty DataFrame for storing combined predictions
all_predictions = None
print("\n")

# Training multiple XGBoost Regressor models on different subsets of training data
for i in range(num_models):
    print(f"Training XGB model {i + 1}...")

    # Sampling a random subset of training data
    train_subset = train_df.sample(fraction=sample_fraction, seed=seed + i)

    # Training XGBoost Regressor model
    xgb = XGBoostRegressor(
        featuresCol="scaled_features",
        labelCol="price",
        maxDepth=6,                   # Maximum depth of each tree
        numRound=100,                 # Number of boosting rounds
        seed=seed + i,
        objective="reg:squarederror"  # Regression objective
    )

    model = xgb.fit(train_subset)

    # Getting predictions for the current model
    current_predictions = model.transform(test_df).select(
        "price",
        F.col("prediction").alias(f"pred_{i}")
    )

    # Combining predictions: joining depending on iteration
    if all_predictions is None:
        all_predictions = current_predictions
    else:
        all_predictions = all_predictions.join(current_predictions, on="price")

    # Unpersisting train_subset after training the model
    train_subset.unpersist()

# Calculating average prediction across models
pred_cols = [f"pred_{i}" for i in range(num_models)]
all_predictions = all_predictions.withColumn(
    "final_prediction",
    sum(F.col(col) for col in pred_cols) / len(pred_cols)
)

all_predictions.cache()

# Evaluating using the cached DataFrame
evaluator = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="r2")
r2 = evaluator.evaluate(all_predictions)

print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculating other metrics using the same cached DataFrame
mae = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="mae").evaluate(all_predictions)
rmse = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="rmse").evaluate(all_predictions)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

train_size = train_df.count()
print(f"\nTraining data size: {train_size} samples")

# Calculating the total runtime
end_time = time.time()
print(f"\nOverall runtime: {(end_time - start_time) / 60:.2f} minutes")


Processing and Training: 100%|██████████| 6/6 [00:18<00:00,  3.10s/it]

Training XGB model 1...
Training XGB model 2...
Training XGB model 3...

R-Squared Score (Accuracy): 93.06%
MAE: 1290.44
RMSE: 1820.86

Training data size: 240048 samples

Overall runtime: 149.71 minutes


In [None]:
from pyspark.sql import functions as F

# Defining error ranges for the distribution table up to 100%
error_ranges = {
    "0-10%": (0.0, 0.10),
    "10-20%": (0.10, 0.20),
    "20-30%": (0.20, 0.30),
    "30-40%": (0.30, 0.40),
    "40-50%": (0.40, 0.50),
    "50-60%": (0.50, 0.60),
    "60-70%": (0.60, 0.70),
    "70-80%": (0.70, 0.80),
    "80-90%": (0.80, 0.90),
    "90-100%": (0.90, 1.0)
}

distribution_results = {}

# Calculating the distribution for each error range
for label, (lower, upper) in error_ranges.items():
    within_range = all_predictions.withColumn(
        "in_range",
        F.when(
            (F.abs((F.col("price") - F.col("final_prediction")) / F.col("price")) >= lower) &
            (F.abs((F.col("price") - F.col("final_prediction")) / F.col("price")) < upper),
            1
        ).otherwise(0)
    )

    percentage_in_range = within_range.agg(F.mean("in_range")).collect()[0][0] * 100
    distribution_results[label] = percentage_in_range

# Printing the distribution table
print("\nError Range Distribution Table for XGBoost Model:")
print("\n")
print(f"{'Error Range':<15} | {'Percentage of Total Predictions (%)':<10}")
print("-" * 50)
for error_range, percentage in distribution_results.items():
    print(f"{error_range:<15} | {percentage:<10.2f}")



Error Range Distribution Table for XGBoost Model:


Error Range     | Percentage of Total Predictions (%)
--------------------------------------------------
0-10%           | 64.02     
10-20%          | 24.23     
20-30%          | 6.95      
30-40%          | 2.35      
40-50%          | 1.03      
50-60%          | 0.53      
60-70%          | 0.27      
70-80%          | 0.16      
80-90%          | 0.12      
90-100%         | 0.09      




---


# **Experimentation**

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from sparkxgb import XGBoostRegressor
import pyspark.sql.functions as F
from tqdm import tqdm
import time
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

# Parameters
num_models = 3  # Number of XGBoost models to train
sample_fraction = 0.8  # Fraction of data for each subset
seed = 42  # Seed for reproducibility

start_time = time.time()

with tqdm(total=6, desc="Processing and Training") as pbar:
    df_sample = df.sample(fraction=0.033, seed=seed)  # Randomly sample ~100k records of the data
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric if needed
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    transformed_df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Splitting the data
    train_df, test_df = transformed_df_sample.randomSplit([0.8, 0.2], seed=seed)
    pbar.update(1)

    # Caching the test data for faster access
    test_df = test_df.cache()
    pbar.update(1)


# Initializing an empty DataFrame for storing combined predictions
all_predictions = None
print("\n")

# Training multiple XGBoost Regressor models on different subsets of training data
for i in range(num_models):
    print(f"Training XGB model {i + 1}...")

    # Sampling a random subset of training data
    train_subset = train_df.sample(fraction=sample_fraction, seed=seed + i)

    # Training XGBoost Regressor model
    xgb = XGBoostRegressor(
        featuresCol="scaled_features",
        labelCol="price",
        maxDepth=6,                         # Maximum depth of each tree
        numRound=100,                       # Number of boosting rounds
        seed=seed + i,
        objective="reg:squarederror"        # Regression objective
    )

    model = xgb.fit(train_subset)

    # Getting predictions for the current model
    current_predictions = model.transform(test_df).select(
        "price",
        F.col("prediction").alias(f"pred_{i}")
    )

    # Combining predictions: joining depending on iteration
    if all_predictions is None:
        all_predictions = current_predictions
    else:
        all_predictions = all_predictions.join(current_predictions, on="price")

    # Unpersisting train_subset after training the model
    train_subset.unpersist()

# Calculating average prediction across models
pred_cols = [f"pred_{i}" for i in range(num_models)]
all_predictions = all_predictions.withColumn(
    "final_prediction",
    sum(F.col(col) for col in pred_cols) / len(pred_cols)
)

all_predictions.cache()

# Evaluating using the cached DataFrame
evaluator = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="r2")
r2 = evaluator.evaluate(all_predictions)

print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculating other metrics using the same cached DataFrame
mae = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="mae").evaluate(all_predictions)
rmse = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="rmse").evaluate(all_predictions)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

train_size = train_df.count()
print(f"\nTraining data size: {train_size} samples")

# Calculating the total runtime
end_time = time.time()
print(f"\nOverall runtime: {(end_time - start_time) / 60:.2f} minutes")


Processing and Training: 100%|██████████| 6/6 [00:11<00:00,  1.92s/it]




Training XGB model 1...
Training XGB model 2...
Training XGB model 3...

R-Squared Score (Accuracy): 93.59%
MAE: 1343.58
RMSE: 1888.64

Training data size: 78736 samples

Overall runtime: 36.05 minutes


In [None]:
from pyspark.sql import functions as F

# Defining error ranges for the distribution table up to 100%
error_ranges = {
    "0-10%": (0.0, 0.10),
    "10-20%": (0.10, 0.20),
    "20-30%": (0.20, 0.30),
    "30-40%": (0.30, 0.40),
    "40-50%": (0.40, 0.50),
    "50-60%": (0.50, 0.60),
    "60-70%": (0.60, 0.70),
    "70-80%": (0.70, 0.80),
    "80-90%": (0.80, 0.90),
    "90-100%": (0.90, 1.0)
}

distribution_results = {}

# Calculating the distribution for each error range
for label, (lower, upper) in error_ranges.items():
    within_range = all_predictions.withColumn(
        "in_range",
        F.when(
            (F.abs((F.col("price") - F.col("final_prediction")) / F.col("price")) >= lower) &
            (F.abs((F.col("price") - F.col("final_prediction")) / F.col("price")) < upper),
            1
        ).otherwise(0)
    )

    percentage_in_range = within_range.agg(F.mean("in_range")).collect()[0][0] * 100
    distribution_results[label] = percentage_in_range

# Printing the distribution table
print("\nError Range Distribution Table for XGBoost Model:")
print("\n")
print(f"{'Error Range':<15} | {'Percentage of Total Predictions (%)':<10}")
print("-" * 50)
for error_range, percentage in distribution_results.items():
    print(f"{error_range:<15} | {percentage:<10.2f}")



Error Range Distribution Table for XGBoost Model:


Error Range     | Percentage of Total Predictions (%)
--------------------------------------------------
0-10%           | 63.43     
10-20%          | 24.45     
20-30%          | 7.00      
30-40%          | 2.59      
40-50%          | 1.12      
50-60%          | 0.50      
60-70%          | 0.29      
70-80%          | 0.22      
80-90%          | 0.15      
90-100%         | 0.08      


In [None]:
# Define the error range for best performance
best_performance_df = all_predictions.withColumn(
    "error_percentage",
    F.abs((F.col("price") - F.col("final_prediction")) / F.col("price"))
).filter(
    (F.col("error_percentage") >= 0.0) & (F.col("error_percentage") < 0.01)
)

# Join with the original data to bring in all features
best_performance_full_data = best_performance_df.join(
    df, on="price", how="inner"
)

# Select relevant columns to view, including all features and error metrics
best_performance_full_data = best_performance_full_data.select(
    "price", "final_prediction", "error_percentage", *df.columns
)

In [None]:
# Show the data that caused the best-performing predictions
print("Best Performing Data (<=1% Error Range) with Original Features:\n")

# Import necessary PySpark functions
from pyspark.sql import functions as F
pandas_df = best_performance_full_data.sample(fraction=0.1, seed=42).limit(10).toPandas()
display(pandas_df)

Best Performing Data (<=1% Error Range) with Original Features:



Unnamed: 0,price,final_prediction,error_percentage,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listing_color,longitude,make_name,maximum_seating,model_name,price.1,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,manufactured_year,combined_fuel_economy,legroom,log_mileage,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,9995.0,9914.460612,0.008058,Gasoline,Coupe,Polk City,22.690001,23,50226,2500.0,V6,Red,False,18.030001,65.87,29.469999,241.0,Mixed Colors,False,41.627701,189.8,RED,-93.649696,Ford,5.0,Mustang,9995.0,0,5.0,Nelson Automotive,265.22,A,Automatic,Unknown,111.0,77.2,2007,26.08,80.16,11.08,2,0.03,0.0,18,8,2020,13,16,31,28
1,9995.0,9914.460612,0.008058,Gasoline,Sedan,Wichita,22.0,358,67209,1800.0,I4,Blue,False,15.6,58.1,35.0,138.0,Black,False,37.6642,181.0,BLUE,-97.472099,Chevrolet,5.0,Cruze,9995.0,466,4.8,America's Auto Mall - Wichita,125.0,A,Automatic,Front-Wheel Drive,105.7,70.7,2013,28.5,77.5,11.19,3,1.08,1.60782,19,9,2019,6,13,29,29
2,9995.0,9914.460612,0.008058,Gasoline,Sedan,Sacramento,40.0,59,95825,2000.0,I4,Silver,False,14.0,58.0,36.0,195.0,Black,False,38.5961,191.8,SILVER,-121.413002,Ford,5.0,Fusion Energi,9995.0,203,4.166667,Right Cars Auto Sales,265.22,CVT,Continuously Variable Transmission,Front-Wheel Drive,112.2,72.9,2014,38.0,82.6,11.53,12,0.43,-1e-05,14,7,2020,6,17,35,31
3,9995.0,9914.460612,0.008058,Gasoline,Sedan,Redlands,28.0,41,92373,1800.0,I4,Beige,False,12.8,56.5,38.0,148.0,Mixed Colors,False,34.060101,178.3,BROWN,-117.196999,Hyundai,5.0,Elantra,9995.0,11,5.0,Naders Auto Sales,265.22,A,6-Speed Automatic,Front-Wheel Drive,106.3,69.9,2013,33.0,76.7,10.86,1,0.98,-1e-05,2,8,2020,7,15,29,26
4,9995.0,9914.460612,0.008058,Gasoline,Sedan,Yorktown,27.0,227,47396,2400.0,I4,Other,False,17.200001,57.7,36.0,185.0,Mixed Colors,False,40.171101,191.4,UNKNOWN,-85.500198,Honda,5.0,Accord,9995.0,1037,4.0,Sunset Motors,181.0,CVT,Continuously Variable Transmission,Front-Wheel Drive,109.3,72.8,2014,31.5,81.0,11.78,4,0.29,0.55155,26,1,2020,6,11,33,26
5,9995.0,9914.460612,0.008058,Gasoline,SUV / Crossover,Corpus Christi,23.0,72,78412,2500.0,I4,White,False,15.9,65.3,28.0,170.0,Other,False,27.716499,183.3,WHITE,-97.362198,Nissan,5.0,Rogue,9995.0,761,5.0,Richard Sanchez Motors,175.0,CVT,Continuously Variable Transmission,Front-Wheel Drive,105.9,70.9,2012,25.5,77.8,11.61,1,0.3,0.73244,2,7,2020,8,15,31,25
6,9995.0,9914.460612,0.008058,Gasoline,SUV / Crossover,Delran,22.0,42,8075,2500.0,I4,Purple,False,15.9,66.3,26.0,170.0,Black,False,40.010601,183.3,PURPLE,-74.970398,Nissan,5.0,Rogue,9995.0,637,4.857143,EGR auto sales,175.0,CVT,Continuously Variable Transmission,All-Wheel Drive,105.9,70.9,2012,24.0,77.8,11.49,6,0.3,0.73244,30,7,2020,8,15,31,27
7,9995.0,9914.460612,0.008058,Gasoline,Pickup Truck,Westminster,22.690001,53,80030,5300.0,Gasoline engine,Silver,False,26.0,73.9,29.469999,310.0,Gray,False,39.825699,230.2,SILVER,-105.030998,GMC,6.0,Sierra 1500,9995.0,0,4.0,Choice One Motors,335.0,A,Automatic,Four-Wheel Drive,143.5,78.5,2006,26.08,75.0,12.11,1,1.27,0.45563,21,7,2020,14,14,33,29
8,9995.0,9914.460612,0.008058,Flex Fuel Vehicle,Van,Oklahoma City,11.0,441,73141,5400.0,V8,Silver,False,33.0,83.4,15.0,255.0,Gray,False,35.493,216.7,SILVER,-97.366402,Ford,12.0,E-Series,9995.0,1828,4.178571,A & G Auto Inc.,420.0,A,Automatic,Rear-Wheel Drive,138.0,95.7,2013,13.0,78.8,11.63,1,0.16,0.11994,28,6,2019,6,13,37,28
9,9995.0,9914.460612,0.008058,Gasoline,Sedan,Lincoln,43.0,88,62656,2000.0,I4,Other,True,14.0,58.0,41.0,188.0,Other,False,40.1577,191.8,BLACK,-89.391098,Ford,5.0,Fusion Energi,9995.0,1635,4.75,Lincoln Chrysler Dodge Jeep,129.0,CVT,Continuously Variable Transmission,Front-Wheel Drive,112.2,83.5,2017,42.0,82.6,11.99,7,0.49,0.84935,14,6,2020,3,18,38,31




---



## **Understanding what caused the Best results (<=1% error)**

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Step 1: Calculate average price per brand and assign price category
brand_avg_price_df = best_performance_full_data.groupBy("make_name").agg(F.round(F.mean("price"), 0).alias("avg_price"))
brand_avg_price_df = brand_avg_price_df.withColumn(
    "price_category",
    F.when(F.col("avg_price") < 18450, "Budget")
    .when((F.col("avg_price") >= 18450) & (F.col("avg_price") < 26475), "Mid-range")
    .when((F.col("avg_price") >= 26475) & (F.col("avg_price") < 38210), "High-end")
    .when((F.col("avg_price") >= 38210) & (F.col("avg_price") <= 50506), "Luxury")
    .otherwise("Ultra-luxury")
)

# Collect brands by price category
budget_brands = brand_avg_price_df.filter(F.col("price_category") == "Budget").select("make_name").rdd.flatMap(lambda x: x).collect()
mid_range_brands = brand_avg_price_df.filter(F.col("price_category") == "Mid-range").select("make_name").rdd.flatMap(lambda x: x).collect()
high_end_brands = brand_avg_price_df.filter(F.col("price_category") == "High-end").select("make_name").rdd.flatMap(lambda x: x).collect()
luxury_brands = brand_avg_price_df.filter(F.col("price_category") == "Luxury").select("make_name").rdd.flatMap(lambda x: x).collect()
ultra_luxury_brands = brand_avg_price_df.filter(F.col("price_category") == "Ultra-luxury").select("make_name").rdd.flatMap(lambda x: x).collect()

# Print brands in each category
print("Budget brands:", budget_brands)
print("Mid-range brands:", mid_range_brands)
print("High-end brands:", high_end_brands)
print("Luxury brands:", luxury_brands)
print("Ultra-luxury brands:", ultra_luxury_brands)

# Step 2: Define UDFs and register them
def mileage_score_udf(log_mileage):
    return 10 if log_mileage < 2.3 else 7 if log_mileage < 8.91 else 4 if log_mileage < 10.63 else 1

def age_score_udf(age):
    return 10 if age <= 2 else 7 if age <= 5 else 4 if age <= 10 else 1

def days_in_market_score_udf(days_in_market):
    return 10 if days_in_market < 14 else 7 if days_in_market < 35 else 5 if days_in_market < 82 else 3 if days_in_market < 215 else 1

def make_name_score_udf(make_name):
    return 10 if make_name in ultra_luxury_brands else 8 if make_name in luxury_brands else 7 if make_name in high_end_brands else 5 if make_name in mid_range_brands else 3

def horsepower_cost_udf(horsepower):
    return 3 if horsepower <= 147 else 5 if horsepower <= 176 else 7 if horsepower <= 241 else 8 if horsepower <= 300 else 10 if horsepower <= 375 else 12

def transmission_cost_udf(transmission):
    return 3 if transmission == 'Manual' else 5 if transmission == 'Automatic' else 7 if transmission == 'CVT' else 10 if transmission == 'Dual Clutch' else 5

def fuel_type_cost_udf(fuel_type):
    return 5 if fuel_type == 'Gasoline' else 7 if fuel_type == 'Diesel' else 3 if fuel_type == 'Electric' else 7 if fuel_type == 'Hybrid' else 6 if fuel_type == 'Flex Fuel Vehicle' else 8 if fuel_type == 'Compressed Natural Gas' else 9 if fuel_type == 'Propane' else 5

def engine_type_cost_udf(engine_type):
    return 10 if engine_type in ['W16', 'W12'] else 9 if engine_type in ['V12', 'W8'] else 8 if engine_type in ['V10', 'H6'] else 7 if engine_type in ['V8', 'Diesel engine'] else 6 if engine_type in ['V6', 'H4', 'I6', 'I5'] else 5 if engine_type in ['I4', 'I3', 'Gasoline engine'] else 4 if engine_type in ['I2', 'R2'] else 3 if engine_type == 'Electric Motor' else 5

def manufactured_year_score(manufactured_year):
    return 10 if manufactured_year == 2020 else 8 if manufactured_year >= 2015 else 6 if manufactured_year >= 2005 else 4 if manufactured_year >= 1990 else 2

def seating_score_udf(maximum_seating):
    return 10 if maximum_seating == 2.0 else 9 if maximum_seating == 7.0 else 8 if maximum_seating in [9.0, 8.0] else 6 if maximum_seating in [10.0, 12.0] else 7 if maximum_seating == 6.0 else 6 if maximum_seating == 5.0 else 4 if maximum_seating in [3.0, 4.0] else 3 if maximum_seating == 15.0 else 5

def major_options_score_udf(major_options_count):
    return 3 if major_options_count <= 2 else 4 if major_options_count <= 5 else 5 if major_options_count <= 8 else 6 if major_options_count <= 11 else 7 if major_options_count <= 15 else 8 if major_options_count <= 20 else 9 if major_options_count <= 25 else 10

def transmission_score(transmission_display):
    scores = {3: ['Manual'], 4: ['4-Speed Automatic', '5-Speed Manual'], 5: ['5-Speed Automatic', '6-Speed Manual'],
              6: ['6-Speed Automatic', 'Automatic'], 7: ['Continuously Variable'], 8: ['7-Speed Automatic', '6-Speed Dual Clutch', '1-Speed Automatic'],
              9: ['8-Speed Automatic', '9-Speed Automatic'], 10: ['8-Speed Dual Clutch']}
    for score, transmissions in scores.items():
        if transmission_display in transmissions:
            return score
    return 5

# Registering UDFs
mileage_score = F.udf(mileage_score_udf, T.IntegerType())
age_score = F.udf(age_score_udf, T.IntegerType())
days_in_market_score = F.udf(days_in_market_score_udf, T.IntegerType())
make_name_score = F.udf(make_name_score_udf, T.IntegerType())
horsepower_cost = F.udf(horsepower_cost_udf, T.IntegerType())
transmission_cost = F.udf(transmission_cost_udf, T.IntegerType())
fuel_type_cost = F.udf(fuel_type_cost_udf, T.IntegerType())
engine_type_cost = F.udf(engine_type_cost_udf, T.IntegerType())
manufactured_year_score_udf = F.udf(manufactured_year_score, T.IntegerType())
seating_score = F.udf(seating_score_udf, T.IntegerType())
major_options_score = F.udf(major_options_score_udf, T.IntegerType())
transmission_score_udf = F.udf(transmission_score, T.IntegerType())


Budget brands: ['Volkswagen', 'Lexus', 'Jaguar', 'Saturn', 'FIAT', 'Scion', 'Jeep', 'Mitsubishi', 'Volvo', 'Hyundai', 'Honda', 'INFINITI', 'Austin', 'Cadillac', 'Isuzu', 'Mercedes-Benz', 'Chrysler', 'Lincoln', 'BMW', 'Suzuki', 'AMC', 'Acura', 'Datsun', 'Hillman', 'Hummer', 'Freightliner', 'smart', 'Mazda', 'Eagle', 'Nash', 'Sunbeam', 'Oldsmobile', 'Rover', 'Kia', 'Chevrolet', 'Saab', 'MINI', 'Audi', 'Plymouth', 'Pontiac', 'Geo', 'Land Rover', 'Mercury', 'MG', 'Studebaker', 'Austin-Healey', 'Kaiser', 'Opel', 'Willys', 'GMC', 'Buick', 'Porsche', 'AM General', 'Triumph', 'Toyota', 'Dodge', 'Nissan', 'Ford', 'Subaru']
Mid-range brands: ['Franklin', 'Rolls-Royce', 'Mobility Ventures', 'RAM', 'Alfa Romeo', 'Pininfarina', 'Maserati', 'Bricklin', 'Hudson', 'Packard', 'Lotus', 'Morris']
High-end brands: ['Genesis', 'Bentley', 'Tesla', 'Fisker', 'Aston Martin', 'VPG']
Luxury brands: ['Shelby', 'Ferrari', 'International Harvester', 'Maybach']
Ultra-luxury brands: ['Karma', 'Lamborghini', 'Ariel']

In [None]:
# Applying UDFs to create new columns for each score
best_performance_full_data = best_performance_full_data.withColumn("mileage_score", mileage_score(F.col("log_mileage"))) \
                                                       .withColumn("age_score", age_score(F.col("age"))) \
                                                       .withColumn("days_in_market_score", days_in_market_score(F.col("days_in_market"))) \
                                                       .withColumn("make_name_score", make_name_score(F.col("make_name"))) \
                                                       .withColumn("horsepower_cost", horsepower_cost(F.col("horsepower"))) \
                                                       .withColumn("transmission_cost", transmission_cost(F.col("transmission"))) \
                                                       .withColumn("fuel_type_cost", fuel_type_cost(F.col("fuel_type"))) \
                                                       .withColumn("engine_type_cost", engine_type_cost(F.col("engine_type"))) \
                                                       .withColumn("manufactured_year_score", manufactured_year_score_udf(F.col("manufactured_year"))) \
                                                       .withColumn("seating_score", seating_score(F.col("maximum_seating"))) \
                                                       .withColumn("major_options_score", major_options_score(F.col("major_options_count"))) \
                                                       .withColumn("transmission_score", transmission_score_udf(F.col("transmission_display")))

print("Columns successfully created and added.")


Columns successfully created and added.


### **Numerical coloumns**

In [None]:
# Calculate statistics for numeric columns
numeric_columns = ["mileage_score", "age_score", "days_in_market_score", "horsepower_cost", "transmission_cost", "fuel_type_cost",
                   "engine_type_cost", "transmission_score", "seating_score", "major_options_score", "manufactured_year_score"]

# Collecting statistics for numeric columns
numeric_stats = {}
for col in numeric_columns:
    stats = best_performance_full_data.select(
        F.mean(col).alias("mean"),
        F.stddev(col).alias("stddev"),
        F.min(col).alias("min"),
        F.max(col).alias("max")
    ).collect()[0]

    # Collect most frequent value (mode)
    mode_value = best_performance_full_data.groupBy(col).count().orderBy(F.desc("count")).first()[0]

    numeric_stats[col] = {
        "mean": stats["mean"],
        "stddev": stats["stddev"],
        "min": stats["min"],
        "max": stats["max"],
        "mode": mode_value
    }

# Step 3: Display the collected statistics for each numeric column
print("\nStatistics for Numeric Columns in Best Performing Data:")
for col, stats in numeric_stats.items():
    print(f"\nColumn: {col}")
    print(f" - Mean: {stats['mean']:.2f}")
    print(f" - Std Dev: {stats['stddev']:.2f}")
    print(f" - Min: {stats['min']}")
    print(f" - Max: {stats['max']}")
    print(f" - Mode (most frequent value): {stats['mode']}")


Statistics for Numeric Columns in Best Performing Data:

Column: mileage_score
 - Mean: 2.01
 - Std Dev: 1.75
 - Min: 1
 - Max: 10
 - Mode (most frequent value): 1

Column: age_score
 - Mean: 5.70
 - Std Dev: 2.90
 - Min: 1
 - Max: 10
 - Mode (most frequent value): 4

Column: days_in_market_score
 - Mean: 6.18
 - Std Dev: 2.79
 - Min: 1
 - Max: 10
 - Mode (most frequent value): 5

Column: horsepower_cost
 - Mean: 6.78
 - Std Dev: 2.23
 - Min: 3
 - Max: 12
 - Mode (most frequent value): 7

Column: transmission_cost
 - Mean: 5.26
 - Std Dev: 0.71
 - Min: 5
 - Max: 10
 - Mode (most frequent value): 5

Column: fuel_type_cost
 - Mean: 5.16
 - Std Dev: 0.48
 - Min: 3
 - Max: 9
 - Mode (most frequent value): 5

Column: engine_type_cost
 - Mean: 5.54
 - Std Dev: 0.72
 - Min: 3
 - Max: 10
 - Mode (most frequent value): 5

Column: transmission_score
 - Mean: 5.83
 - Std Dev: 0.93
 - Min: 3
 - Max: 10
 - Mode (most frequent value): 6

Column: seating_score
 - Mean: 6.45
 - Std Dev: 1.23
 - Min: 

### **Categorical coloumns**

In [None]:
# Get categorical columns by checking data types
categorical_columns = [field.name for field in best_performance_full_data.schema.fields if field.dataType == F.StringType()]

# Display the list of categorical columns
print("Categorical Columns:", categorical_columns)


Categorical Columns: ['fuel_type', 'body_type', 'city', 'dealer_zip', 'engine_type', 'exterior_color', 'interior_color', 'listing_color', 'make_name', 'model_name', 'sp_name', 'transmission', 'transmission_display', 'wheel_system_display']


In [None]:
# List of categorical columns to analyze
categorical_columns = ['fuel_type', 'body_type', 'city', 'dealer_zip', 'engine_type', 'exterior_color', 'interior_color', 'listing_color', 'make_name', 'model_name', 'sp_name', 'transmission', 'transmission_display', 'wheel_system_display']

print("Displaying Top 3 Most Frequent Values for the Following Columns (in Best Performing Data - [<=1% error]):")

# Loop over each categorical column and display the top 3 most frequent values
for col in categorical_columns:
    top_values = best_performance_full_data.groupBy(col).count().orderBy(F.desc("count")).limit(3).select(col)
    print(f"\nin '{col}':")
    top_values.show(truncate=False)


Displaying Top 3 Most Frequent Values for the Following Columns (in Best Performing Data - [<=1% error]):

in 'fuel_type':
+-----------------+
|fuel_type        |
+-----------------+
|Gasoline         |
|Flex Fuel Vehicle|
|Hybrid           |
+-----------------+


in 'body_type':
+---------------+
|body_type      |
+---------------+
|SUV / Crossover|
|Sedan          |
|Pickup Truck   |
+---------------+


in 'city':
+-------+
|city   |
+-------+
|Houston|
|El Paso|
|Tucson |
+-------+


in 'dealer_zip':
+----------+
|dealer_zip|
+----------+
|95678     |
|06770     |
|20111     |
+----------+


in 'engine_type':
+-----------+
|engine_type|
+-----------+
|I4         |
|V6         |
|V8         |
+-----------+


in 'exterior_color':
+--------------+
|exterior_color|
+--------------+
|White         |
|Black         |
|Silver        |
+--------------+


in 'interior_color':
+--------------+
|interior_color|
+--------------+
|Black         |
|Other         |
|Gray          |
+--------------