In [1]:
import importlib
import subprocess
import sys
import gc

def check_and_install_package(package_name, version=None):
    try:
        importlib.import_module(package_name)
        print(f"\n{package_name} is already installed.")
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
        if version:
            subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package_name}=={version}"])
        else:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        print(f"{package_name} installation completed.")

# List of packages to check along with specific versions if necessary
packages = [
    {"name": "tqdm", "version": None},
    {"name": "pyspark", "version": "3.1.1"},
    {"name": "gdown", "version": None},
    {"name": "numpy", "version": "1.22.4"},
    {"name": "xgboost", "version": None},
    {"name": "sparkxgb", "version": None},
]

# Checking and installing packages
for package in packages:
    check_and_install_package(package["name"], package["version"])


tqdm is already installed.

pyspark is already installed.

gdown is already installed.

numpy is already installed.

xgboost is already installed.

sparkxgb is already installed.


In [2]:
!pip install numpy==1.22.4



In [3]:
import numpy
print(numpy.__version__)

1.22.4


In [4]:
!pip install sparkxgb



In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import os
import shutil

# Defining local resources directory
local_resources_path = "/resources"
os.makedirs(local_resources_path, exist_ok=True)

# Defining the source paths from mounted Google Drive
xgboost4j_source = "/content/drive/MyDrive/Big Data Analytics - Project/resources/xgboost4j_2.12-1.7.6.jar"
xgboost4j_spark_source = "/content/drive/MyDrive/Big Data Analytics - Project/resources/xgboost4j-spark_2.12-1.7.6.jar"

# Defining the destination paths in the instance's local file system
xgboost4j_dest = os.path.join(local_resources_path, "xgboost4j_2.12-1.7.6.jar")
xgboost4j_spark_dest = os.path.join(local_resources_path, "xgboost4j-spark_2.12-1.7.6.jar")

# Copying the files from Google Drive to the local instance
shutil.copyfile(xgboost4j_source, xgboost4j_dest)
shutil.copyfile(xgboost4j_spark_source, xgboost4j_spark_dest)

# Verifying that the files are copied
print(f"Jar Files copied to: {local_resources_path}")
print(os.listdir(local_resources_path))


Jar Files copied to: /resources
['xgboost4j-spark_2.12-1.7.6.jar', 'xgboost4j_2.12-1.7.6.jar']


In [7]:
from pyspark.sql import SparkSession

# Defining the path to the copied jar files in the local instance
jar_files = "/resources/xgboost4j_2.12-1.7.6.jar,/resources/xgboost4j-spark_2.12-1.7.6.jar"

# Initializing Spark session with the JAR files
spark = SparkSession.builder \
    .appName("XGBoostRegressor") \
    .config("spark.driver.memory", "120g") \
    .config("spark.executor.memory", "120g") \
    .config("spark.driver.maxResultSize", "40g") \
    .config("spark.executor.memoryOverhead", "40g") \
    .config("spark.executor.cores", "5") \
    .config("spark.kryoserializer.buffer.max", "2047m") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.sql.shuffle.partitions", "400") \
    .config("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem") \
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4 -XX:ParallelGCThreads=4") \
    .config("spark.jars", jar_files) \
    .getOrCreate()

# Verifying Spark session creation
print(f"Spark session started with version: {spark.version}")

Spark session started with version: 3.1.1


In [8]:
# Test if sparkxgb is loaded properly
try:
    from sparkxgb import XGBoostRegressor

    model = XGBoostRegressor()
    print("sparkxgb loaded successfully!")
except Exception as e:
    print(f"Error loading sparkxgb: {e}")


sparkxgb loaded successfully!


In [9]:
!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

output_path = '/content/Feature_Engineered_DF.parquet'
df = spark.read.parquet(output_path)
print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [10]:
# Printing the shape of the DataFrame
total_rows = df.count()
total_columns = len(df.columns)

print(f"The shape of the loaded DataFrame is: ({total_rows}, {total_columns})")

The shape of the loaded DataFrame is: (3000040, 47)


In [11]:
# Calculating the average price
avg_price = df.agg({"price": "avg"}).collect()[0][0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [12]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Converting the Spark DataFrame to a Pandas DataFrame and displaying the first 5 rows
pd.set_option('display.max_columns', None)
pandas_df = df.orderBy(F.rand()).limit(5).toPandas()
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,manufactured_year,combined_fuel_economy,legroom,log_mileage,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,SUV / Crossover,Hiawatha,20.0,28,52233,2000.0,I4,Orange,True,18.799999,67.1,26.0,235.0,Gray,True,42.048901,187.8,ORANGE,-91.684898,Hyundai,5.0,Santa Fe,36843.0,0,4.461538,McGrath Ford Hyundai Kia,260.0,A,8-Speed Automatic,All-Wheel Drive,108.9,74.4,2020,23.0,85.0,8.91,4,0.1,0.00688,13,8,2020,0,26,37,34
1,Gasoline,Sedan,Orlando,20.0,53,32839,3000.0,I6,Gray,False,18.5,57.6,30.0,300.0,Other,False,28.4792,193.1,GRAY,-81.397102,BMW,5.0,5 Series,14550.0,828,5.0,Empire Automotive,300.0,A,Automatic,Rear-Wheel Drive,116.9,82.8,2014,25.0,77.5,11.52,5,0.03,0.19071,19,7,2020,6,18,36,30
2,Gasoline,SUV / Crossover,Helena,24.0,271,59601,1400.0,I4,Black,True,14.0,66.3,29.0,138.0,Black,True,46.606701,167.6,BLACK,-112.014,Chevrolet,5.0,Trax,24788.0,0,4.75,Lithia Chevrolet Buick GMC of Helena,148.0,A,Automatic,All-Wheel Drive,100.6,69.9,2019,26.5,76.5,8.23,5,1.46,1.34409,15,12,2019,0,25,35,31
3,Gasoline,Pickup Truck,Houma,17.0,86,70360,3600.0,V6,Other,True,21.0,70.6,24.0,308.0,Black,True,29.622101,212.4,UNKNOWN,-90.749802,GMC,5.0,Canyon,34124.0,0,4.36,Barker Buick GMC,369.0,A,8-Speed Automatic,Four-Wheel Drive,128.3,83.9,2020,20.5,80.8,2.48,7,0.34,0.65591,17,6,2020,0,27,43,37
4,Gasoline,SUV / Crossover,Mt Juliet,17.0,47,37122,3500.0,V6,White,False,19.200001,68.7,24.0,280.0,Black,False,36.1716,188.1,WHITE,-86.497002,Ford,5.0,Edge,16998.0,1290,4.323077,MotorCars of Nashville,250.0,A,6-Speed Automatic,All-Wheel Drive,112.2,85.8,2017,20.5,83.2,11.37,5,0.16,-0.0516,26,7,2020,3,20,38,31


In [13]:
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean as sql_mean
import pyspark.sql.functions as F

# Ignore warnings
warnings.filterwarnings('ignore')

print("Processing the data...")
with tqdm(total=6, desc="Progress") as pbar:

    df_sample = df.sample(fraction=0.1, seed=42)  # Randomly sample 300k records of the data
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Assembling features
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Filling in missing values
    for col in df_sample.columns:
        if df_sample.schema[col].dataType.typeName() in ["double", "float", "int", "long"]:
            mean_value = df_sample.select(sql_mean(col)).first()[0]
            df_sample = df_sample.na.fill({col: mean_value})
    pbar.update(1)

    # Splitting the data
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

print("\n\nData preprocessing and splitting completed!")


Processing the data...


Progress: 100%|██████████| 6/6 [00:34<00:00,  5.78s/it]



Data preprocessing and splitting completed!





In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from sparkxgb import XGBoostRegressor
import time

# Model training
print("Training XGBoost model...")

xgb_regressor = XGBoostRegressor(
    featuresCol="scaled_features",
    labelCol="price",
    maxDepth=6,
    numRound=100,
    objective="reg:squarederror",
    treeMethod="hist",
)


# Before training
start_time = time.time()

# Training the model
model = xgb_regressor.fit(train_df)

# Making predictions
print("Making predictions...")
predictions = model.transform(test_df)

# Evaluating the model
print("Evaluating the model...")
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)

print(f"\nTrain size: {train_df.count()} samples")
print(f"Test size: {test_df.count()} samples")
print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculating total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\n\nOverall runtime: {round(total_runtime)} minutes.")

Training XGBoost model...
Making predictions...
Evaluating the model...

Train size: 240048 samples
Test size: 59933 samples


R-Squared Score (Accuracy): 91.84%


Overall runtime: 75 minutes.


In [None]:
# Calculating additional metrics
mae_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(predictions)

mse_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mse")
mse = mse_evaluator.evaluate(predictions)

rmse_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)

print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

Additional Metrics:
Mean Absolute Error: 3018
Mean Squared Error: 27751838
Root Mean Squared Error: 5268


In [None]:
# Saving the trained XGBoost model
model.write().overwrite().save("/tmp/xgboost_model")

In [None]:
import os
print(os.listdir("/tmp/xgboost_model"))

['data', 'metadata']


In [None]:
import xgboost as xgb

# Loading the model using the XGBoost native API
native_model = xgb.Booster()
native_model.load_model("/tmp/xgboost_model/data/XGBoostRegressionModel")


In [None]:
# Getting feature importances from the loaded native XGBoost model
importance_dict = native_model.get_score(importance_type='weight')

features_list = pipeline_model.stages[-2].getInputCols()  # Getting the input column names from the VectorAssembler

# Mapping the feature indices (f0, f1, ...) to the actual feature names safely
sorted_importance = [
    (features_list[int(f[1:])], importance)
    for f, importance in importance_dict.items()
    if int(f[1:]) < len(features_list)  # Ensuring the index is within bounds
]

# Sorting by importance
sorted_importance = sorted(sorted_importance, key=lambda x: x[1], reverse=True)

# Printing the top 10 features with their actual names (sorted by importance)
print("Top 10 Features Ranked by Importance (Highest to Lowest)")
for rank, (feature, importance) in enumerate(sorted_importance[:10], 1):
    print(f"{rank}. {feature}")

Top 10 Features Ranked by Importance (Highest to Lowest)
1. log_mileage
2. days_in_market
3. maintenance_cost
4. city_fuel_economy
5. major_options_count
6. latitude
7. manufactured_year
8. luxury_score
9. seller_rating
10. longitude




---



# **Column Sub sampling**

## **with 70% of the columns**

I am performing 10 iterations of training a XGB Regressor by `randomly selecting 70% of the features in each iteration` to assess model performance on different feature subsets. After transforming the selected features through a pipeline (including encoding, scaling, and feature assembly), the model is trained, evaluated (using R², MAE, RMSE), and the best-performing model across all iterations is tracked.

In [None]:
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean as sql_mean
import pyspark.sql.functions as F
from sparkxgb import XGBoostRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import time
import random

# Ignore warnings
warnings.filterwarnings('ignore')

# Starting to track overall runtime
start_time = time.time()

with tqdm(total=6, desc="Processing and Training") as pbar:

    # Sampling and repartitioning data
    df_sample = df.sample(fraction=0.1, seed=42)  # Randomly sample 10% of the data (~300k records)
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]

    # Converting 'franchise_dealer' to numeric
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling numeric features
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    pbar.update(1)

    # Splitting the data
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    train_df.cache()

    # Defining evaluators for metrics
    r2_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
    mae_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mae")
    rmse_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")

    pbar.update(1)

# Initializing best scores
best_r2 = -float("inf")
best_mae = float("inf")
best_rmse = float("inf")

for iteration in range(10):
    print(f"\nIteration {iteration + 1}")

    # Randomly choosing exactly 70% of the columns
    sampled_features = random.sample(num_columns + cat_columns, int(len(num_columns + cat_columns) * 0.7))

    # Printing the list in a readable format across multiple lines
    columns_per_line = 8
    print("Columns chosen for this iteration:")
    print("[", end="")

    for i in range(len(sampled_features)):
        print(f"'{sampled_features[i]}'", end="")
        if i != len(sampled_features) - 1:
            print(", ", end="")
        if (i + 1) % columns_per_line == 0 and i != len(sampled_features) - 1:
            print("\n ", end="")
    print("]")

    # Reinitializing stages for each iteration
    stages = []

    # Handling categorical features within the loop
    cat_columns_iter = [col for col in sampled_features if col in cat_columns]
    for col_name in cat_columns_iter:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]

    # Updating feature columns based on selected features and encoding
    encoded_columns_iter = [f"{col}_encoded" for col in cat_columns_iter]
    num_columns_iter = [col for col in sampled_features if col in num_columns]

    # Assembling the sampled features
    feature_columns_iter = num_columns_iter + encoded_columns_iter
    assembler = VectorAssembler(inputCols=feature_columns_iter, outputCol="features")
    stages += [assembler]

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline for this iteration
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(train_df)
    train_df_transformed = pipeline_model.transform(train_df)
    test_df_transformed = pipeline_model.transform(test_df)

    # Checking if the scaled_features column exists
    if 'scaled_features' not in train_df_transformed.columns:
        print("Error: scaled_features column was not created.")
        break

    # Training XGBoost Regressor with fixed parameters
    xgb_regressor = XGBoostRegressor(
        featuresCol="scaled_features",
        labelCol="price",
        maxDepth=6,
        numRound=100,
        objective="reg:squarederror",
        treeMethod="hist",
    )

    # Training the model
    model = xgb_regressor.fit(train_df_transformed)

    # Making predictions on the test data
    predictions = model.transform(test_df_transformed)

    # Evaluating metrics
    r2 = r2_evaluator.evaluate(predictions)
    mae = mae_evaluator.evaluate(predictions)
    rmse = rmse_evaluator.evaluate(predictions)

    # Printing the metrics for this iteration
    print(f"\nR² (Accuracy): {r2 * 100:.2f}%")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print("-" * 40)

    # Tracking the best scores
    if r2 > best_r2:
        best_r2 = r2

    if mae < best_mae:
        best_mae = mae

    if rmse < best_rmse:
        best_rmse = rmse

# Printing the best model results
print(f"Best R² (Accuracy): {best_r2 * 100:.2f}%")
print(f"Best MAE: {best_mae:.2f}")
print(f"Best RMSE: {best_rmse:.2f}")

# Calculate total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60  #
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training:  67%|██████▋   | 4/6 [00:00<00:00, 15.35it/s]



Iteration 1
Columns chosen for this iteration:
['listed_month', 'height', 'torque', 'model_name', 'franchise_dealer', 'log_mileage', 'hp_x_engine_disp', 'fuel_tank_volume', 
 'length', 'horsepower', 'dealer_zip', 'resale_value_score', 'transmission_display', 'major_options_count', 'wheel_system_display', 'fuel_type', 
 'seller_rating', 'hp_x_torque', 'maintenance_cost', 'engine_displacement', 'is_new', 'highway_fuel_economy', 'city', 'listed_year', 
 'make_name', 'city_fuel_economy', 'transmission', 'longitude', 'age', 'legroom', 'exterior_color', 'interior_color']

R² (Accuracy): 92.09%
MAE: 3046.73
RMSE: 5184.83
----------------------------------------

Iteration 2
Columns chosen for this iteration:
['width', 'engine_type', 'manufactured_year', 'luxury_score', 'listed_year', 'dealer_zip', 'maximum_seating', 'body_type', 
 'transmission_display', 'highway_fuel_economy', 'length', 'city', 'horsepower', 'torque', 'maintenance_cost', 'resale_value_score', 
 'major_options_count', 'city_

## Key Observations:

1. **Iteration Results:**
   - Each iteration involved selecting approximately 70% of available features, resulting in different combinations being used in the model. This variability enhanced the model's training process and allowed for a more comprehensive exploration of feature importance.
   - The **R²** (coefficient of determination) values, which measure the proportion of variance in the target variable explained by the model, showed generally high accuracy across iterations, with **values ranging from 89.45% to 92.09%**.

2. **Overall Performance:**
   - The overall performance of the model was strong, with consistent accuracy and error metrics across multiple iterations. This indicates stability in model performance, regardless of feature selection variability. Such stability suggests that the model can generalize well to unseen data.
   
3. **Feature Redundancy:**
   - From my perspective, the 1-2% difference in performance when using 70% of columns compared to 100% suggests that many of my features likely contain redundant or correlated information. In other words, some features might be providing similar information, which means that I can achieve comparable predictive performance without needing all the original columns.

4. **Feature Importance:**
   - The fact that the random selections of 70% consistently capture my most important features indicates a robust model design. It suggests that the 30% left out in each iteration likely don’t contribute much unique predictive value.








---



In [None]:
import pandas as pd
import numpy as np

df_sample = df.sample(fraction=0.1, seed=42)

# Computing correlation matrix for numeric columns
numeric_cols = [col for col in df_sample.columns if df_sample.schema[col].dataType.typeName() in ["double", "float", "int"]]

# Assembling numeric columns for correlation calculation
vector_assembler = VectorAssembler(inputCols=numeric_cols, outputCol="corr_features")
df_corr = vector_assembler.transform(df_sample).select("corr_features")

# Converting into a pandas DataFrame for easier correlation calculation
corr_df = pd.DataFrame(df_corr.toPandas()["corr_features"].apply(lambda x: x.toArray()).tolist(), columns=numeric_cols)

# Calculating the correlation matrix
corr_matrix = corr_df.corr()

# Rounding the values to 2 decimal places for cleaner output
corr_matrix_rounded = corr_matrix.round(2)

# Printing the correlation matrix with pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
pd.set_option('display.precision', 2)

print("\nCorrelation Matrix:")
print(corr_matrix_rounded)


Correlation Matrix:
                       city_fuel_economy  engine_displacement  fuel_tank_volume  height  highway_fuel_economy  horsepower  latitude  length  longitude  maximum_seating  price  seller_rating  torque  wheelbase  width  combined_fuel_economy  legroom  log_mileage  hp_x_engine_disp  hp_x_torque
city_fuel_economy                   1.00                -0.41             -0.51   -0.33                  0.94       -0.44     -0.04   -0.31      -0.08            -0.23  -0.15          -0.04   -0.28      -0.30  -0.32                   0.99    -0.12        -0.10             -0.11        -0.10
engine_displacement                -0.41                 1.00              0.79    0.59                 -0.47        0.83     -0.00    0.78      -0.00             0.33   0.34           0.03    0.73       0.76   0.47                  -0.45     0.33         0.09              0.63         0.43
fuel_tank_volume                   -0.51                 0.79              1.00    0.69                

## **with 50% of the coloumns**

In [None]:
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean as sql_mean
import pyspark.sql.functions as F
from sparkxgb import XGBoostRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import time
import random
import numpy as np

# Ignore warnings
warnings.filterwarnings('ignore')

# Starting to track overall runtime
start_time = time.time()

# Using the precomputed correlation matrix for numerical columns
corr_matrix = corr_matrix_rounded

with tqdm(total=6, desc="Processing and Training") as pbar:

    # Sampling and repartitioning data
    df_sample = df.sample(fraction=0.1, seed=42)  # Randomly sample 10% of the data (~300k records)
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]

    # Converting 'franchise_dealer' to numeric
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))
    pbar.update(1)

    # Splitting the data
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    train_df.cache()

    # Defining evaluators for metrics
    r2_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
    mae_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mae")
    rmse_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
    pbar.update(1)

# Initializing best scores
best_r2 = -float("inf")
best_mae = float("inf")
best_rmse = float("inf")

# Function to select 10 low-correlation numerical features excluding 'price'
def select_low_corr_numerical_features(corr_matrix, num_features, threshold):
    """ Selects a set of numerical features with low pairwise correlation. """
    selected_features = []
    available_features = [col for col in corr_matrix.columns if col != 'price']

    # Randomly selecting the first feature
    first_feature = random.choice(available_features)
    selected_features.append(first_feature)
    available_features.remove(first_feature)

    print(f"First selected numerical feature: {first_feature}")

    # Selecting other features with low correlation to already selected ones
    while len(selected_features) < num_features and available_features:

        for feature in available_features[:]:  # Looping through a copy of available features
            correlations = [abs(corr_matrix[feature][selected]) for selected in selected_features]

            # Checking if correlation with all selected features is below the threshold
            if all(corr < threshold for corr in correlations):
                selected_features.append(feature)
                available_features.remove(feature)
                print(f"Numerical Feature '{feature}' added.")

            # Breaking early if enough features are selected
            if len(selected_features) >= num_features:
                break

    # If not enough features could be selected, returning what is available
    if len(selected_features) < num_features:
        print(f"\nWarning: Only {len(selected_features)} numerical features could be selected with the given threshold.")

    return selected_features

# Function to randomly select 13 categorical features
def select_random_categorical_features(cat_columns, num_features):
    """ Randomly selects a set of categorical features. """
    selected_cat_features = random.sample(cat_columns, min(num_features, len(cat_columns)))
    print(f"\nRandomly selected categorical features: {selected_cat_features}")
    return selected_cat_features

for iteration in range(5):
    print(f"\nIteration {iteration + 1}")

    # Selecting 10 numerical features with low pairwise correlation
    sampled_num_features = select_low_corr_numerical_features(corr_matrix, 10, 0.75)

    # Randomly selecting 13 categorical features
    sampled_cat_features = select_random_categorical_features(cat_columns, 13)

    # Combining the numerical and categorical features
    sampled_features = sampled_num_features + sampled_cat_features

    # Printing the list in a readable format across multiple lines
    columns_per_line = 8
    print("\nColumns chosen for this iteration:")
    print("[", end="")

    for i in range(len(sampled_features)):
        print(f"'{sampled_features[i]}'", end="")
        if i != len(sampled_features) - 1:
            print(", ", end="")
        if (i + 1) % columns_per_line == 0 and i != len(sampled_features) - 1:
            print("\n ", end="")
    print("]")
    print("Number of columns chosen:", len(sampled_features))

    # Reinitializing stages for each iteration
    stages = []

    # Handling categorical features within the loop
    cat_columns_iter = [col for col in sampled_features if col in cat_columns]
    for col_name in cat_columns_iter:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]

    # Updating feature columns based on selected features and encoding
    encoded_columns_iter = [f"{col}_encoded" for col in cat_columns_iter]
    num_columns_iter = [col for col in sampled_features if col in corr_matrix.columns]

    # Assembling the sampled features
    feature_columns_iter = num_columns_iter + encoded_columns_iter
    assembler = VectorAssembler(inputCols=feature_columns_iter, outputCol="features")
    stages += [assembler]

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline for this iteration
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(train_df)
    train_df_transformed = pipeline_model.transform(train_df)
    test_df_transformed = pipeline_model.transform(test_df)

    # Checking if the scaled_features column exists
    if 'scaled_features' not in train_df_transformed.columns:
        print("Error: scaled_features column was not created.")
        break

    # Training XGBoost Regressor with fixed parameters
    xgb_regressor = XGBoostRegressor(
        featuresCol="scaled_features",
        labelCol="price",
        maxDepth=6,
        numRound=100,
        objective="reg:squarederror",
        treeMethod="hist",
    )

    # Training the model
    model = xgb_regressor.fit(train_df_transformed)

    # Making predictions on the test data
    predictions = model.transform(test_df_transformed)

    # Evaluating metrics
    r2 = r2_evaluator.evaluate(predictions)
    mae = mae_evaluator.evaluate(predictions)
    rmse = rmse_evaluator.evaluate(predictions)

    # Printing the metrics for this iteration
    print(f"\nR² (Accuracy): {r2 * 100:.2f}%")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print("-" * 40)

    # Tracking the best scores
    if r2 > best_r2:
        best_r2 = r2

    if mae < best_mae:
        best_mae = mae

    if rmse < best_rmse:
        best_rmse = rmse

# Printing the best model results
print(f"Best R² (Accuracy): {best_r2 * 100:.2f}%")
print(f"Best MAE: {best_mae:.2f}")
print(f"Best RMSE: {best_rmse:.2f}")

# Calculating total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training:  67%|██████▋   | 4/6 [00:00<00:00, 126.27it/s]



Iteration 1
First selected numerical feature: hp_x_engine_disp
Numerical Feature 'city_fuel_economy' added.
Numerical Feature 'engine_displacement' added.
Numerical Feature 'height' added.
Numerical Feature 'latitude' added.
Numerical Feature 'longitude' added.
Numerical Feature 'maximum_seating' added.
Numerical Feature 'seller_rating' added.
Numerical Feature 'torque' added.
Numerical Feature 'width' added.

Randomly selected categorical features: ['body_type', 'listing_color', 'exterior_color', 'sp_name', 'city', 'wheel_system_display', 'fuel_type', 'transmission_display', 'interior_color', 'model_name', 'engine_type', 'transmission', 'make_name']

Columns chosen for this iteration:
['hp_x_engine_disp', 'city_fuel_economy', 'engine_displacement', 'height', 'latitude', 'longitude', 'maximum_seating', 'seller_rating', 
 'torque', 'width', 'body_type', 'listing_color', 'exterior_color', 'sp_name', 'city', 'wheel_system_display', 
 'fuel_type', 'transmission_display', 'interior_color',