In [None]:
import importlib
import subprocess
import sys
import gc

def _installed_distribution_version(package_name):
    """Return the installed version string of ``package_name``, or None if unknown.

    The lookup uses ``importlib.metadata``. None is returned when that module
    cannot be imported or when it knows no distribution under this name.
    """
    try:
        from importlib import metadata
    except ImportError:
        return None
    try:
        return metadata.version(package_name)
    except metadata.PackageNotFoundError:
        return None


def check_and_install_package(package_name, version=None):
    """Make sure ``package_name`` is importable, installing it with pip if it is not.

    Args:
        package_name: Name used both for the import check and for ``pip install``.
        version: Optional exact version to pin (``package_name==version``) when
            an install is needed. None installs without a version constraint.

    Returns:
        None. Progress is reported on stdout.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.

    Note:
        When the package already imports, nothing is installed. If ``version``
        is given and the installed version can be determined and differs, a
        warning is printed so the ignored pin is visible instead of silent.
        The package is not reinstalled here, because swapping a module that the
        running interpreter has already imported would not take effect anyway.
    """
    try:
        importlib.import_module(package_name)
    except ImportError:
        requirement = f"{package_name}=={version}" if version else package_name
        print(f"\n{package_name} is NOT installed. Installing now...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])
        # Let the import system notice files that appeared after interpreter start
        importlib.invalidate_caches()
        print(f"{package_name} installation completed.")
        return

    print(f"\n{package_name} is already installed.")

    if version:
        installed_version = _installed_distribution_version(package_name)
        if installed_version is not None and installed_version != version:
            print(
                f"WARNING: {package_name} {installed_version} is installed, "
                f"but version {version} was requested; the pin was NOT applied."
            )

# Dependencies of this notebook. "version" is an exact pin handed to the
# installer; None means no version constraint is requested.
packages = [
    dict(name="tqdm", version=None),
    dict(name="pyspark", version="3.5.2"),
    dict(name="gdown", version=None),
    dict(name="numpy", version="1.23.5"),
]

# Run the import check (and install, if needed) for every listed dependency
for package in packages:
    check_and_install_package(package_name=package["name"], version=package["version"])



tqdm is already installed.

pyspark is already installed.

gdown is already installed.

numpy is already installed.


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; a later cell copies the dataset from
# a path under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyspark.sql import SparkSession

# Configure the Spark session used by the rest of the notebook, then create it
# (getOrCreate hands back an existing session if one is already active).
spark = (
    SparkSession.builder
    .appName("DecisionTreeModel")
    # Memory limits
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.maxResultSize", "8g")
    .config("spark.executor.memoryOverhead", "12g")
    .config("spark.executor.cores", "5")
    # Kryo serialization
    .config("spark.kryoserializer.buffer.max", "2047m")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.dynamicAllocation.enabled", "true")
    # Local filesystem implementation for file: paths
    .config("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem")
    # G1 garbage collector flags passed to the executor JVMs
    .config(
        "spark.executor.extraJavaOptions",
        "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4 -XX:ParallelGCThreads=4",
    )
    .getOrCreate()
)

# Confirm the session is up by reporting the Spark version it runs
print(f"Spark session started with version: {spark.version}")

Spark session started with version: 3.5.3


In [None]:
# Copy the feature-engineered parquet from the mounted Drive folder into
# /content/ (IPython shell escape), then read the copy rather than the Drive path.
!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

# Despite its name, output_path is the location the DataFrame is read FROM.
output_path = '/content/Feature_Engineered_DF.parquet'
df = spark.read.parquet(output_path)
print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [None]:
# Report the DataFrame's shape as (rows, columns).
# The column count comes from the schema; the row count requires a count() action.
total_columns = len(df.columns)
total_rows = df.count()

print(f"The shape of the loaded DataFrame is: ({total_rows}, {total_columns})")

The shape of the loaded DataFrame is: (3000040, 47)


In [None]:
# Mean of the 'price' column: the aggregation yields a single row whose only
# field is the average, so take that row's first value.
avg_price = df.agg({"price": "avg"}).first()[0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Lift the pandas column-display limit so every column of the wide frame is shown
pd.options.display.max_columns = None

# Order the rows by a random key, keep five of them, and bring them into pandas
pandas_df = df.sort(F.rand()).limit(5).toPandas()
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,manufactured_year,combined_fuel_economy,legroom,log_mileage,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,SUV / Crossover,Tulsa,15.0,1,74145,4600.0,V8,Other,True,23.0,74.2,19.0,301.0,Red,True,36.1017,192.1,UNKNOWN,-95.886597,Lexus,7.0,GX,58780.0,0,4.294117,Lexus of Tulsa,329.0,A,6-Speed Automatic,All-Wheel Drive,109.8,74.2,2020,17.0,75.8,1.1,10,0.77,0.35639,10,9,2020,0,37,44,38
1,Gasoline,Pickup Truck,Port Clinton,17.0,30,43452,3600.0,V6,Blue,True,21.0,70.6,24.0,308.0,Black,False,41.513599,212.7,BLUE,-82.859299,Chevrolet,5.0,Colorado,32500.0,236,3.727273,Baumann Port Clinton,369.0,A,Automatic,Four-Wheel Drive,128.3,83.9,2018,20.5,80.8,9.99,6,0.34,0.65591,12,8,2020,2,28,43,32
2,Flex Fuel Vehicle,Van,Seneca,22.690001,12,29678,2500.0,V6,White,True,24.74,65.87,29.469999,241.0,Gray,True,34.6633,189.8,WHITE,-82.942001,Ford,5.0,Transit Cargo,38050.0,0,4.368421,Lake Keowee Ford,265.22,A,Automatic,Unknown,111.0,77.2,2020,26.08,80.16,8.91,4,0.03,0.0,29,8,2020,0,31,41,33
3,Gasoline,Sedan,Bay Minette,28.0,13,36507,1800.0,I4,Other,True,13.2,57.3,35.0,132.0,Black,False,30.8396,183.1,UNKNOWN,-87.774902,Toyota,5.0,Corolla,16299.0,695,4.0,Chuck Stevens Chevrolet of Bay Minette,128.0,CVT,Continuously Variable Transmission,Front-Wheel Drive,106.3,69.9,2019,31.5,83.7,10.6,2,1.14,1.65957,28,8,2020,1,29,35,27
4,Gasoline,Sedan,Summerville,20.0,11,29483,3600.0,V6,Black,True,16.0,55.9,30.0,333.0,Black,False,33.031101,182.8,BLACK,-80.159897,Cadillac,5.0,ATS,17100.0,1793,4.222222,Hoover Chrysler Jeep Dodge Ram,285.0,A,8-Speed Automatic,Rear-Wheel Drive,109.3,71.1,2016,25.0,76.0,11.31,6,0.49,0.17676,30,8,2020,4,26,41,36


## **Decision Tree Regressor**

## **Using all the columns**

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import time
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean as sql_mean
import pyspark.sql.functions as F

# Ignore warnings
warnings.filterwarnings('ignore')

# Starting to track the overall runtime
start_time = time.time()

# Combining processing data and model training in the same progress bar
# (six steps below, one pbar.update(1) each)
with tqdm(total=6, desc="Processing and Training") as pbar:

    # Step 1: randomly sample about 20% of the rows. The sample is
    # fraction-based, so the exact row count is approximate.
    df_sample = df.sample(fraction=0.2, seed=42)

    # Converting 'franchise_dealer' to numeric
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))
    pbar.update(1)

    # Step 2: split BEFORE fitting any transformer. Fitting the indexers and the
    # scaler on the full sample would let test rows influence the category
    # indices and the mean/std used for scaling (data leakage).
    train_raw_df, test_raw_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    # Step 3: index and one-hot encode every string column.
    # handleInvalid="keep" gives categories that were not seen while fitting
    # their own index instead of raising during transform.
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Step 4: assemble the non-string columns (except the label) followed by
    # the encoded columns into one vector, then standardize it.
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [assembler, scaler]
    pbar.update(1)

    # Step 5: fit the pipeline on the training split only, then apply the
    # fitted pipeline to both splits.
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(train_raw_df)
    train_df = pipeline_model.transform(train_raw_df)
    test_df = pipeline_model.transform(test_raw_df)
    # df_sample keeps pointing at the transformed sample, as it did before this
    # cell was reordered (lazy: nothing is computed by this line).
    df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Step 6: training Decision Tree Regressor model
    dt = DecisionTreeRegressor(
        featuresCol="scaled_features",
        labelCol="price",
        maxDepth=15,
        maxBins=128,
        minInstancesPerNode=5,
        minInfoGain=0.01,
        seed=42
    )

    model = dt.fit(train_df)
    pbar.update(1)

# Making predictions
predictions = model.transform(test_df)

# Evaluating the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)

# Displaying results
print(f"\nTrain size: {train_df.count():,} samples")
print(f"Test size: {test_df.count():,} samples")

# Multiplying R-Squared by 100 for percentage calculation
print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculating total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\n\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training: 100%|██████████| 6/6 [1:06:35<00:00, 665.95s/it]



Train size: 480,411 samples
Test size: 120,366 samples


R-Squared Score (Accuracy): 88.38%


Overall runtime: 77 minutes.


In [None]:
# Additional error metrics on the same predictions: one evaluator per metric,
# all comparing the 'price' label with the 'prediction' column.
mae_evaluator, mse_evaluator, rmse_evaluator = (
    RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName=metric_name)
    for metric_name in ("mae", "mse", "rmse")
)

# Each evaluate() call scores the predictions independently
mae, mse, rmse = (
    metric_evaluator.evaluate(predictions)
    for metric_evaluator in (mae_evaluator, mse_evaluator, rmse_evaluator)
)

print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

Additional Metrics:
Mean Absolute Error: 3161
Mean Squared Error: 38316567
Root Mean Squared Error: 6190


 ## Results explanation

In [None]:
# Feature Importance (for Decision Trees)
from pyspark.ml.linalg import Vector

feature_importance = model.featureImportances
features_list = pipeline_model.stages[-2].getInputCols()  # Get feature names from VectorAssembler

# featureImportances holds one value per SLOT of the assembled feature vector,
# not one per assembler input column: a one-hot encoded column occupies as many
# slots as its vector is long. Pairing the two sequences position by position
# would credit slots to the wrong columns and ignore every slot past
# len(features_list). Instead, measure how many slots each input column
# occupies from one transformed row and sum the importances over that range.
importance_values = feature_importance.toArray()
sample_row = predictions.select(*features_list).first()
if sample_row is None:
    raise ValueError("predictions is empty; cannot determine the width of each feature column.")

importance_by_feature = {}
slot_start = 0
for feature in features_list:
    value = sample_row[feature]
    # Vector-valued inputs span len(value) slots; scalar inputs span exactly one
    slot_width = len(value) if isinstance(value, Vector) else 1
    importance_by_feature[feature] = float(importance_values[slot_start:slot_start + slot_width].sum())
    slot_start += slot_width

# Guard against model / pipeline_model / predictions coming from different runs
if slot_start != len(importance_values):
    raise ValueError(
        f"Assembled feature width ({slot_start}) does not match the number of "
        f"importance values ({len(importance_values)})."
    )

# Sorting features by their (summed) importance in descending order
sorted_features = sorted(importance_by_feature, key=importance_by_feature.get, reverse=True)

# Printing ranked features from highest to lowest
print("Top 10 Features Ranked by Importance (Highest to Lowest):")
for rank, feature in enumerate(sorted_features[:10], 1):  # Limiting to top 10 features
    print(f"{rank}. {feature}")

Top 10 Features Ranked by Importance (Highest to Lowest):
1. maintenance_cost
2. log_mileage
3. horsepower
4. maximum_seating
5. torque
6. sp_name_encoded
7. hp_x_torque
8. fuel_tank_volume
9. width
10. is_new




---



# **Column Subsampling**

## **with 70% of the columns**

I am performing 10 iterations of training a Decision Tree Regressor, `randomly selecting 70% of the features in each iteration`, to assess model performance on different feature subsets. In each iteration the selected features go through a pipeline (encoding, feature assembly, and scaling) that is fitted on the training split; the model is then trained, evaluated on the test split (using R², MAE, RMSE), and the best value of each metric across all iterations is tracked.

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import time
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import random

# Ignore warnings
warnings.filterwarnings('ignore')

# Starting to track the overall runtime
start_time = time.time()

# Dedicated seeded generator so the random column subsets are the same on every
# run, in line with the fixed seed (42) used for sampling, splitting and the tree.
column_rng = random.Random(42)

# The progress bar covers the four setup steps below; total matches the number
# of pbar.update(1) calls so the bar actually reaches 100%.
with tqdm(total=4, desc="Processing and Training") as pbar:

    # Sampling data
    df_sample = df.sample(fraction=0.2, seed=42)  # Randomly sample 20% of the data
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]

    # Converting 'franchise_dealer' to numeric
    df_sample = df_sample.withColumn(
        "franchise_dealer",
        F.col("franchise_dealer").cast("int"))

    # Assembling numeric features
    num_columns = [
        col for col in df_sample.columns
        if col != 'price' and col not in cat_columns]

    pbar.update(1)

    # Splitting the data
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    # Both splits are reused in every iteration, so keep them cached
    train_df.cache()
    test_df.cache()

    # Defining the evaluators for metrics
    r2_evaluator = RegressionEvaluator(
        labelCol="price",
        predictionCol="prediction",
        metricName="r2")

    mae_evaluator = RegressionEvaluator(
        labelCol="price",
        predictionCol="prediction",
        metricName="mae")

    rmse_evaluator = RegressionEvaluator(
        labelCol="price",
        predictionCol="prediction",
        metricName="rmse")

    pbar.update(1)


# Best value seen for each metric. They are tracked independently, so they may
# come from different iterations.
best_r2 = -float("inf")
best_mae = float("inf")
best_rmse = float("inf")

# Loop-invariant pieces of the column sampling and of the column listing
candidate_columns = num_columns + cat_columns
subset_size = int(len(candidate_columns) * 0.7)  # exactly 70% of the columns, rounded down
columns_per_line = 8

for iteration in range(10):
    print(f"\nIteration {iteration + 1}")

    # Made to always choose exactly 70% of the columns
    sampled_features = column_rng.sample(candidate_columns, subset_size)

    # Printing the list in a readable format across multiple lines
    print("Columns chosen for this iteration:")
    quoted_names = [f"'{name}'" for name in sampled_features]
    listing_lines = [
        ", ".join(quoted_names[start:start + columns_per_line])
        for start in range(0, len(quoted_names), columns_per_line)
    ]
    print("[" + ", \n ".join(listing_lines) + "]")

    # Reinitializing stages for each iteration
    stages = []

    # Handling categorical features within the loop
    cat_columns_iter = [col for col in sampled_features if col in cat_columns]
    for col_name in cat_columns_iter:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]

    # Updating feature columns based on selected features and encoding
    encoded_columns_iter = [f"{col}_encoded" for col in cat_columns_iter]
    num_columns_iter = [col for col in sampled_features if col in num_columns]

    # Assembling the sampled features
    feature_columns_iter = num_columns_iter + encoded_columns_iter
    assembler = VectorAssembler(inputCols=feature_columns_iter, outputCol="features")
    stages += [assembler]

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating the pipeline for this iteration; it is fitted on the training
    # split only and then applied to both splits
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(train_df)
    train_df_transformed = pipeline_model.transform(train_df)
    test_df_transformed = pipeline_model.transform(test_df)

    # Training Decision Tree Regressor with fixed parameters
    dt = DecisionTreeRegressor(
        featuresCol="scaled_features",
        labelCol="price",
        maxDepth=15,
        maxBins=128,
        minInstancesPerNode=5,
        minInfoGain=0.01,
        seed=42
    )

    # Training the model
    model = dt.fit(train_df_transformed)

    # Making predictions on the test data
    predictions = model.transform(test_df_transformed)

    # Evaluating metrics
    r2 = r2_evaluator.evaluate(predictions)
    mae = mae_evaluator.evaluate(predictions)
    rmse = rmse_evaluator.evaluate(predictions)

    # Printing the metrics for this iteration
    print(f"\nR² (Accuracy): {r2 * 100:.2f}%")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print("-" * 40)

    # Tracking the best scores
    best_r2 = max(best_r2, r2)
    best_mae = min(best_mae, mae)
    best_rmse = min(best_rmse, rmse)

# Printing the best score reached for each metric
print(f"Best R² (Accuracy): {best_r2 * 100:.2f}%")
print(f"Best MAE: {best_mae:.2f}")
print(f"Best RMSE: {best_rmse:.2f}")

# Calculating total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training:  67%|██████▋   | 4/6 [00:00<00:00, 19.18it/s]



Iteration 1
Columns chosen for this iteration:
['height', 'body_type', 'engine_type', 'length', 'engine_displacement', 'make_name', 'wheel_system_display', 'major_options_count', 
 'combined_fuel_economy', 'listing_color', 'franchise_dealer', 'highway_fuel_economy', 'savings_amount', 'hp_x_engine_disp', 'model_name', 'maintenance_cost', 
 'horsepower', 'listed_day', 'is_new', 'transmission', 'manufactured_year', 'fuel_tank_volume', 'seller_rating', 'log_mileage', 
 'width', 'fuel_type', 'latitude', 'transmission_display', 'resale_value_score', 'longitude', 'age', 'city_fuel_economy']

R² (Accuracy): 88.70%
MAE: 3173.72
RMSE: 6104.23
----------------------------------------

Iteration 2
Columns chosen for this iteration:
['listed_month', 'resale_value_score', 'hp_x_torque', 'hp_x_engine_disp', 'dealer_zip', 'log_mileage', 'city_fuel_economy', 'fuel_type', 
 'exterior_color', 'fuel_tank_volume', 'latitude', 'wheel_system_display', 'transmission_display', 'sp_name', 'seller_rating', 'com