In [None]:
import importlib
import subprocess
import sys
import gc

def check_and_install_package(package_name, version=None):
    try:
        importlib.import_module(package_name)
        print(f"\n{package_name} is already installed.")
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
        if version:
            subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package_name}=={version}"])
        else:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        print(f"{package_name} installation completed.")

# List of packages to check along with specific versions if necessary
packages = [
    {"name": "tqdm", "version": None},
    {"name": "pyspark", "version": "3.5.2"},
    {"name": "gdown", "version": None},
    {"name": "numpy", "version": "1.23.5"}
]

# Checking and installing packages
for package in packages:
    check_and_install_package(package["name"], package["version"])



tqdm is already installed.

pyspark is NOT installed. Installing now...
pyspark installation completed.

gdown is already installed.

numpy is already installed.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("DecisionTreeModel") \
    .config("spark.driver.memory", "150g") \
    .config("spark.executor.memory", "150g") \
    .config("spark.driver.maxResultSize", "50g") \
    .config("spark.executor.memoryOverhead", "50g") \
    .config("spark.executor.cores", "5") \
    .config("spark.kryoserializer.buffer.max", "2047m") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem") \
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4 -XX:ParallelGCThreads=4") \
    .getOrCreate()

# Verifying Spark session creation
print(f"Spark session started with version: {spark.version}")

Spark session started with version: 3.5.2


In [None]:
!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

output_path = '/content/Feature_Engineered_DF.parquet'
df = spark.read.parquet(output_path)
print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [None]:
# Printing the shape of the DataFrame
total_rows = df.count()
total_columns = len(df.columns)

print(f"The shape of the loaded DataFrame is: ({total_rows}, {total_columns})")

The shape of the loaded DataFrame is: (3000040, 47)


In [None]:
# Calculating the average price
avg_price = df.agg({"price": "avg"}).collect()[0][0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Converting the Spark DataFrame to a Pandas DataFrame and displaying 5 random rows with all columns
pd.set_option('display.max_columns', None)
pandas_df = df.orderBy(F.rand()).limit(5).toPandas()
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,manufactured_year,combined_fuel_economy,legroom,log_mileage,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,Pickup Truck,Dacono,17.0,8,80514,3500.0,V6,Silver,True,23.0,76.9,23.0,375.0,Black,False,40.086498,231.9,SILVER,-104.976997,Ford,6.0,F-150,43599.0,2079,4.581395,Interstate Ford,470.0,A,Automatic,Four-Wheel Drive,145.0,96.8,2017,20.0,87.5,10.18,11,0.62,2.72978,3,9,2020,3,28,40,34
1,Gasoline,SUV / Crossover,Marshfield,21.0,6,54449,2000.0,I4,Silver,True,18.5,68.3,28.0,250.0,Black,True,44.692501,188.8,SILVER,-90.153397,Ford,5.0,Edge,29985.0,0,4.65,V&H Automotive Inc.,280.0,A,8-Speed Automatic,All-Wheel Drive,112.2,85.8,2020,24.5,83.2,1.1,4,-0.02,0.00372,5,9,2020,0,37,40,36
2,Gasoline,SUV / Crossover,Spring,20.0,62,77386,3500.0,V6,Black,True,19.200001,67.7,27.0,295.0,Mixed Colors,False,30.1157,192.5,BLACK,-95.437202,Lexus,5.0,RX,43596.0,1465,4.727273,Northside Lexus,267.0,A,8-Speed Automatic,Front-Wheel Drive,109.8,74.6,2019,23.5,82.1,9.43,8,0.23,0.00883,11,7,2020,1,26,41,35
3,Gasoline,Sedan,Naperville,28.0,12,60540,2500.0,I4,Blue,True,16.0,56.9,39.0,203.0,Blue,True,41.7757,192.7,BLUE,-88.1847,Toyota,5.0,Camry,28733.0,0,4.517241,Toyota of Naperville,265.22,A,Automatic,Front-Wheel Drive,111.2,72.4,2020,33.5,80.1,1.61,10,0.17,-1e-05,29,8,2020,0,35,37,33
4,Gasoline,SUV / Crossover,Charlotte,22.0,91,28214,2000.0,I4,White,True,14.9,65.4,28.0,252.0,Black,False,35.282902,182.3,WHITE,-80.968002,GMC,5.0,Terrain,25000.0,2060,3.481928,Victory Chevrolet,260.0,A,Automatic,Front-Wheel Drive,107.3,72.4,2020,25.0,80.6,9.02,8,-0.04,-0.00241,11,6,2020,0,24,40,34




---



# **Decision Trees**

## **Bagging**

## **300k records**

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from tqdm import tqdm
import time
import warnings

warnings.filterwarnings('ignore')

# Parameters
num_models = 3  # Number of Decision Tree models to train
sample_fraction = 0.8  # Fraction of data for each subset
seed = 42  # Seed for reproducibility

start_time = time.time()

with tqdm(total=6, desc="Processing and Training") as pbar:
    df_sample = df.sample(fraction=0.1, seed=seed)  # Randomly sample ~300k records of the data
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric if needed
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    transformed_df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Splitting the data
    train_df, test_df = transformed_df_sample.randomSplit([0.8, 0.2], seed=seed)
    pbar.update(1)

    # Caching the test data for faster access
    test_df = test_df.cache()
    pbar.update(1)

# Initializing an empty DataFrame for storing combined predictions
all_predictions = None

print("\n")

# Training multiple Decision Tree Regressor models on different subsets of training data
for i in range(num_models):
    print(f"Training model {i + 1}...")

    # Sampling a random subset of training data
    train_subset = train_df.sample(fraction=sample_fraction, seed=seed + i)

    # Training Decision Tree Regressor model
    dt = DecisionTreeRegressor(
        featuresCol="scaled_features",
        labelCol="price",
        maxDepth=15,
        maxBins=128,
        minInstancesPerNode=5,
        minInfoGain=0.01,
        seed=seed + i
    )

    model = dt.fit(train_subset)

    # Getting predictions for the current model
    current_predictions = model.transform(test_df).select(
        "price",
        F.col("prediction").alias(f"pred_{i}")
    )

    # Combining predictions: joining depending on iteration
    if all_predictions is None:
        all_predictions = current_predictions
    else:
        all_predictions = all_predictions.join(current_predictions, on="price")

    # Unpersisting train_subset after training the model
    train_subset.unpersist()

# Calculating average prediction across models
pred_cols = [f"pred_{i}" for i in range(num_models)]
all_predictions = all_predictions.withColumn(
    "final_prediction",
    sum(F.col(col) for col in pred_cols) / len(pred_cols)
)


all_predictions.cache()

# Evaluating using the cached DataFrame
evaluator = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="r2")
r2 = evaluator.evaluate(all_predictions)

print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculating other metrics using the same cached DataFrame
mae = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="mae").evaluate(all_predictions)
rmse = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="rmse").evaluate(all_predictions)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

end_time = time.time()
print(f"\nOverall runtime: {(end_time - start_time) / 60:.2f} minutes")



Processing and Training: 100%|██████████| 6/6 [00:18<00:00,  3.05s/it]


Training model 1...
Training model 2...
Training model 3...

R-Squared Score (Accuracy): 89.11%
MAE: 1499.45
RMSE: 2280.79

Overall runtime: 207.28 minutes




---



## **600k records**

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from tqdm import tqdm
import time
import warnings

warnings.filterwarnings('ignore')

# Parameters
num_models = 3  # Number of Decision Tree models to train
sample_fraction = 0.8  # Fraction of data for each subset
seed = 42  # Seed for reproducibility

start_time = time.time()

with tqdm(total=6, desc="Processing and Training") as pbar:
    df_sample = df.sample(fraction=0.2, seed=seed)  # Randomly sample ~600k records of the data
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric if needed
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    transformed_df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Splitting the data
    train_df, test_df = transformed_df_sample.randomSplit([0.8, 0.2], seed=seed)
    pbar.update(1)

    # Caching the test data for faster access
    test_df = test_df.cache()
    pbar.update(1)

# Initializing an empty DataFrame for storing combined predictions
all_predictions = None

print("\n")

# Training multiple Decision Tree Regressor models on different subsets of training data
for i in range(num_models):
    print(f"Training model {i + 1}...")

    # Sampling a random subset of training data
    train_subset = train_df.sample(fraction=sample_fraction, seed=seed + i)

    # Training Decision Tree Regressor model
    dt = DecisionTreeRegressor(
        featuresCol="scaled_features",
        labelCol="price",
        maxDepth=15,
        maxBins=128,
        minInstancesPerNode=5,
        minInfoGain=0.01,
        seed=seed + i
    )

    model = dt.fit(train_subset)

    # Getting predictions for the current model
    current_predictions = model.transform(test_df).select(
        "price",
        F.col("prediction").alias(f"pred_{i}")
    )

    # Combining predictions: joining depending on iteration
    if all_predictions is None:
        all_predictions = current_predictions
    else:
        all_predictions = all_predictions.join(current_predictions, on="price")

    # Unpersisting train_subset after training the model
    train_subset.unpersist()

# Calculating average prediction across models
pred_cols = [f"pred_{i}" for i in range(num_models)]
all_predictions = all_predictions.withColumn(
    "final_prediction",
    sum(F.col(col) for col in pred_cols) / len(pred_cols)
)


all_predictions.cache()

# Evaluating using the cached DataFrame
evaluator = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="r2")
r2 = evaluator.evaluate(all_predictions)

print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculating other metrics using the same cached DataFrame
mae = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="mae").evaluate(all_predictions)
rmse = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="rmse").evaluate(all_predictions)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

end_time = time.time()
print(f"\nOverall runtime: {(end_time - start_time) / 60:.2f} minutes")



Processing and Training: 100%|██████████| 6/6 [00:24<00:00,  4.08s/it]

Training model 1...
Training model 2...
Training model 3...

R-Squared Score (Accuracy): 89.46%
MAE: 1355.72
RMSE: 2292.60

Overall runtime: 1122.50 minutes
