In [1]:
import importlib
import subprocess
import sys
import gc

def check_and_install_package(package_name, version=None):
    try:
        importlib.import_module(package_name)
        print(f"\n{package_name} is already installed.")
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
        if version:
            subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package_name}=={version}"])
        else:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        print(f"{package_name} installation completed.")

# List of packages to check along with specific versions if necessary
packages = [
    {"name": "tqdm", "version": None},
    {"name": "pyspark", "version": "3.5.2"},
    {"name": "gdown", "version": None},
    {"name": "numpy", "version": "1.23.5"}
]

# Checking and installing the packages
for package in packages:
    check_and_install_package(package["name"], package["version"])



tqdm is already installed.

pyspark is already installed.

gdown is already installed.

numpy is already installed.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("DecisionTreeModel") \
    .config("spark.driver.memory", "16g") \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.maxResultSize", "8g") \
    .config("spark.executor.memoryOverhead", "12g") \
    .config("spark.executor.cores", "5") \
    .config("spark.kryoserializer.buffer.max", "2047m") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem") \
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4 -XX:ParallelGCThreads=4") \
    .getOrCreate()

# Verifying Spark session creation
print(f"Spark session started with version: {spark.version}")

Spark session started with version: 3.5.3


# **Predictions BEFORE Feature Engineering**

In [None]:
# loading the df

!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Processed_DF.parquet' /content/

output_path = '/content/Processed_DF.parquet'
df = spark.read.parquet(output_path)
print("The Processed DataFrame has been loaded successfully.")


The Processed DataFrame has been loaded successfully.


In [None]:
df = df.repartition(100)  # Repartitioning into 100 partitions for parallelism

In [None]:
# Printing the shape of the DataFrame
total_rows = df.count()
total_columns = len(df.columns)

print(f"The shape of the loaded DataFrame is: ({total_rows}, {total_columns})")

The shape of the loaded DataFrame is: (3000040, 42)


In [None]:
from functools import reduce
from pyspark.sql.functions import col

# Checking for rows with at least one null value in any column
rows_with_null = df.filter(
    reduce(lambda x, y: x | y, (col(c).isNull() for c in df.columns))
).count()

print(f"Number of rows with at least one null value: {rows_with_null}")

Number of rows with at least one null value: 0


### **Handling Categorical Coloumns**



In [None]:
df=df.drop('description','major_options','mileage')
# Keeping the columns ['exterior_color','dealer_zip','interior_color']

In [None]:
# Counting unique values in 'exterior_color' and 'interior_color' columns
exterior_colors_count = df.select('exterior_color').distinct().count()
interior_colors_count = df.select('interior_color').distinct().count()

print(f"Unique exterior colors: {exterior_colors_count}")
print(f"Unique interior colors: {interior_colors_count}")

Unique exterior colors: 23036
Unique interior colors: 38528


In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

# Listing top colors for both exterior and interior
colors = ['White', 'Black', 'Gray', 'Silver', 'Red', 'Blue', 'Brown', 'Green', 'Beige', 'Orange', 'Gold', 'Yellow', 'Purple']

# Creating a UDF to find colors in the color columns
@F.udf(returnType=ArrayType(StringType()))
def find_colors(color_string):
    if color_string is None or color_string.strip() == "":
        return ["Other"]  # Handle empty or null values
    found_colors = [c for c in colors if c.lower() in color_string.lower()]
    return found_colors if found_colors else ["Other"]  # Label non-matching colors as "Other"

# Applying the UDF to both the exterior and interior color columns
df = df.withColumn("exterior_color_array", find_colors("exterior_color"))
df = df.withColumn("interior_color_array", find_colors("interior_color"))

# Creating a column with the count of colors found for both exterior and interior
df = df.withColumn("exterior_color_count", F.size("exterior_color_array"))
df = df.withColumn("interior_color_count", F.size("interior_color_array"))

# Joining the color arrays into string columns
df = df.withColumn("exterior_color", F.array_join("exterior_color_array", ", "))
df = df.withColumn("interior_color", F.array_join("interior_color_array", ", "))

# Labeling mixed colors for both exterior and interior colors
df = df.withColumn(
    "exterior_color",
    F.when(F.col("exterior_color_count") > 1, "Mixed Colors")
     .otherwise(F.col("exterior_color")))

df = df.withColumn(
    "interior_color",
    F.when(F.col("interior_color_count") > 1, "Mixed Colors")
     .otherwise(F.col("interior_color")))

# Dropping temporary columns
df = df.drop("exterior_color_array", "exterior_color_count", "interior_color_array", "interior_color_count")


In [None]:
# Counting the occurrences of each exterior and interior color and calculating percentages
exterior_color_counts = df.groupBy("exterior_color").count().withColumn(
    "percentage", F.round((F.col("count") / df.count()) * 100, 2))

interior_color_counts = df.groupBy("interior_color").count().withColumn(
    "percentage", F.round((F.col("count") / df.count()) * 100, 2))

# Showing the results
print("Exterior Color Distribution:")
exterior_color_counts.orderBy(F.desc("count")).show(truncate=False)

Exterior Color Distribution:
+--------------+------+----------+
|exterior_color|count |percentage|
+--------------+------+----------+
|White         |675979|22.53     |
|Black         |580148|19.34     |
|Other         |543638|18.12     |
|Silver        |384540|12.82     |
|Blue          |253263|8.44      |
|Red           |242331|8.08      |
|Gray          |231172|7.71      |
|Green         |23026 |0.77      |
|Mixed Colors  |19728 |0.66      |
|Brown         |12905 |0.43      |
|Orange        |11638 |0.39      |
|Gold          |10544 |0.35      |
|Beige         |5065  |0.17      |
|Yellow        |4855  |0.16      |
|Purple        |1208  |0.04      |
+--------------+------+----------+



In [None]:
print("Interior Color Distribution:")
interior_color_counts.orderBy(F.desc("count")).show(truncate=False)

Interior Color Distribution:
+--------------+-------+----------+
|interior_color|count  |percentage|
+--------------+-------+----------+
|Black         |1624033|54.13     |
|Other         |577578 |19.25     |
|Gray          |383966 |12.8      |
|Mixed Colors  |171212 |5.71      |
|White         |91545  |3.05      |
|Brown         |65943  |2.2       |
|Red           |34117  |1.14      |
|Silver        |24124  |0.8       |
|Blue          |22828  |0.76      |
|Green         |2048   |0.07      |
|Gold          |1193   |0.04      |
|Orange        |1133   |0.04      |
|Yellow        |134    |0.0       |
|Purple        |121    |0.0       |
|Beige         |65     |0.0       |
+--------------+-------+----------+



In [None]:
print(f"Final processed DataFrame used for the model has {df.count()} rows and {len(df.columns)} columns.")

Final processed DataFrame used for the model has 3000040 rows and 39 columns.


In [None]:
# Calculating the average price
avg_price = df.agg({"price": "avg"}).collect()[0][0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Converting the Spark DataFrame to a Pandas DataFrame and displaying 5 random rows with all columns
pd.set_option('display.max_columns', None)
pandas_df = df.orderBy(F.rand()).limit(5).toPandas()
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,daysonmarket,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listed_date,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,year,combined_fuel_economy,legroom,log_mileage,major_options_count
0,Flex Fuel Vehicle,Minivan,Weatherford,22.690001,42,73096,3600.0,V6,Black,True,20.0,69.0,29.469999,283.0,Black,False,35.534401,203.7,2020-07-31,BLACK,-98.6558,Dodge,7.0,Grand Caravan,18699.0,894,4.333333,Cummins Chrysler,260.0,A,4-Speed Automatic,Front-Wheel Drive,121.2,88.5,2019,26.08,77.2,10.78,9
1,Gasoline,Sedan,Elmhurst,28.0,39,60126,1800.0,I4,Black,True,13.2,57.3,36.0,132.0,Black,False,41.918499,183.1,2020-08-02,BLACK,-87.9533,Toyota,5.0,Corolla,14779.0,644,4.2,Elmhurst Toyota,128.0,CVT,Continuously Variable Transmission,Front-Wheel Drive,106.3,69.9,2017,32.0,83.7,10.61,4
2,Gasoline,Sedan,Duarte,16.0,16,91010,2500.0,H4,Other,False,15.9,58.1,22.0,310.0,Black,False,34.135899,180.9,2020-08-27,UNKNOWN,-117.980003,Subaru,5.0,WRX STI,38998.0,925,4.45,CarMax San Gabriel Valley/Duarte - Now offerin...,290.0,M,Manual,All-Wheel Drive,104.3,79.5,2020,19.0,78.7,9.27,6
3,Gasoline,SUV / Crossover,Penn Yan,18.0,70,14527,3600.0,V6,Red,True,24.6,69.3,25.0,295.0,Black,False,42.6408,189.8,2020-07-02,RED,-77.044998,Jeep,5.0,Grand Cherokee,28899.0,778,4.875,Friendly Dodge Chrysler Jeep Incorporated,260.0,A,8-Speed Automatic,Four-Wheel Drive,114.8,84.8,2018,21.5,78.9,10.37,11
4,Gasoline,Hatchback,Clinton Township,22.690001,29,48036,1500.0,I4,Black,False,11.1,59.4,29.469999,106.0,Other,False,42.575298,153.5,2020-08-11,BLACK,-82.888802,Toyota,5.0,Yaris,8222.0,537,3.6,Moran Used Car Outlet,103.0,A,4-Speed Automatic Overdrive,Front-Wheel Drive,98.8,66.7,2014,26.08,73.9,11.1,0




---



## **Decision Tree Regressor**

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import time
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean as sql_mean
import pyspark.sql.functions as F

# Ignore warnings
warnings.filterwarnings('ignore')

# Starting to track the overall runtime
start_time = time.time()

# Combine processing data and model training in the same progress bar
with tqdm(total=6, desc="Processing and Training") as pbar:

    df_sample = df.sample(fraction=0.2, seed=42)  # Randomly sample 600k records of the data
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Splitting the data
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    # Training the Decision Tree Regressor model
    dt = DecisionTreeRegressor(
    featuresCol="scaled_features",
    labelCol="price",
    maxDepth=15,
    maxBins=128,
    minInstancesPerNode=5,
    minInfoGain=0.01,
    seed=42
    )

    model = dt.fit(train_df)
    pbar.update(1)


# Making predictions
predictions = model.transform(test_df)

# Evaluating the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)

# Displaying results
print(f"\nTrain size: {train_df.count():,} samples")
print(f"Test size: {test_df.count():,} samples")

# Multiplying R-Squared by 100 for percentage calculation
print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculating total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training: 100%|██████████| 6/6 [53:39<00:00, 536.61s/it]

Train size: 480,538 samples
Test size: 120,299 samples


R-Squared Score (Accuracy): 82.74%


Overall runtime: 62 minutes.


In [None]:
# Calculating additional metrics
mae_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(predictions)

mse_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mse")
mse = mse_evaluator.evaluate(predictions)

rmse_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)


print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

Additional Metrics:
Mean Absolute Error: 3265
Mean Squared Error: 64616687
Root Mean Squared Error: 8038


The Mean Absolute Error (`MAE`) of `$3,265` indicates that, on average, the predicted car prices deviate from the actual prices by this amount. Considering that the mean car price is **\$29,933**, this error accounts for approximately `10.9% of the mean price`, suggesting that while the model demonstrates reasonable performance, it exhibits a slightly higher error compared to other models.

In [None]:
# Feature Importance (for Decision Trees)
feature_importance = model.featureImportances
features_list = pipeline_model.stages[-2].getInputCols()  # Get feature names from VectorAssembler

# Sorting features by their importance in descending order
sorted_features = [feature for feature, importance in sorted(zip(features_list, feature_importance), key=lambda x: x[1], reverse=True)]

# Printing ranked features from highest to lowest
print("Top 10 Features Ranked by Importance (Highest to Lowest):")
for rank, feature in enumerate(sorted_features[:10], 1):  # Limiting to top 10 features
    print(f"{rank}. {feature}")

Top 10 Features Ranked by Importance (Highest to Lowest):
1. horsepower
2. log_mileage
3. year
4. engine_displacement
5. torque
6. maximum_seating
7. make_name_encoded
8. legroom
9. length
10. fuel_tank_volume


In [None]:
import pyspark

# Getting the list of all variables in the current notebook environment
for var_name in dir():
    # Checking if the variable is a Spark DataFrame
    if isinstance(eval(var_name), pyspark.sql.dataframe.DataFrame):
        # Deleting the variables
        del globals()[var_name]

# Performing garbage collection
import gc
_ = gc.collect()
print("All DataFrames have been deleted.")


spark.catalog.clearCache()
spark.stop()
print("Spark Stopped !")

All DataFrames have been deleted.
Spark Stopped !




---



# **Predictions AFTER Feature Engineering**

In [4]:
!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

output_path = '/content/Feature_Engineered_DF.parquet'
df = spark.read.parquet(output_path)
print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [5]:
# Printing the shape of the DataFrame
total_rows = df.count()
total_columns = len(df.columns)

print(f"The shape of the loaded DataFrame is: ({total_rows}, {total_columns})")

The shape of the loaded DataFrame is: (3000040, 47)


In [6]:
# Calculating the average price
avg_price = df.agg({"price": "avg"}).collect()[0][0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [7]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Converting the Spark DataFrame to a Pandas DataFrame and displaying 5 random rows with all columns
pd.set_option('display.max_columns', None)
pandas_df = df.orderBy(F.rand()).limit(5).toPandas()
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,manufactured_year,combined_fuel_economy,legroom,log_mileage,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,Coupe,Birmingham,23.0,175,35210,3000.0,I6,White,True,18.0,52.8,30.0,335.0,Mixed Colors,True,33.5224,191.1,WHITE,-86.664497,BMW,4.0,8 Series,92545.0,0,4.27907,BMW of Birmingham,368.0,A,Automatic,Rear-Wheel Drive,111.1,84.1,2020,26.5,71.6,1.39,4,0.04,0.93994,19,3,2020,0,31,44,32
1,Gasoline,SUV / Crossover,Indianapolis,21.0,209,46268,2000.0,I4,Red,True,19.4,67.0,27.0,230.0,Black,True,39.924702,191.4,RED,-86.230103,Chevrolet,5.0,Blazer,29921.0,0,3.48,Bill Estes Chevrolet,270.0,A,Automatic,Front-Wheel Drive,112.7,76.7,2020,24.0,80.6,2.3,8,0.14,-0.0088,14,2,2020,0,27,39,34
2,Gasoline,Pickup Truck,Starke,22.690001,28,32091,6400.0,V8,Black,True,32.0,80.2,29.469999,410.0,Black,True,29.927799,238.8,BLACK,-82.122704,RAM,5.0,3500,64355.0,0,4.291667,Murray Chrysler Dodge Jeep Ram,850.0,A,8-Speed Automatic,Four-Wheel Drive,149.0,83.5,2020,26.08,81.1,1.1,6,4.87,9.93671,13,8,2020,0,35,47,38
3,Gasoline,Sedan,Lincoln,19.0,217,68504,3300.0,V6,White,False,18.5,58.1,28.0,293.0,Mixed Colors,False,40.852798,195.5,WHITE,-96.652496,Kia,5.0,Cadenza,15995.0,507,4.769231,B & D Auto Sales,255.0,A,Automatic,Front-Wheel Drive,112.0,72.8,2015,23.5,82.3,11.17,9,0.14,-0.04855,6,2,2020,5,14,36,31
4,Gasoline,Coupe,Conway,19.0,47,29526,3600.0,V6,Black,True,18.5,57.5,30.0,305.0,Other,False,33.8545,197.9,BLACK,-79.079498,Dodge,5.0,Challenger,28995.0,1148,4.153846,Conway Chrysler Dodge Jeep Ram,268.0,A,Automatic,Rear-Wheel Drive,116.2,85.4,2019,24.5,75.1,7.52,6,0.33,0.0167,25,7,2020,1,27,41,30


In [8]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import time
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean as sql_mean
import pyspark.sql.functions as F

# Ignore warnings
warnings.filterwarnings('ignore')

# Starting to track overall runtime
start_time = time.time()

with tqdm(total=6, desc="Processing and Training") as pbar:

    df_sample = df.sample(fraction=0.2, seed=42)  # Random sampling 20% of the data
    pbar.update(1)

    # Handling categorical columns
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Splitting the data
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    # Training Decision Tree Regressor model
    dt = DecisionTreeRegressor(
    featuresCol="scaled_features",
    labelCol="price",
    maxDepth=15,
    maxBins=128,
    minInstancesPerNode=5,
    minInfoGain=0.01,
    seed=42
    )

    model = dt.fit(train_df)
    pbar.update(1)


# Making predictions
predictions = model.transform(test_df)

# Evaluating the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)

# Displaying results
print(f"\nTrain size: {train_df.count():,} samples")
print(f"Test size: {test_df.count():,} samples")

# Multiplying R-Squared by 100 for percentage calculation
print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Calculating total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training: 100%|██████████| 6/6 [1:54:54<00:00, 1149.08s/it]



Train size: 480,411 samples
Test size: 120,366 samples


R-Squared Score (Accuracy): 88.38%

Overall runtime: 133 minutes.


In [9]:
new_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/Decision_Tree_Regression_model_FE"

# Saving the trained model to the new path
model.save(new_model_path)

print(f"Model saved successfully at {new_model_path}")

Model saved successfully at /content/drive/MyDrive/Big Data Analytics - Project/models/Decision_Tree_Regression_model_FE


In [10]:
# Calculating additional metrics
mae_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(predictions)

mse_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mse")
mse = mse_evaluator.evaluate(predictions)

rmse_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)


print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

Additional Metrics:
Mean Absolute Error: 3161
Mean Squared Error: 38316567
Root Mean Squared Error: 6190


In **Decision Tree Regressor**, the model saw an **improvement in R² accuracy by 5.64%**, while the Mean Absolute Error **(MAE) was reduced by 0.50%**, and the Root Mean Square Error **(RMSE) was reduced by 6.17%**, demonstrating enhanced predictive performance.

In [None]:
# Feature Importance (for Decision Trees)
feature_importance = model.featureImportances
features_list = pipeline_model.stages[-2].getInputCols()  # Get feature names from VectorAssembler

# Sorting features by their importance in descending order
sorted_features = [feature for feature, importance in sorted(zip(features_list, feature_importance), key=lambda x: x[1], reverse=True)]

# Printing ranked features from highest to lowest
print("Top 10 Features Ranked by Importance (Highest to Lowest):")
for rank, feature in enumerate(sorted_features[:10], 1):  # Limiting to top 10 features
    print(f"{rank}. {feature}")

Top 10 Features Ranked by Importance (Highest to Lowest):
1. maintenance_cost
2. log_mileage
3. horsepower
4. maximum_seating
5. torque
6. sp_name_encoded
7. hp_x_torque
8. fuel_tank_volume
9. width
10. is_new
