In [None]:
import importlib
import subprocess
import sys
import gc

# pip distribution names whose importable module name differs from the distribution name.
_IMPORT_NAME_OVERRIDES = {
    "scikit-learn": "sklearn",
}


def check_and_install_package(package_name, import_name=None):
    """Import a package and pip-install it into the running interpreter if the import fails.

    Args:
        package_name: Distribution name, as passed to ``pip install``.
        import_name: Module name used for the import probe. When omitted, the
            entry for ``package_name`` in ``_IMPORT_NAME_OVERRIDES`` is used, and
            failing that ``package_name`` itself.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a non-zero status.
    """
    # The probe must use the module name: e.g. "scikit-learn" installs the module "sklearn",
    # so importing the distribution name would always fail and trigger a needless reinstall.
    module_name = import_name or _IMPORT_NAME_OVERRIDES.get(package_name, package_name)
    try:
        importlib.import_module(module_name)
        print(f"\n{package_name} is already installed.")
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        # Let the import system notice modules that were added while the kernel was running
        importlib.invalidate_caches()
        print(f"{package_name} installation completed.")

# Distribution names the notebook needs before any other cell runs
packages = [
    "tqdm", "dask", "nltk", "scikit-learn",
    "numpy", "pyspark", "gdown",
]

# Probe each one in turn and install whichever cannot be imported
for package in packages:
    check_and_install_package(package)



tqdm is already installed.

dask is already installed.

nltk is already installed.

scikit-learn is NOT installed. Installing now...
scikit-learn installation completed.

numpy is already installed.

pyspark is already installed.

gdown is already installed.


In [None]:
# Mount Google Drive at /content/drive; the dataset and the saved models used by
# later cells are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyspark.sql import SparkSession

# Session settings, applied to the builder in the order listed
spark_conf = {
    "spark.driver.memory": "16g",
    "spark.executor.memory": "16g",
    "spark.driver.maxResultSize": "8g",
    "spark.executor.memoryOverhead": "12g",
    "spark.executor.cores": "5",
    "spark.kryoserializer.buffer.max": "2047m",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.dynamicAllocation.enabled": "true",
    "spark.hadoop.fs.file.impl": "org.apache.hadoop.fs.LocalFileSystem",
    "spark.executor.extraJavaOptions": "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4 -XX:ParallelGCThreads=4",
}

builder = SparkSession.builder.appName("RandomForestModel")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)

# Reuse an already-running session if there is one, otherwise start a new one
spark = builder.getOrCreate()

# Confirm the session is up by reporting its Spark version
print(f"Spark session started with version: {spark.version}")

Spark session started with version: 3.5.3


In [None]:
# Copy the feature-engineered parquet file from the mounted Drive to the local /content directory
!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

# NOTE: despite its name, 'output_path' is the local path the DataFrame is read from
output_path = '/content/Feature_Engineered_DF.parquet'
df = spark.read.parquet(output_path)
print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [None]:
# Report the DataFrame's (rows, columns) shape; counting rows triggers a Spark job
total_columns = len(df.columns)
total_rows = df.count()

print("The shape of the loaded DataFrame is: ({}, {})".format(total_rows, total_columns))

The shape of the loaded DataFrame is: (3000040, 47)


In [None]:
# Mean of the 'price' column, taken from the single row the aggregation returns
avg_price_row = df.agg({"price": "avg"}).first()
avg_price = avg_price_row[0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Show every column when the preview is rendered
pd.set_option('display.max_columns', None)

# Pick 5 rows at random (unseeded, so the preview differs between runs) and bring them to pandas
random_rows = df.orderBy(F.rand()).limit(5)
pandas_df = random_rows.toPandas()
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,manufactured_year,combined_fuel_economy,legroom,log_mileage,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,Pickup Truck,San Antonio,21.0,3,78238,2300.0,I4,Blue,True,18.0,71.2,26.0,270.0,Black,True,29.454,210.8,BLUE,-98.628799,Ford,5.0,Ranger,31830.0,0,3.583333,McCombs Ford West,265.22,A,Automatic,4X2,126.8,85.8,2020,23.5,77.6,8.91,5,-0.12,0.0,8,9,2020,0,31,40,33
1,Gasoline,SUV / Crossover,Norwood,21.0,6,2062,2000.0,I4,Silver,True,16.6,66.3,27.0,247.0,Other,False,42.193699,188.9,SILVER,-71.184097,Land Rover,5.0,Range Rover Velar,59900.0,4025,4.857143,Land Rover Norwood,269.0,A,8-Speed Automatic,All-Wheel Drive,113.1,84.4,2020,24.0,77.3,9.18,10,0.0,-0.00023,3,9,2020,0,34,43,41
2,Gasoline,SUV / Crossover,Kenmore,15.0,54,14217,3600.0,V6,Other,True,22.0,69.9,22.0,281.0,Other,False,42.972,203.7,GRAY,-78.871201,Chevrolet,8.0,Traverse,17888.0,1936,4.347826,Paddock Chevrolet,266.0,A,Automatic,Front-Wheel Drive,118.9,78.5,2017,18.5,78.1,10.66,5,0.19,0.00273,18,7,2020,3,20,38,33
3,Gasoline,SUV / Crossover,Brooklyn Center,26.0,131,55429,1300.0,I3,Other,True,13.2,64.1,29.0,150.0,Black,True,45.077702,171.4,UNKNOWN,-93.334702,Buick,5.0,Encore GX,24975.0,0,4.538462,Luther Brookdale Chevrolet Buick GMC,174.0,A,Automatic,Four-Wheel Drive,102.2,71.4,2020,27.5,76.9,8.91,1,1.39,0.93144,3,5,2020,0,22,35,30
4,Flex Fuel Vehicle,Minivan,Rio Linda,17.0,14,95673,3600.0,V6,Beige,False,20.0,67.9,25.0,283.0,Other,False,38.68,202.8,BROWN,-121.481003,Chrysler,7.0,Town & Country,19590.0,2456,3.666667,Carvana,260.0,A,6-Speed Automatic,Front-Wheel Drive,121.2,88.5,2016,21.0,77.2,10.61,4,0.2,-0.01933,28,8,2020,4,23,37,32




---



## **Random Forest**

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
from tqdm import tqdm
import warnings
import time

# Record the wall-clock start of this cell
start_time = time.time()

# Rebuild the evaluation split; the bar advances once per stage (5 in total)
with tqdm(total=5, desc="Processing Test Data") as pbar:

    # Stage 1: seeded 20% sample of the full DataFrame
    df_sample = df.sample(fraction=0.2, seed=42)
    pbar.update(1)

    # Stage 2: keep strictly positive prices (avoids log-transform issues) and add the log target
    df_sample = (
        df_sample
        .filter(F.col("price") > 0)
        .withColumn("log_price", F.log("price"))
    )
    pbar.update(1)

    # Stage 3: one StringIndexer -> OneHotEncoder pair per string-typed column
    cat_columns = [name for name, dtype in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexed_name = f"{col_name}_indexed"
        stages.append(StringIndexer(inputCol=col_name, outputCol=indexed_name, handleInvalid="keep"))
        stages.append(OneHotEncoder(inputCol=indexed_name, outputCol=f"{col_name}_encoded"))
    pbar.update(1)

    # Stage 4: assemble, scale and run the pipeline
    # 'franchise_dealer' is cast to int so it enters the feature vector as a number
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Features = every non-target, non-string column (in DataFrame order) followed by the encoded columns
    non_feature_columns = {"price", "log_price", *cat_columns}
    num_columns = [name for name in df_sample.columns if name not in non_feature_columns]
    encoded_columns = [f"{name}_encoded" for name in cat_columns]
    feature_columns = num_columns + encoded_columns

    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    # Standardise the assembled vector (zero mean, unit variance) into 'scaled_features'
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages.extend([assembler, scaler])

    # NOTE(review): the pipeline is fitted on this sample, not loaded from training — confirm it matches the one the saved model was trained with
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Stage 5: seeded 80/20 split; only the 20% part is kept for evaluation
    _, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

print("Data preprocessing for test data completed.")


Processing Test Data: 100%|██████████| 5/5 [00:20<00:00,  4.15s/it]

Data preprocessing for test data completed.





In [None]:
# Drive location of the persisted Random Forest model
saved_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/Random_Forest_model_FE"

# Restore the fitted model from disk
loaded_model = RandomForestRegressionModel.load(saved_model_path)
print(f"Model loaded successfully from {saved_model_path}")

# Score the test split, then exponentiate so predictions are back on the original price scale
print("Making predictions with the loaded model on test data...")
predictions = (
    loaded_model.transform(test_df)
    .withColumn("exp_prediction", F.exp("prediction"))
)

print("Evaluating the loaded model...")

# Every metric compares the true price with the back-transformed prediction
metric_columns = {"labelCol": "price", "predictionCol": "exp_prediction"}

# R2 score, reported as a percentage
evaluator = RegressionEvaluator(metricName="r2", **metric_columns)
r2 = evaluator.evaluate(predictions)
print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Error metrics, in price units (MSE in squared units)
mae_evaluator = RegressionEvaluator(metricName="mae", **metric_columns)
mae = mae_evaluator.evaluate(predictions)

mse_evaluator = RegressionEvaluator(metricName="mse", **metric_columns)
mse = mse_evaluator.evaluate(predictions)

rmse_evaluator = RegressionEvaluator(metricName="rmse", **metric_columns)
rmse = rmse_evaluator.evaluate(predictions)

print("\nAdditional Metrics:")
for metric_label, metric_value in (
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
    ("Root Mean Squared Error", rmse),
):
    print(f"{metric_label}: {round(metric_value)}")

Model loaded successfully from /content/drive/MyDrive/Big Data Analytics - Project/models/Random_Forest_model_FE
Making predictions with the loaded model on test data...
Evaluating the loaded model...

R-Squared Score (Accuracy): 86.35%

Additional Metrics:
Mean Absolute Error: 3488
Mean Squared Error: 44980420
Root Mean Squared Error: 6707


In [None]:
# Drive location of the persisted Random Forest model
saved_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/Random_Forest_model_FE"

# Restore the model, score the test split and map predictions back to the price scale
loaded_model = RandomForestRegressionModel.load(saved_model_path)
predictions = (
    loaded_model.transform(test_df)
    .withColumn("exp_prediction", F.exp("prediction"))
)

# Column expressions shared by both error measures
actual_price = F.col("price")
predicted_price = F.col("exp_prediction")
absolute_error = F.abs(actual_price - predicted_price)

# MAPE: per-row |actual - predicted| / actual in percent, averaged over the split
mape = predictions.withColumn(
    "percentage_error",
    F.abs((actual_price - predicted_price) / actual_price) * 100
)
mape_value = mape.select(F.mean("percentage_error")).first()[0]

# SMAPE: per-row |actual - predicted| over the mean of |actual| and |predicted|, in percent
smape = predictions.withColumn(
    "smape_error",
    absolute_error / ((F.abs(actual_price) + F.abs(predicted_price)) / 2) * 100
)
smape_value = smape.select(F.mean("smape_error")).first()[0]

print(f"Mean Absolute Percentage Error (MAPE): {round(mape_value, 2)}%")
print(f"Symmetric Mean Absolute Percentage Error (SMAPE): {round(smape_value, 2)}%")


Mean Absolute Percentage Error (MAPE): 12.1%
Symmetric Mean Absolute Percentage Error (SMAPE): 11.8%


In [None]:
from pyspark.sql import functions as F

# Path where the model is saved
saved_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/Random_Forest_model_FE"

# Load the saved Random Forest model
loaded_model = RandomForestRegressionModel.load(saved_model_path)

# Make predictions on the test data and exponentiate them back to the original price scale
predictions = loaded_model.transform(test_df)
predictions = predictions.withColumn("exp_prediction", F.exp("prediction"))

# Relative-error buckets as (inclusive lower bound, exclusive upper bound).
# Errors of 100% or more fall in no bucket, so the shares may sum to less than 100%.
error_ranges = {
    "0-10%": (0.0, 0.10),
    "10-20%": (0.10, 0.20),
    "20-30%": (0.20, 0.30),
    "30-40%": (0.30, 0.40),
    "40-50%": (0.40, 0.50),
    "50-60%": (0.50, 0.60),
    "60-70%": (0.60, 0.70),
    "70-80%": (0.70, 0.80),
    "80-90%": (0.80, 0.90),
    "90-100%": (0.90, 1.0)
}

# |price - prediction| / price, built once and reused by every bucket
relative_error = F.abs((F.col("price") - F.col("exp_prediction")) / F.col("price"))

# One mean-of-indicator expression per bucket; evaluating them in a single agg()
# scores the test data once instead of once per bucket.
bucket_shares = [
    F.mean(
        F.when((relative_error >= lower) & (relative_error < upper), 1).otherwise(0)
    ).alias(f"bucket_{idx}")
    for idx, (lower, upper) in enumerate(error_ranges.values())
]
share_row = predictions.agg(*bucket_shares).collect()[0]

# Row values come back in the same order as the expressions, i.e. the order of error_ranges
distribution_results = {
    label: share * 100 for label, share in zip(error_ranges, share_row)
}

# Print the distribution table
print("Error Range Distribution Table for Random Forest:\n")
print(f"{'Error Range':<15} | {'Percentage of Total Predictions (%)':<10}")
print("-" * 50)
for error_range, percentage in distribution_results.items():
    print(f"{error_range:<15} | {percentage:<10.2f}")


Error Range Distribution Table for Random Forest:

Error Range     | Percentage of Total Predictions (%)
--------------------------------------------------
0-10%           | 54.00     
10-20%          | 30.14     
20-30%          | 9.97      
30-40%          | 3.14      
40-50%          | 1.22      
50-60%          | 0.61      
60-70%          | 0.33      
70-80%          | 0.17      
80-90%          | 0.12      
90-100%         | 0.08      


In [None]:
tolerance_levels = [i / 100 for i in range(10, 51, 10)]  # [0.10, 0.20, 0.30, 0.40, 0.50]

# |price - prediction| / price, shared by every tolerance level
relative_error = F.abs((F.col("price") - F.col("exp_prediction")) / F.col("price"))

# Share of predictions whose relative error is within each tolerance; all levels are
# computed in one agg() so the test data is scored once rather than once per level.
tolerance_shares = [
    F.mean(F.when(relative_error <= tolerance, 1).otherwise(0)).alias(f"tolerance_{idx}")
    for idx, tolerance in enumerate(tolerance_levels)
]
share_row = predictions.agg(*tolerance_shares).collect()[0]

# Accuracy (in percent) keyed by the tolerance label, e.g. "10%".
# round() rather than int() so float noise in tolerance * 100 cannot truncate the label.
accuracy_results = {
    f"{round(tolerance * 100)}%": share * 100
    for tolerance, share in zip(tolerance_levels, share_row)
}

# Display the results for each tolerance level
print("Summary of Accuracy Results:")
for tolerance, acc in accuracy_results.items():
    print(f"Tolerance Level: {tolerance} - Accuracy: {acc:.2f}%")


Summary of Accuracy Results:
Tolerance Level: 10% - Accuracy: 54.00%
Tolerance Level: 20% - Accuracy: 84.14%
Tolerance Level: 30% - Accuracy: 94.11%
Tolerance Level: 40% - Accuracy: 97.26%
Tolerance Level: 50% - Accuracy: 98.48%




---



## **Linear Regression**

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
from tqdm import tqdm
import warnings
import time

# Record the wall-clock start of this cell
start_time = time.time()

# Rebuild the evaluation split; the bar advances once per stage (4 in total)
with tqdm(total=4, desc="Processing Test Data") as pbar:

    # Stage 1: seeded 20% sample of the full DataFrame
    df_sample = df.sample(fraction=0.2, seed=42)
    pbar.update(1)

    # Stage 2: one StringIndexer -> OneHotEncoder pair per string-typed column
    cat_columns = [name for name, dtype in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexed_name = f"{col_name}_indexed"
        stages.append(StringIndexer(inputCol=col_name, outputCol=indexed_name, handleInvalid="keep"))
        stages.append(OneHotEncoder(inputCol=indexed_name, outputCol=f"{col_name}_encoded"))
    pbar.update(1)

    # Stage 3: assemble, scale and run the pipeline
    # 'franchise_dealer' is cast to int so it enters the feature vector as a number
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Features = every non-target, non-string column (in DataFrame order) followed by the encoded columns
    non_feature_columns = {"price", *cat_columns}
    num_columns = [name for name in df_sample.columns if name not in non_feature_columns]
    encoded_columns = [f"{name}_encoded" for name in cat_columns]
    feature_columns = num_columns + encoded_columns

    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    # Standardise the assembled vector (zero mean, unit variance) into 'scaled_features'
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages.extend([assembler, scaler])

    # NOTE(review): the pipeline is fitted on this sample, not loaded from training — confirm it matches the one the saved model was trained with
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Stage 4: seeded 80/20 split; only the 20% part is kept for evaluation
    _, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

print("Data preprocessing for test data completed.")


Processing Test Data: 100%|██████████| 4/4 [00:24<00:00,  6.01s/it]

Data preprocessing for test data completed.





In [None]:
# Drive location of the persisted Linear Regression model
saved_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/Linear_Regression_model_FE"

# Restore the fitted model from disk
loaded_model = LinearRegressionModel.load(saved_model_path)
print(f"Model loaded successfully from {saved_model_path}")

# Score the test split
print("Making predictions with the loaded model on test data...")
predictions = loaded_model.transform(test_df)

print("Evaluating the loaded model...")

# Every metric compares the true price with the model's 'prediction' column
metric_columns = {"labelCol": "price", "predictionCol": "prediction"}

# R2 score, reported as a percentage
evaluator = RegressionEvaluator(metricName="r2", **metric_columns)
r2 = evaluator.evaluate(predictions)
print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Error metrics, in price units (MSE in squared units)
mae_evaluator = RegressionEvaluator(metricName="mae", **metric_columns)
mae = mae_evaluator.evaluate(predictions)

mse_evaluator = RegressionEvaluator(metricName="mse", **metric_columns)
mse = mse_evaluator.evaluate(predictions)

rmse_evaluator = RegressionEvaluator(metricName="rmse", **metric_columns)
rmse = rmse_evaluator.evaluate(predictions)

print("\nAdditional Metrics:")
for metric_label, metric_value in (
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
    ("Root Mean Squared Error", rmse),
):
    print(f"{metric_label}: {round(metric_value)}")

Model loaded successfully from /content/drive/MyDrive/Big Data Analytics - Project/models/Linear_Regression_model_FE
Making predictions with the loaded model on test data...
Evaluating the loaded model...

R-Squared Score (Accuracy): 84.20%

Additional Metrics:
Mean Absolute Error: 4131
Mean Squared Error: 52079104
Root Mean Squared Error: 7217


In [None]:
# Load the saved Linear Regression model from its Drive location
saved_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/Linear_Regression_model_FE"
loaded_model = LinearRegressionModel.load(saved_model_path)

# Score the test split; the MAPE/SMAPE helpers below read the resulting "prediction" column
predictions = loaded_model.transform(test_df)

def calculate_mape(df, label_col="price", prediction_col="prediction"):
    """Return the Mean Absolute Percentage Error of a predictions DataFrame, in percent.

    Args:
        df: Spark DataFrame holding both the label and the prediction column.
        label_col: Name of the column with the true values.
        prediction_col: Name of the column with the predicted values.

    Returns:
        Mean of |label - prediction| / label over all rows, multiplied by 100.
    """
    actual = F.col(label_col)
    predicted = F.col(prediction_col)
    scored = df.withColumn("abs_percentage_error", F.abs((actual - predicted) / actual))
    mean_error = scored.select(F.mean("abs_percentage_error")).collect()[0][0]
    # Convert the fraction to a percentage
    return mean_error * 100

def calculate_smape(df, label_col="price", prediction_col="prediction"):
    """Return the Symmetric Mean Absolute Percentage Error of a predictions DataFrame, in percent.

    Args:
        df: Spark DataFrame holding both the label and the prediction column.
        label_col: Name of the column with the true values.
        prediction_col: Name of the column with the predicted values.

    Returns:
        Mean of 2 * |label - prediction| / (|label| + |prediction|) over all rows,
        multiplied by 100.
    """
    actual = F.col(label_col)
    predicted = F.col(prediction_col)
    symmetric_error = 2 * F.abs(actual - predicted) / (F.abs(actual) + F.abs(predicted))
    scored = df.withColumn("symmetric_absolute_percentage_error", symmetric_error)
    mean_error = scored.select(F.mean("symmetric_absolute_percentage_error")).collect()[0][0]
    # Convert the fraction to a percentage
    return mean_error * 100

# Compute both percentage-error measures on the Linear Regression predictions
mape = calculate_mape(predictions)
smape = calculate_smape(predictions)

# Report them with two decimals
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"Symmetric Mean Absolute Percentage Error (SMAPE): {smape:.2f}%")


Mean Absolute Percentage Error (MAPE): 18.82%
Symmetric Mean Absolute Percentage Error (SMAPE): 18.45%


In [None]:
from pyspark.sql import functions as F

# Relative-error buckets as (inclusive lower bound, exclusive upper bound).
# Errors of 100% or more fall in no bucket, so the shares may sum to less than 100%.
error_ranges = {
    "0-10%": (0.0, 0.10),
    "10-20%": (0.10, 0.20),
    "20-30%": (0.20, 0.30),
    "30-40%": (0.30, 0.40),
    "40-50%": (0.40, 0.50),
    "50-60%": (0.50, 0.60),
    "60-70%": (0.60, 0.70),
    "70-80%": (0.70, 0.80),
    "80-90%": (0.80, 0.90),
    "90-100%": (0.90, 1.0)
}

# |price - prediction| / price, built once and reused by every bucket
relative_error = F.abs((F.col("price") - F.col("prediction")) / F.col("price"))

# One mean-of-indicator expression per bucket; evaluating them in a single agg()
# scores the test data once instead of once per bucket.
bucket_shares = [
    F.mean(
        F.when((relative_error >= lower) & (relative_error < upper), 1).otherwise(0)
    ).alias(f"bucket_{idx}")
    for idx, (lower, upper) in enumerate(error_ranges.values())
]
share_row = predictions.agg(*bucket_shares).collect()[0]

# Row values come back in the same order as the expressions, i.e. the order of error_ranges
distribution_results = {
    label: share * 100 for label, share in zip(error_ranges, share_row)
}

# This cell reports the Linear Regression predictions (the title previously said "GBT")
print("Error Range Distribution Table for Linear Regression:")
print("\n")
print(f"{'Error Range':<15} | {'Percentage of Total Predictions (%)':<10}")
print("-" * 50)
for error_range, percentage in distribution_results.items():
    print(f"{error_range:<15} | {percentage:<10.2f}")


Error Range Distribution Table for Linear Regression:


Error Range     | Percentage of Total Predictions (%)
--------------------------------------------------
0-10%           | 46.73     
10-20%          | 28.02     
20-30%          | 11.35     
30-40%          | 4.72      
40-50%          | 2.49      
50-60%          | 1.56      
60-70%          | 1.01      
70-80%          | 0.78      
80-90%          | 0.63      
90-100%         | 0.44      


In [None]:
tolerance_levels = [i / 100 for i in range(10, 51, 10)]  # [0.10, 0.20, 0.30, 0.40, 0.50]

# |price - prediction| / price, shared by every tolerance level
relative_error = F.abs((F.col("price") - F.col("prediction")) / F.col("price"))

# Share of predictions whose relative error is within each tolerance; all levels are
# computed in one agg() so the test data is scored once rather than once per level.
tolerance_shares = [
    F.mean(F.when(relative_error <= tolerance, 1).otherwise(0)).alias(f"tolerance_{idx}")
    for idx, tolerance in enumerate(tolerance_levels)
]
share_row = predictions.agg(*tolerance_shares).collect()[0]

# Accuracy (in percent) keyed by the tolerance label, e.g. "10%".
# round() rather than int() so float noise in tolerance * 100 cannot truncate the label.
accuracy_results = {
    f"{round(tolerance * 100)}%": share * 100
    for tolerance, share in zip(tolerance_levels, share_row)
}

# Display the results for each tolerance level
print("Summary of Accuracy Results:")
for tolerance, acc in accuracy_results.items():
    print(f"Tolerance Level: {tolerance} - Accuracy: {acc:.2f}%")


Summary of Accuracy Results:
Tolerance Level: 10% - Accuracy: 46.73%
Tolerance Level: 20% - Accuracy: 74.75%
Tolerance Level: 30% - Accuracy: 86.10%
Tolerance Level: 40% - Accuracy: 90.82%
Tolerance Level: 50% - Accuracy: 93.31%




---



## **GBT Regressor**

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
from tqdm import tqdm
import warnings
import time

# Silence Python warnings for the rest of the session
warnings.filterwarnings('ignore')

# Record the wall-clock start of this cell
start_time = time.time()

print("Processing the test data...")

# Rebuild the evaluation split; the bar advances once per stage (4 in total)
with tqdm(total=4, desc="Progress") as pbar:

    # Stage 1: seeded 20% sample of the full DataFrame
    df_sample = df.sample(fraction=0.2, seed=42)
    pbar.update(1)

    # Stage 2: one StringIndexer -> OneHotEncoder pair per string-typed column
    cat_columns = [name for name, dtype in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexed_name = f"{col_name}_indexed"
        stages.append(StringIndexer(inputCol=col_name, outputCol=indexed_name, handleInvalid="keep"))
        stages.append(OneHotEncoder(inputCol=indexed_name, outputCol=f"{col_name}_encoded"))
    pbar.update(1)

    # Stage 3: assemble, scale and run the pipeline
    # 'franchise_dealer' is cast to int so it enters the feature vector as a number
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Features = every non-target, non-string column (in DataFrame order) followed by the encoded columns
    non_feature_columns = {"price", *cat_columns}
    num_columns = [name for name in df_sample.columns if name not in non_feature_columns]
    encoded_columns = [f"{name}_encoded" for name in cat_columns]
    feature_columns = num_columns + encoded_columns

    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    # Standardise the assembled vector (zero mean, unit variance) into 'scaled_features'
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages.extend([assembler, scaler])

    # NOTE(review): the pipeline is fitted on this sample, not loaded from training — confirm it matches the one the saved model was trained with
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    test_df = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Stage 4: seeded 80/20 split; only the 20% part is kept for evaluation
    _, test_df = test_df.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

print("Data preprocessing for test data completed.")


Processing the test data...


Progress: 100%|██████████| 4/4 [00:13<00:00,  3.46s/it]

Data preprocessing for test data completed.





In [None]:
# Restore the persisted GBT model from its Drive location
saved_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/GBT_model_FE"
loaded_model = GBTRegressionModel.load(saved_model_path)
print(f"Model loaded successfully from {saved_model_path}")

# Score the test split
print("Making predictions with the loaded model on test data...")
predictions = loaded_model.transform(test_df)

print("Evaluating the loaded model...")

# Every metric compares the true price with the model's 'prediction' column
metric_columns = {"labelCol": "price", "predictionCol": "prediction"}

# R2 score, reported as a percentage
evaluator = RegressionEvaluator(metricName="r2", **metric_columns)
r2 = evaluator.evaluate(predictions)
print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Error metrics, in price units (MSE in squared units)
mae_evaluator = RegressionEvaluator(metricName="mae", **metric_columns)
mae = mae_evaluator.evaluate(predictions)

mse_evaluator = RegressionEvaluator(metricName="mse", **metric_columns)
mse = mse_evaluator.evaluate(predictions)

rmse_evaluator = RegressionEvaluator(metricName="rmse", **metric_columns)
rmse = rmse_evaluator.evaluate(predictions)

print("\nAdditional Metrics:")
for metric_label, metric_value in (
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
    ("Root Mean Squared Error", rmse),
):
    print(f"{metric_label}: {round(metric_value)}")

Model loaded successfully from /content/drive/MyDrive/Big Data Analytics - Project/models/GBT_model_FE
Making predictions with the loaded model on test data...
Evaluating the loaded model...

R-Squared Score (Accuracy): 88.57%

Additional Metrics:
Mean Absolute Error: 3649
Mean Squared Error: 37663588
Root Mean Squared Error: 6137


In [None]:
# Load the saved GBT model from its Drive location
saved_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/GBT_model_FE"
loaded_model = GBTRegressionModel.load(saved_model_path)

# Score the test split; the MAPE/SMAPE helpers below read the resulting "prediction" column
predictions = loaded_model.transform(test_df)

def calculate_mape(df, label_col="price", prediction_col="prediction"):
    """Return the Mean Absolute Percentage Error of a predictions DataFrame, in percent.

    Args:
        df: Spark DataFrame holding both the label and the prediction column.
        label_col: Name of the column with the true values.
        prediction_col: Name of the column with the predicted values.

    Returns:
        Mean of |label - prediction| / label over all rows, multiplied by 100.
    """
    actual = F.col(label_col)
    predicted = F.col(prediction_col)
    scored = df.withColumn("abs_percentage_error", F.abs((actual - predicted) / actual))
    mean_error = scored.select(F.mean("abs_percentage_error")).collect()[0][0]
    # Convert the fraction to a percentage
    return mean_error * 100

def calculate_smape(df, label_col="price", prediction_col="prediction"):
    """Return the Symmetric Mean Absolute Percentage Error of a predictions DataFrame, in percent.

    Args:
        df: Spark DataFrame holding both the label and the prediction column.
        label_col: Name of the column with the true values.
        prediction_col: Name of the column with the predicted values.

    Returns:
        Mean of 2 * |label - prediction| / (|label| + |prediction|) over all rows,
        multiplied by 100.
    """
    actual = F.col(label_col)
    predicted = F.col(prediction_col)
    symmetric_error = 2 * F.abs(actual - predicted) / (F.abs(actual) + F.abs(predicted))
    scored = df.withColumn("symmetric_absolute_percentage_error", symmetric_error)
    mean_error = scored.select(F.mean("symmetric_absolute_percentage_error")).collect()[0][0]
    # Convert the fraction to a percentage
    return mean_error * 100

# Compute both percentage-error measures on the GBT predictions
mape = calculate_mape(predictions)
smape = calculate_smape(predictions)

# Report them with two decimals
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"Symmetric Mean Absolute Percentage Error (SMAPE): {smape:.2f}%")


Mean Absolute Percentage Error (MAPE): 14.43%
Symmetric Mean Absolute Percentage Error (SMAPE): 13.56%


In [None]:
from pyspark.sql import functions as F

# Relative-error buckets as (inclusive lower bound, exclusive upper bound).
# Errors of 100% or more fall in no bucket, so the shares may sum to less than 100%.
error_ranges = {
    "0-10%": (0.0, 0.10),
    "10-20%": (0.10, 0.20),
    "20-30%": (0.20, 0.30),
    "30-40%": (0.30, 0.40),
    "40-50%": (0.40, 0.50),
    "50-60%": (0.50, 0.60),
    "60-70%": (0.60, 0.70),
    "70-80%": (0.70, 0.80),
    "80-90%": (0.80, 0.90),
    "90-100%": (0.90, 1.0)
}

# |price - prediction| / price, built once and reused by every bucket
relative_error = F.abs((F.col("price") - F.col("prediction")) / F.col("price"))

# One mean-of-indicator expression per bucket; evaluating them in a single agg()
# scores the test data once instead of once per bucket.
bucket_shares = [
    F.mean(
        F.when((relative_error >= lower) & (relative_error < upper), 1).otherwise(0)
    ).alias(f"bucket_{idx}")
    for idx, (lower, upper) in enumerate(error_ranges.values())
]
share_row = predictions.agg(*bucket_shares).collect()[0]

# Row values come back in the same order as the expressions, i.e. the order of error_ranges
distribution_results = {
    label: share * 100 for label, share in zip(error_ranges, share_row)
}

print("Error Range Distribution Table for GBT:")
print("\n")
print(f"{'Error Range':<15} | {'Percentage of Total Predictions (%)':<10}")
print("-" * 50)
for error_range, percentage in distribution_results.items():
    print(f"{error_range:<15} | {percentage:<10.2f}")


Error Range Distribution Table for GBT:


Error Range     | Percentage of Total Predictions (%)
--------------------------------------------------
0-10%           | 49.23     
10-20%          | 29.53     
20-30%          | 11.95     
30-40%          | 4.62      
40-50%          | 2.01      
50-60%          | 0.93      
60-70%          | 0.56      
70-80%          | 0.32      
80-90%          | 0.20      
90-100%         | 0.13      


In [None]:
tolerance_levels = [i / 100 for i in range(10, 51, 10)]  # [0.10, 0.20, 0.30, 0.40, 0.50]

# |price - prediction| / price, shared by every tolerance level
relative_error = F.abs((F.col("price") - F.col("prediction")) / F.col("price"))

# Share of predictions whose relative error is within each tolerance; all levels are
# computed in one agg() so the test data is scored once rather than once per level.
tolerance_shares = [
    F.mean(F.when(relative_error <= tolerance, 1).otherwise(0)).alias(f"tolerance_{idx}")
    for idx, tolerance in enumerate(tolerance_levels)
]
share_row = predictions.agg(*tolerance_shares).collect()[0]

# Accuracy (in percent) keyed by the tolerance label, e.g. "10%".
# round() rather than int() so float noise in tolerance * 100 cannot truncate the label.
accuracy_results = {
    f"{round(tolerance * 100)}%": share * 100
    for tolerance, share in zip(tolerance_levels, share_row)
}

# Display the results for each tolerance level
print("Summary of Accuracy Results:")
for tolerance, acc in accuracy_results.items():
    print(f"Tolerance Level: {tolerance} - Accuracy: {acc:.2f}%")


Summary of Accuracy Results:
Tolerance Level: 10% - Accuracy: 49.23%
Tolerance Level: 20% - Accuracy: 78.75%
Tolerance Level: 30% - Accuracy: 90.70%
Tolerance Level: 40% - Accuracy: 95.33%
Tolerance Level: 50% - Accuracy: 97.33%




---



## **Decision Tree Regressor**

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
from tqdm import tqdm
import warnings
import time

# Silence Python warnings from here on
warnings.filterwarnings('ignore')

# Reference timestamp for runtime tracking
start_time = time.time()

print("Processing the test data...")

# Four progress ticks: sampling, encoder setup, pipeline fit/transform, split
with tqdm(total=4, desc="Progress") as pbar:

    # Tick 1 - draw a seeded 20% sample of the full DataFrame
    # (the original note says this mirrors the fraction used in training)
    df_sample = df.sample(fraction=0.2, seed=42)
    pbar.update(1)

    # Tick 2 - every string-typed column gets a StringIndexer -> OneHotEncoder pair
    cat_columns = [name for name, dtype in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexed_name = f"{col_name}_indexed"
        indexer = StringIndexer(inputCol=col_name, outputCol=indexed_name, handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=indexed_name, outputCol=f"{col_name}_encoded")
        stages.extend([indexer, encoder])
    pbar.update(1)

    # Cast 'franchise_dealer' to int before it is handed to the assembler
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Feature list: every column that is neither the label nor categorical,
    # followed by the one-hot encoded outputs
    excluded_columns = ['price'] + cat_columns
    num_columns = [name for name in df_sample.columns if name not in excluded_columns]
    encoded_columns = [f"{name}_encoded" for name in cat_columns]
    feature_columns = num_columns + encoded_columns

    # Assemble into 'features', then standardise into 'scaled_features'
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages.extend([assembler, scaler])

    # Tick 3 - fit the preprocessing pipeline on the sample and apply it to the same rows
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    test_df = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Tick 4 - keep only the 20% side of a seeded 80/20 split for evaluation
    test_df = test_df.randomSplit([0.8, 0.2], seed=42)[1]
    pbar.update(1)

print("Data preprocessing for test data completed.")

Processing the test data...


Progress: 100%|██████████| 4/4 [00:19<00:00,  4.99s/it]

Data preprocessing for test data completed.





In [None]:
# Restore the persisted Decision Tree regressor from the mounted Drive folder
saved_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/Decision_Tree_Regression_model_FE"
loaded_model = DecisionTreeRegressionModel.load(saved_model_path)
print(f"Model loaded successfully from {saved_model_path}")

# Score the prepared test rows with the restored model
print("Making predictions with the loaded model on test data...")
predictions = loaded_model.transform(test_df)

print("Evaluating the loaded model...")

# One evaluator per metric, each comparing 'price' against 'prediction'
evaluator, mae_evaluator, mse_evaluator, rmse_evaluator = (
    RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName=metric)
    for metric in ("r2", "mae", "mse", "rmse")
)

# R2 is reported first, scaled to a percentage
r2 = evaluator.evaluate(predictions)
print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Remaining error metrics, rounded to whole numbers when printed
mae = mae_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)
rmse = rmse_evaluator.evaluate(predictions)

print("\nAdditional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

Model loaded successfully from /content/drive/MyDrive/Big Data Analytics - Project/models/Decision_Tree_Regression_model_FE
Making predictions with the loaded model on test data...
Evaluating the loaded model...

R-Squared Score (Accuracy): 88.38%

Additional Metrics:
Mean Absolute Error: 3161
Mean Squared Error: 38316567
Root Mean Squared Error: 6190


In [None]:
from pyspark.sql import functions as F
from pyspark.ml.regression import DecisionTreeRegressionModel

# function to calculate MAPE
def calculate_mape(df, label_col="price", prediction_col="prediction"):
    """Return the Mean Absolute Percentage Error of ``df`` as a percentage.

    Each row contributes ``abs((label - prediction) / label)``; the metric is
    the mean of those values multiplied by 100.

    Args:
        df: Spark DataFrame containing the label and prediction columns.
        label_col: Name of the ground-truth column.
        prediction_col: Name of the predicted-value column.

    Returns:
        float: MAPE in percent (e.g. ``11.78`` for 11.78%). ``nan`` is returned
        when the aggregate is undefined (for example an empty DataFrame),
        instead of failing with ``TypeError`` on ``None * 100``.
    """
    label = F.col(label_col)
    abs_percentage_error = F.abs((label - F.col(prediction_col)) / label)

    # NOTE(review): under Spark's default non-ANSI settings a zero label should
    # produce a null error that the mean skips - confirm if ANSI mode is enabled.
    mape = df.select(F.mean(abs_percentage_error)).collect()[0][0]

    # The aggregate is None when no row produced a non-null error.
    if mape is None:
        return float("nan")
    return mape * 100  # MAPE as a percentage

# function to calculate SMAPE
def calculate_smape(df, label_col="price", prediction_col="prediction"):
    """Return the Symmetric Mean Absolute Percentage Error of ``df`` as a percentage.

    Each row contributes
    ``2 * abs(label - prediction) / (abs(label) + abs(prediction))``; the metric
    is the mean of those values multiplied by 100.

    Args:
        df: Spark DataFrame containing the label and prediction columns.
        label_col: Name of the ground-truth column.
        prediction_col: Name of the predicted-value column.

    Returns:
        float: SMAPE in percent (e.g. ``11.13`` for 11.13%). ``nan`` is returned
        when the aggregate is undefined (for example an empty DataFrame),
        instead of failing with ``TypeError`` on ``None * 100``.
    """
    label = F.col(label_col)
    prediction = F.col(prediction_col)
    symmetric_error = 2 * F.abs(label - prediction) / (F.abs(label) + F.abs(prediction))

    # NOTE(review): rows where label and prediction are both zero have a zero
    # denominator; under default non-ANSI settings that should yield a null the
    # mean skips - confirm if ANSI mode is enabled.
    smape = df.select(F.mean(symmetric_error)).collect()[0][0]

    # The aggregate is None when no row produced a non-null error.
    if smape is None:
        return float("nan")
    return smape * 100  # SMAPE as a percentage

# MAPE of the current predictions ('price' vs 'prediction'), reported in percent
mape = calculate_mape(predictions, label_col="price", prediction_col="prediction")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# SMAPE over the same rows and columns, reported in percent
smape = calculate_smape(predictions, label_col="price", prediction_col="prediction")
print(f"Symmetric Mean Absolute Percentage Error (SMAPE): {smape:.2f}%")


Mean Absolute Percentage Error (MAPE): 11.78%
Symmetric Mean Absolute Percentage Error (SMAPE): 11.13%


In [None]:
from pyspark.sql import functions as F

# Relative-error buckets for the distribution table. Each bucket is the
# half-open interval [lower, upper); errors of 100% or more fall outside every
# bucket, so the reported percentages need not add up to 100.
error_ranges = {
    "0-10%": (0.0, 0.10),
    "10-20%": (0.10, 0.20),
    "20-30%": (0.20, 0.30),
    "30-40%": (0.30, 0.40),
    "40-50%": (0.40, 0.50),
    "50-60%": (0.50, 0.60),
    "60-70%": (0.60, 0.70),
    "70-80%": (0.70, 0.80),
    "80-90%": (0.80, 0.90),
    "90-100%": (0.90, 1.0)
}

# The relative error |price - prediction| / |price| is the same for every
# bucket, so the column expression is built once.
relative_error = F.abs((F.col("price") - F.col("prediction")) / F.col("price"))

# A single aggregation returns the share of rows in every bucket, instead of
# launching one Spark action (and re-evaluating the prediction lineage) per bucket.
bucket_shares = predictions.agg(*[
    F.mean(
        F.when((relative_error >= lower) & (relative_error < upper), 1).otherwise(0)
    ).alias(f"bucket_{position}")
    for position, (lower, upper) in enumerate(error_ranges.values())
]).collect()[0]

# Aggregates come back in the order they were listed, i.e. the dict's order.
distribution_results = {
    label: share * 100
    for label, share in zip(error_ranges, bucket_shares)
}

print("Error Range Distribution Table for Decision Trees:")
print("\n")
print(f"{'Error Range':<15} | {'Percentage of Total Predictions (%)':<10}")
print("-" * 50)
for error_range, percentage in distribution_results.items():
    print(f"{error_range:<15} | {percentage:<10.2f}")


Error Range Distribution Table for Decision Trees:


Error Range     | Percentage of Total Predictions (%)
--------------------------------------------------
0-10%           | 58.66     
10-20%          | 26.57     
20-30%          | 8.76      
30-40%          | 2.94      
40-50%          | 1.22      
50-60%          | 0.64      
60-70%          | 0.39      
70-80%          | 0.22      
80-90%          | 0.13      
90-100%         | 0.11      


In [None]:
tolerance_levels = [i / 100 for i in range(10, 51, 10)]  # [0.10, 0.20, 0.30, 0.40, 0.50]

# The relative error |price - prediction| / |price| does not depend on the
# tolerance, so the column expression is built once and shared by every level.
relative_error = F.abs((F.col("price") - F.col("prediction")) / F.col("price"))

# A single aggregation returns the hit rate for every tolerance level at once,
# rather than launching one Spark action (and re-evaluating the prediction
# lineage) per level.
hit_rates = predictions.agg(*[
    F.mean(F.when(relative_error <= tolerance, 1).otherwise(0)).alias(f"tolerance_{position}")
    for position, tolerance in enumerate(tolerance_levels)
]).collect()[0]

# Map "10%", "20%", ... to the share of predictions within that tolerance (in percent).
# round() instead of int(): int() truncates, so a product such as 0.29 * 100
# (28.999999999999996) would otherwise be labelled "28%".
accuracy_results = {
    f"{round(tolerance * 100)}%": hit_rate * 100
    for tolerance, hit_rate in zip(tolerance_levels, hit_rates)
}

# Display the results for each tolerance level
print("Summary of Accuracy Results:")
for tolerance_label, acc in accuracy_results.items():
    print(f"Tolerance Level: {tolerance_label} - Accuracy: {acc:.2f}%")


Summary of Accuracy Results:
Tolerance Level: 10% - Accuracy: 58.66%
Tolerance Level: 20% - Accuracy: 85.23%
Tolerance Level: 30% - Accuracy: 93.99%
Tolerance Level: 40% - Accuracy: 96.94%
Tolerance Level: 50% - Accuracy: 98.15%
