In [None]:
import importlib
import subprocess
import sys
import gc

def check_and_install_package(package_name, version=None):
    """Ensure a package is importable (and, if pinned, at the pinned version).

    The same name is used for the import check and as the pip requirement,
    so this only works for packages whose import name equals their pip name.

    Args:
        package_name: Name passed to ``importlib.import_module`` and to pip.
        version: Optional exact version. When given, an importable package
            whose ``__version__`` differs is reinstalled at this version.

    Raises:
        subprocess.CalledProcessError: If the pip command fails.
    """
    try:
        module = importlib.import_module(package_name)
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
    else:
        installed_version = getattr(module, "__version__", None)
        # Without a pin, or when the module does not expose __version__,
        # there is nothing to compare against: treat it as satisfied.
        if version is None or installed_version is None or str(installed_version) == str(version):
            print(f"\n{package_name} is already installed.")
            return
        print(
            f"\n{package_name} {installed_version} is installed but {version} is required. "
            "Installing now..."
        )
        # The old version is already imported in this process; the new one
        # is only picked up after the interpreter/kernel restarts.
        print(f"Restart the runtime afterwards to load {package_name}=={version}.")

    requirement = f"{package_name}=={version}" if version else package_name
    subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])
    print(f"{package_name} installation completed.")

# Dependencies of this notebook as (name, pinned version) pairs.
# A version of None means "whatever is installed / latest".
packages = [
    {"name": pkg_name, "version": pinned_version}
    for pkg_name, pinned_version in (
        ("tqdm", None),
        ("pyspark", "3.1.1"),
        ("gdown", None),
        ("numpy", "1.22.4"),
        ("xgboost", "1.5.0"),
        ("sparkxgb", None),
    )
]

# Check each dependency in turn, installing the ones that are missing
for package in packages:
    check_and_install_package(package_name=package["name"], version=package["version"])


tqdm is already installed.

pyspark is already installed.

gdown is already installed.

numpy is already installed.

xgboost is already installed.

sparkxgb is already installed.


In [None]:
!pip install xgboost==1.5.0



In [None]:
!pip install numpy==1.22.4



In [None]:
!pip install sparkxgb



In [None]:
from google.colab import drive

# Mount Google Drive; the model, jars and dataset are read from under this path
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil

# Directory on the instance's local file system that receives the jars
local_resources_path = "/resources"
os.makedirs(local_resources_path, exist_ok=True)

# Where the two XGBoost4J jars live on the mounted Google Drive
xgboost4j_source = "/content/drive/MyDrive/Big Data Analytics - Project/resources/xgboost4j_2.12-1.7.6.jar"
xgboost4j_spark_source = "/content/drive/MyDrive/Big Data Analytics - Project/resources/xgboost4j-spark_2.12-1.7.6.jar"

# Local destinations: same file names, under the local resources directory
xgboost4j_dest = os.path.join(local_resources_path, "xgboost4j_2.12-1.7.6.jar")
xgboost4j_spark_dest = os.path.join(local_resources_path, "xgboost4j-spark_2.12-1.7.6.jar")

# Copy each jar from Drive to its local destination
jar_copies = (
    (xgboost4j_source, xgboost4j_dest),
    (xgboost4j_spark_source, xgboost4j_spark_dest),
)
for jar_source, jar_dest in jar_copies:
    shutil.copyfile(jar_source, jar_dest)

# List the directory so the copy can be verified by eye
print(f"Jar Files copied to: {local_resources_path}")
print(os.listdir(local_resources_path))


Jar Files copied to: /resources
['xgboost4j-spark_2.12-1.7.6.jar', 'xgboost4j_2.12-1.7.6.jar']


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.wrapper import JavaModel

# Comma-separated list of the local XGBoost4J jars handed to Spark
jar_files = "/resources/xgboost4j_2.12-1.7.6.jar,/resources/xgboost4j-spark_2.12-1.7.6.jar"

# Spark settings for this session, applied to the builder in this order
spark_settings = {
    "spark.driver.memory": "150g",
    "spark.executor.memory": "150g",
    "spark.driver.maxResultSize": "50g",
    "spark.executor.memoryOverhead": "50g",
    "spark.executor.cores": "5",
    "spark.kryoserializer.buffer.max": "2047m",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.dynamicAllocation.enabled": "true",
    "spark.sql.shuffle.partitions": "200",
    "spark.hadoop.fs.file.impl": "org.apache.hadoop.fs.LocalFileSystem",
    "spark.executor.extraJavaOptions": "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4 -XX:ParallelGCThreads=4",
    "spark.jars": jar_files,
}

# Build (or reuse) the session with the settings above
session_builder = SparkSession.builder.appName("BoostingModel")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Confirm the session is up by printing its Spark version
print(f"Spark session started with version: {spark.version}")


Spark session started with version: 3.1.1


In [None]:
# Smoke test: import sparkxgb and construct a regressor to confirm it is usable
try:
    from sparkxgb import XGBoostRegressor

    model = XGBoostRegressor()
except Exception as e:
    print(f"Error loading sparkxgb: {e}")
else:
    print("sparkxgb loaded successfully!")


sparkxgb loaded successfully!


In [None]:
from pyspark.ml.wrapper import JavaModel

# Location of the saved XGBoost regression model on Google Drive
xgb_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE"

try:
    # Reach the Scala XGBoostRegressionModel class through the Py4J gateway
    jvm = spark.sparkContext._gateway.jvm
    XGBoostRegressionModel = jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel

    # Read the saved model on the JVM side and wrap it for use from Python
    java_model = XGBoostRegressionModel.read().load(xgb_model_path)
    xgb_model = JavaModel(java_model)
except Exception as e:
    print(f"Error loading XGBoost model: {e}")
else:
    print(f"XGBoost model loaded successfully from {xgb_model_path}")

XGBoost model loaded successfully from /content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE


In [None]:
!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

output_path = '/content/Feature_Engineered_DF.parquet'
df = spark.read.parquet(output_path)
print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [None]:
# Shape of the DataFrame: the row count runs a Spark job, the column count is read from df.columns
column_names = df.columns
total_rows = df.count()
total_columns = len(column_names)

print(f"The shape of the loaded DataFrame is: ({total_rows}, {total_columns})")

The shape of the loaded DataFrame is: (3000040, 47)


In [None]:
# Mean of the `price` column; the aggregate comes back as a single one-column row
avg_price_row = df.agg({"price": "avg"}).first()
avg_price = avg_price_row[0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Show every column when displaying the pandas frame
pd.set_option('display.max_columns', None)

# Shuffle with a random sort key, keep 5 rows, and bring only those to pandas
random_rows = df.orderBy(F.rand()).limit(5)
pandas_df = random_rows.toPandas()
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,manufactured_year,combined_fuel_economy,legroom,log_mileage,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,Pickup Truck,Desoto,17.0,203,75115,5300.0,V8,Silver,True,24.0,75.5,23.0,355.0,Black,True,32.599098,231.7,SILVER,-96.822899,Chevrolet,6.0,Silverado 1500,46700.0,0,4.785714,Chuck Fairbanks Chevrolet,383.0,A,Automatic,4X2,147.4,81.2,2020,20.0,87.9,0.69,8,2.19,1.32358,21,2,2020,0,30,44,35
1,Gasoline,SUV / Crossover,Johnstown,26.0,12,15904,2500.0,H4,Gray,True,16.6,68.1,33.0,182.0,Gray,True,40.283798,182.1,GRAY,-78.848099,Subaru,5.0,Forester,28752.0,0,4.857143,Spangler Subaru,176.0,CVT,Continuously Variable Transmission,All-Wheel Drive,105.1,81.3,2020,29.5,82.7,8.91,0,0.25,0.6123,29,8,2020,0,29,40,29
2,Gasoline,Pickup Truck,Winchester,22.690001,26,22601,2500.0,V6,Red,True,18.030001,65.87,29.469999,241.0,Gray,False,39.156898,189.8,RED,-78.186897,Toyota,5.0,Tacoma,31525.0,2347,5.0,Parsons Kia,265.22,A,Automatic,Unknown,111.0,77.2,2018,26.08,80.16,10.47,3,0.03,0.0,15,8,2020,2,26,38,29
3,Gasoline,Sedan,Carson,27.0,49,90745,2500.0,I4,Other,True,16.200001,56.9,37.0,188.0,Black,True,33.831799,192.9,UNKNOWN,-118.246002,Nissan,5.0,Altima,31828.0,0,4.5,Carson Nissan,180.0,A,Automatic,Front-Wheel Drive,111.2,72.9,2020,32.0,79.0,1.39,11,0.23,0.53135,25,7,2020,0,30,37,33
4,Gasoline,Sedan,Greenfield,29.0,48,53228,2000.0,I4,Other,True,12.4,56.9,39.0,149.0,Black,True,42.999001,182.7,UNKNOWN,-88.045303,Nissan,5.0,Sentra,21091.0,0,4.636364,Gordie Boucher Nissan of Greenfield,146.0,CVT,Continuously Variable Transmission,Front-Wheel Drive,106.8,71.5,2020,34.0,81.4,2.3,7,0.8,1.22982,24,7,2020,0,27,37,31




---



In [None]:
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean as sql_mean
import pyspark.sql.functions as F

# Ignore warnings
warnings.filterwarnings('ignore')

# DataType.typeName() values of the numeric columns that get mean-imputed.
# IntegerType reports "integer" ("int" is only its simpleString), so the
# previous check against "int" never matched integer columns.
NUMERIC_TYPE_NAMES = {"double", "float", "integer", "long"}

print("Processing the data...")
with tqdm(total=6, desc="Progress") as pbar:

    df_sample = df.sample(fraction=0.1, seed=42)   # Randomly sampling 10% of the data
    pbar.update(1)

    # Handling categorical columns: index each string column, then one-hot encode the index
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Assembling features: every non-string column except the label, plus the encoded categoricals
    num_columns = [name for name in df_sample.columns if name != 'price' and name not in cat_columns]
    encoded_columns = [f"{name}_encoded" for name in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Filling in missing values with the column mean.
    # NOTE(review): this runs after `features`/`scaled_features` were assembled,
    # so it only changes the scalar columns, not the vectors - confirm whether
    # imputation was meant to happen before the pipeline.
    numeric_columns = [
        field.name
        for field in df_sample.schema.fields
        if field.dataType.typeName() in NUMERIC_TYPE_NAMES
    ]
    if numeric_columns:
        # One aggregation for all means instead of one Spark job per column
        mean_row = df_sample.select([sql_mean(name) for name in numeric_columns]).first()
        # An all-null column has no mean; skip it rather than pass None to na.fill
        fill_values = {
            name: mean_value
            for name, mean_value in zip(numeric_columns, mean_row)
            if mean_value is not None
        }
        if fill_values:
            df_sample = df_sample.na.fill(fill_values)
    pbar.update(1)

    # Splitting the data
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

print("\n\nData preprocessing and splitting completed!")


Processing the data...


Progress: 100%|██████████| 6/6 [00:36<00:00,  6.07s/it]



Data preprocessing and splitting completed!





In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from sparkxgb import XGBoostRegressionModel
import pyspark.sql.functions as F
from pyspark.ml.wrapper import JavaModel

# Load model using JVM gateway
xgb_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE"

try:
    # Access XGBoost through JVM (this rebinds the name imported from sparkxgb above)
    jvm = spark.sparkContext._gateway.jvm
    XGBoostRegressionModel = jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel

    # Load the model and wrap the JVM object so it can be used from Python
    java_model = XGBoostRegressionModel.read().load(xgb_model_path)
    loaded_model = JavaModel(java_model)
    print(f"XGBoost model loaded successfully from {xgb_model_path}")
except Exception as e:
    print(f"Error loading XGBoost model: {e}")
    # Stop here: continuing would hit a NameError on `loaded_model` in a fresh
    # kernel, or silently score a model left over from an earlier cell.
    raise

# Make predictions on the test data
print("Making predictions with the loaded model on test data...")
predictions = loaded_model.transform(test_df)

# Evaluate the loaded model
print("Evaluating the loaded model...")


def _price_evaluator(metric_name):
    """Build a RegressionEvaluator comparing `price` with `prediction` for one metric."""
    return RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName=metric_name)


# R2 Score, reported as a percentage
evaluator = _price_evaluator("r2")
r2 = evaluator.evaluate(predictions)
print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Additional metrics: one evaluator per error measure
mae_evaluator, mse_evaluator, rmse_evaluator = (
    _price_evaluator(metric_name) for metric_name in ("mae", "mse", "rmse")
)
mae = mae_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)
rmse = rmse_evaluator.evaluate(predictions)

print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

XGBoost model loaded successfully from /content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE
Making predictions with the loaded model on test data...
Evaluating the loaded model...

R-Squared Score (Accuracy): 91.84%

Additional Metrics:
Mean Absolute Error: 3018
Mean Squared Error: 27751838
Root Mean Squared Error: 5268


In [None]:
from sparkxgb import XGBoostRegressionModel
import pyspark.sql.functions as F

# Load model using JVM gateway
xgb_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE"

try:
    # Access XGBoost through JVM (this rebinds the name imported from sparkxgb above)
    jvm = spark.sparkContext._gateway.jvm
    XGBoostRegressionModel = jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel

    # Load the model and wrap the JVM object so it can be used from Python
    java_model = XGBoostRegressionModel.read().load(xgb_model_path)
    loaded_model = JavaModel(java_model)
    print(f"XGBoost model loaded.")
except Exception as e:
    print(f"Error loading XGBoost model: {e}")
    # Stop here: continuing would hit a NameError on `loaded_model` in a fresh
    # kernel, or silently score a model left over from an earlier cell.
    raise

# Make predictions on the test data
predictions = loaded_model.transform(test_df)

# Define a function to calculate MAPE
def calculate_mape(df, label_col="price", prediction_col="prediction"):
    """Return the mean absolute percentage error of `df`, as a percentage.

    Each row contributes |label - prediction| / label; the result is the mean
    of those values multiplied by 100.

    Args:
        df: Spark DataFrame holding the label and prediction columns.
        label_col: Name of the column with the true values.
        prediction_col: Name of the column with the predicted values.
    """
    actual = F.col(label_col)
    predicted = F.col(prediction_col)
    relative_error = F.abs((actual - predicted) / actual)

    scored = df.withColumn("abs_percentage_error", relative_error)
    mean_row = scored.select(F.mean("abs_percentage_error")).first()
    return mean_row[0] * 100  # MAPE as a percentage

# Define a function to calculate SMAPE
def calculate_smape(df, label_col="price", prediction_col="prediction"):
    """Return the symmetric mean absolute percentage error of `df`, as a percentage.

    Each row contributes 2 * |label - prediction| / (|label| + |prediction|);
    the result is the mean of those values multiplied by 100.

    Args:
        df: Spark DataFrame holding the label and prediction columns.
        label_col: Name of the column with the true values.
        prediction_col: Name of the column with the predicted values.
    """
    actual = F.col(label_col)
    predicted = F.col(prediction_col)
    numerator = 2 * F.abs(actual - predicted)
    denominator = F.abs(actual) + F.abs(predicted)

    scored = df.withColumn("symmetric_absolute_percentage_error", numerator / denominator)
    mean_row = scored.select(F.mean("symmetric_absolute_percentage_error")).first()
    return mean_row[0] * 100  # SMAPE as a percentage

# MAPE of the predictions against the `price` label
mape_value = calculate_mape(predictions, label_col="price", prediction_col="prediction")
print(f"Mean Absolute Percentage Error (MAPE): {mape_value:.2f}%")

# SMAPE of the same predictions
smape_value = calculate_smape(predictions, label_col="price", prediction_col="prediction")
print(f"Symmetric Mean Absolute Percentage Error (SMAPE): {smape_value:.2f}%")


XGBoost model loaded.
Mean Absolute Percentage Error (MAPE): 11.76%
Symmetric Mean Absolute Percentage Error (SMAPE): 11.04%


In [None]:
from pyspark.sql import functions as F

# Load model using JVM gateway
xgb_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE"

try:
    # Access XGBoost through JVM
    jvm = spark.sparkContext._gateway.jvm
    XGBoostRegressionModel = jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel

    # Load the model and wrap the JVM object so it can be used from Python
    java_model = XGBoostRegressionModel.read().load(xgb_model_path)
    loaded_model = JavaModel(java_model)
    print(f"XGBoost model loaded.")
except Exception as e:
    print(f"Error loading XGBoost model: {e}")
    # Stop here: continuing would hit a NameError on `loaded_model` in a fresh
    # kernel, or silently score a model left over from an earlier cell.
    raise

# Make predictions on the test data
predictions = loaded_model.transform(test_df)

# Define error ranges for the distribution table up to 100%.
# Each bucket is half-open: lower <= error < upper.
error_ranges = {
    "0-10%": (0.0, 0.10),
    "10-20%": (0.10, 0.20),
    "20-30%": (0.20, 0.30),
    "30-40%": (0.30, 0.40),
    "40-50%": (0.40, 0.50),
    "50-60%": (0.50, 0.60),
    "60-70%": (0.60, 0.70),
    "70-80%": (0.70, 0.80),
    "80-90%": (0.80, 0.90),
    "90-100%": (0.90, 1.0)
}

# Absolute relative error of each prediction; built once and shared by every bucket
abs_pct_error = F.abs((F.col("price") - F.col("prediction")) / F.col("price"))

# One 0/1-indicator mean per bucket, all evaluated in a single Spark job
# (one job per bucket would re-run the predictions for every bucket)
bucket_share_exprs = [
    F.mean(
        F.when((abs_pct_error >= lower) & (abs_pct_error < upper), 1).otherwise(0)
    ).alias(f"bucket_{position}")
    for position, (lower, upper) in enumerate(error_ranges.values())
]
bucket_shares = predictions.agg(*bucket_share_exprs).first()

# Map each label to the percentage of predictions that fall in its bucket
distribution_results = {
    label: share * 100
    for label, share in zip(error_ranges, bucket_shares)
}

# Print the distribution table
print("Error Range Distribution Table for XGBoost:")
print("\n")
print(f"{'Error Range':<15} | {'Percentage of Total Predictions (%)':<10}")
print("-" * 50)
for error_range, percentage in distribution_results.items():
    print(f"{error_range:<15} | {percentage:<10.2f}")


XGBoost model loaded.
Error Range Distribution Table for XGBoost:


Error Range     | Percentage of Total Predictions (%)
--------------------------------------------------
0-10%           | 57.57     
10-20%          | 28.75     
20-30%          | 8.57      
30-40%          | 2.54      
40-50%          | 1.00      
50-60%          | 0.50      
60-70%          | 0.31      
70-80%          | 0.16      
80-90%          | 0.11      
90-100%         | 0.08      


In [None]:
# Load model using JVM gateway
xgb_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE"

try:
    # Access XGBoost through JVM
    jvm = spark.sparkContext._gateway.jvm
    XGBoostRegressionModel = jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel

    # Load the model and wrap the JVM object so it can be used from Python
    java_model = XGBoostRegressionModel.read().load(xgb_model_path)
    loaded_model = JavaModel(java_model)
    print(f"XGBoost model loaded.")
except Exception as e:
    print(f"Error loading XGBoost model: {e}")
    # Stop here: continuing would hit a NameError on `loaded_model` in a fresh
    # kernel, or silently score a model left over from an earlier cell.
    raise

# Make predictions on the test data
predictions = loaded_model.transform(test_df)

tolerance_levels = [i / 100 for i in range(10, 51, 10)]  # [0.10, 0.20, 0.30, 0.40, 0.50]

# Absolute relative error of each prediction; built once and shared by every tolerance
abs_pct_error = F.abs((F.col("price") - F.col("prediction")) / F.col("price"))

# One 0/1-indicator mean per tolerance level, all evaluated in a single Spark job
# (one job per tolerance would re-run the predictions for every level)
within_tolerance_exprs = [
    F.mean(F.when(abs_pct_error <= tolerance, 1).otherwise(0)).alias(f"tolerance_{position}")
    for position, tolerance in enumerate(tolerance_levels)
]
within_tolerance_shares = predictions.agg(*within_tolerance_exprs).first()

# Accuracy per tolerance level, keyed by the level as a percentage label.
# round() rather than int(): tolerance * 100 can land just below the whole
# number because of float error, and int() would truncate it.
accuracy_results = {
    f"{round(tolerance * 100)}%": share * 100
    for tolerance, share in zip(tolerance_levels, within_tolerance_shares)
}

# Display the results for each tolerance level
print("Summary of Accuracy Results:")
for tolerance, acc in accuracy_results.items():
    print(f"Tolerance Level: {tolerance} - Accuracy: {acc:.2f}%")


XGBoost model loaded.
Summary of Accuracy Results:
Tolerance Level: 10% - Accuracy: 57.57%
Tolerance Level: 20% - Accuracy: 86.33%
Tolerance Level: 30% - Accuracy: 94.89%
Tolerance Level: 40% - Accuracy: 97.43%
Tolerance Level: 50% - Accuracy: 98.43%
