In [None]:
import importlib
import subprocess
import sys
import gc

def check_and_install_package(package_name, version=None):
    """Ensure ``package_name`` is importable, installing it with pip when needed.

    Args:
        package_name: Name used both for the import check and for ``pip install``.
        version: Optional exact version to pin (``pip install name==version``).
            When given and a different version is currently installed, the
            pinned version is installed instead of reporting the package as
            already present.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a non-zero status.
    """
    from importlib import metadata  # local import: only `importlib` itself is imported at cell level

    try:
        importlib.import_module(package_name)
        importable = True
    except ImportError:
        importable = False

    if importable:
        installed_version = None
        if version:
            try:
                installed_version = metadata.version(package_name)
            except metadata.PackageNotFoundError:
                # Importable but no distribution metadata under this name:
                # the version cannot be compared, so the import is trusted.
                installed_version = None

        if not version or installed_version is None or installed_version == version:
            print(f"\n{package_name} is already installed.")
            return

        # A module that is already imported in the running kernel keeps its old
        # version until the kernel is restarted.
        print(f"\n{package_name} {installed_version} is installed but {version} is required. Installing now...")
    else:
        print(f"\n{package_name} is NOT installed. Installing now...")

    requirement = f"{package_name}=={version}" if version else package_name
    # sys.executable makes pip install into the interpreter running this code
    subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])
    print(f"{package_name} installation completed.")

# Packages this notebook needs, written as (name, pinned version) pairs.
# A version of None installs the package without a version specifier.
packages = [
    {"name": pkg_name, "version": pkg_version}
    for pkg_name, pkg_version in (
        ("tqdm", None),
        ("pyspark", "3.1.1"),
        ("gdown", None),
        ("numpy", "1.22.4"),
        ("xgboost", None),
        ("sparkxgb", None),
    )
]

# Check every entry and install the ones that are missing
for package in packages:
    check_and_install_package(package_name=package["name"], version=package["version"])


tqdm is already installed.

pyspark is already installed.

gdown is already installed.

numpy is already installed.

xgboost is already installed.

sparkxgb is already installed.


In [None]:
!pip install numpy==1.22.4



In [None]:
import numpy
# Show which numpy version the running kernel has imported
print(f"{numpy.__version__}")

1.22.4


In [None]:
!pip install sparkxgb



In [None]:
import os
import shutil

from google.colab import drive
drive.mount('/content/drive')

# Directory on the instance's local file system that will hold the jars
local_resources_path = "/resources"
os.makedirs(local_resources_path, exist_ok=True)

# Jar locations on the mounted Google Drive
xgboost4j_source = "/content/drive/MyDrive/Big Data Analytics - Project/resources/xgboost4j_2.12-1.7.6.jar"
xgboost4j_spark_source = "/content/drive/MyDrive/Big Data Analytics - Project/resources/xgboost4j-spark_2.12-1.7.6.jar"

# Each jar keeps its file name inside the local resources directory
xgboost4j_dest = os.path.join(local_resources_path, os.path.basename(xgboost4j_source))
xgboost4j_spark_dest = os.path.join(local_resources_path, os.path.basename(xgboost4j_spark_source))

# Copy both jars from Google Drive to the local instance
for jar_source, jar_dest in (
    (xgboost4j_source, xgboost4j_dest),
    (xgboost4j_spark_source, xgboost4j_spark_dest),
):
    shutil.copyfile(jar_source, jar_dest)

# List the local directory so the copies can be checked by eye
print(f"Jar Files copied to: {local_resources_path}")
print(os.listdir(local_resources_path))


Mounted at /content/drive
Jar Files copied to: /resources
['xgboost4j_2.12-1.7.6.jar', 'xgboost4j-spark_2.12-1.7.6.jar']


In [None]:
from pyspark.sql import SparkSession

# Comma-separated list of the jar files copied to the local instance
jar_files = ",".join([
    "/resources/xgboost4j_2.12-1.7.6.jar",
    "/resources/xgboost4j-spark_2.12-1.7.6.jar",
])

# Session settings, applied to the builder in the order listed here
spark_settings = {
    "spark.driver.memory": "200g",
    "spark.executor.memory": "200g",
    "spark.driver.maxResultSize": "50g",
    "spark.executor.memoryOverhead": "25g",
    "spark.executor.cores": "8",
    "spark.sql.broadcastTimeout": "1800",
    "spark.network.timeout": "1000s",
    "spark.kryoserializer.buffer.max": "2047m",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.minExecutors": "5",
    "spark.dynamicAllocation.maxExecutors": "20",
    "spark.sql.shuffle.partitions": "300",
    "spark.hadoop.fs.file.impl": "org.apache.hadoop.fs.LocalFileSystem",
    "spark.executor.extraJavaOptions": "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4 -XX:ParallelGCThreads=4",
    "spark.jars": jar_files,
}

session_builder = SparkSession.builder.appName("BERT_Experimental_EDA")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an existing session if one is already running in this process
spark = session_builder.getOrCreate()

print(f"Spark session started with version: {spark.version}")


Spark session started with version: 3.1.1


In [None]:
# Smoke test: check that sparkxgb imports and that an estimator can be constructed
try:
    from sparkxgb import XGBoostRegressor

    model = XGBoostRegressor()
except Exception as load_error:
    # Report the failure instead of stopping the notebook
    print(f"Error loading sparkxgb: {load_error}")
else:
    print("sparkxgb loaded successfully!")

sparkxgb loaded successfully!


In [None]:
# Copy the processed dataset from the mounted Google Drive into /content/
# (shell command executed through IPython's `!` syntax).
!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Processed_DF.parquet' /content/

# Path of the local copy; despite its name, this is the path that is read below.
output_path = '/content/Processed_DF.parquet'
# Load the local parquet copy into a Spark DataFrame
df = spark.read.parquet(output_path)

In [None]:
# Redistribute the rows over 100 partitions
df = df.repartition(numPartitions=100)

In [None]:
# Row count triggers a Spark job; the column count is read from the schema
print(
    "DataFrame has "
    f"{df.count()} rows and {len(df.columns)} columns"
)

DataFrame has 3000040 rows and 42 columns


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Pull 5 randomly ordered rows to the driver and show them with every column visible
pd.set_option('display.max_columns', None)
random_rows = df.orderBy(F.rand()).limit(5)
pandas_df = random_rows.toPandas()
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,daysonmarket,dealer_zip,description,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listed_date,listing_color,longitude,major_options,make_name,maximum_seating,mileage,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,year,combined_fuel_economy,legroom,log_mileage,major_options_count
0,Gasoline,SUV / Crossover,Harlingen,31.0,21,78552,Super Black 2020 Nissan Kicks S FWD CVT with X...,1600.0,I4,super black,True,10.8,62.4,36.0,122.0,black (charcoal),True,26.187201,169.1,2020-08-22,BLACK,-97.737396,"['Steel Wheels', 'Blind Spot Monitoring', 'Par...",Nissan,5.0,14.0,Kicks,18595.0,0,3.5,Charlie Clark Nissan,114.0,CVT,Continuously Variable Transmission,Front-Wheel Drive,103.1,69.3,2020,33.5,76.9,2.64,3
1,Gasoline,Pickup Truck,New Smyrna Beach,15.0,33,32168,"Vehicle Runs Well, Smooth ride, Regularly main...",3500.0,V6,black,False,36.0,76.7,21.0,365.0,no color,False,29.047899,231.9,2020-08-08,BLACK,-80.936096,"['Leather Seats', 'Sunroof/Moonroof', 'Navigat...",Ford,6.0,80101.0,F-150,27000.0,0,4.553192,Lumin Auto Group,420.0,A,6-Speed Automatic,Four-Wheel Drive,144.5,97.0,2014,18.0,84.9,11.29,6
2,Gasoline,Pickup Truck,Denver,16.0,64,80222,2020 Ford F-150 King Ranch EcoBoost 3.5L V6 GT...,3500.0,V6,star white metallic tri-coat,True,26.0,77.3,22.0,375.0,white (java),True,39.678501,243.7,2020-07-10,WHITE,-104.935997,"['Navigation System', 'Alloy Wheels', 'Technol...",Ford,5.0,9.0,F-150,59784.0,0,4.384615,Freeway Ford,470.0,A,Automatic,Four-Wheel Drive,156.8,96.8,2020,19.0,87.5,2.2,12
3,Gasoline,SUV / Crossover,Miami,21.0,88,33126,This 2017 Land Rover Range Rover Evoque 4dr SE...,2000.0,I4,fuji white,False,18.1,64.4,29.0,240.0,brown (beige),False,25.778999,172.1,2020-06-14,WHITE,-80.263702,"['Leather Seats', 'Navigation System', 'Alloy ...",Land Rover,5.0,33306.0,Range Rover Evoque,24691.0,4236,2.398601,Car Factory Outlet Miami,250.0,A,9-Speed Automatic,All-Wheel Drive,104.8,82.3,2017,25.0,76.5,10.41,8
4,Gasoline,Sedan,New Castle,22.690001,23,19720,Unknown,1400.0,I4,black,True,13.7,57.4,29.469999,153.0,none,False,39.675499,183.7,2020-08-18,BLACK,-75.594498,"['Navigation System', 'Alloy Wheels', 'Bluetoo...",Chevrolet,5.0,45696.0,Cruze,14993.0,274,4.571429,NuCar Chevrolet Mazda,265.22,A,Automatic,Front-Wheel Drive,106.3,70.5,2017,26.08,78.1,10.73,4


In [None]:
# Number of distinct raw values in each color column (exterior first, then interior)
exterior_colors_count, interior_colors_count = (
    df.select(color_column).distinct().count()
    for color_column in ('exterior_color', 'interior_color')
)

print(f"Unique exterior colors: {exterior_colors_count}")
print(f"Unique interior colors: {interior_colors_count}")

Unique exterior colors: 23036
Unique interior colors: 38528


In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

# Base color names searched for in the exterior and interior color strings
colors = ['White', 'Black', 'Gray', 'Silver', 'Red', 'Blue', 'Brown', 'Green', 'Beige', 'Orange', 'Gold', 'Yellow', 'Purple']

# UDF that maps a free-text color description to the base colors it mentions
@F.udf(returnType=ArrayType(StringType()))
def find_colors(color_string):
    """Return the base colors whose name occurs in ``color_string``.

    Matching is a case-insensitive substring test against ``colors``.
    ``None``, blank strings and strings without any match yield ``["Other"]``.
    """
    if color_string is None or not color_string.strip():
        return ["Other"]

    lowered = color_string.lower()
    matches = [name for name in colors if name.lower() in lowered]
    return matches or ["Other"]

# Normalize the exterior and then the interior color column with the same steps:
#   1. find the base colors mentioned in the raw string (temporary array column),
#   2. count them (temporary count column),
#   3. overwrite the color column with "Mixed Colors" when more than one base color
#      was found, otherwise with the array joined by ", ",
#   4. drop the two temporary columns.
for side in ("exterior", "interior"):
    color_column = f"{side}_color"
    array_column = f"{side}_color_array"
    count_column = f"{side}_color_count"

    df = (
        df.withColumn(array_column, find_colors(color_column))
          .withColumn(count_column, F.size(array_column))
          .withColumn(
              color_column,
              F.when(F.col(count_column) > 1, "Mixed Colors")
               .otherwise(F.array_join(array_column, ", ")),
          )
          .drop(array_column, count_column)
    )


In [None]:
# Count the rows once and reuse the result: every df.count() call launches
# its own Spark job over the full DataFrame.
total_rows = df.count()

# Occurrences of each exterior / interior color and their share of all rows (in %)
exterior_color_counts = df.groupBy("exterior_color").count().withColumn(
    "percentage", F.round((F.col("count") / total_rows) * 100, 2))

interior_color_counts = df.groupBy("interior_color").count().withColumn(
    "percentage", F.round((F.col("count") / total_rows) * 100, 2))

# Showing the results, most frequent color first
print("Exterior Color Distribution:")
exterior_color_counts.orderBy(F.desc("count")).show(truncate=False)

Exterior Color Distribution:
+--------------+------+----------+
|exterior_color|count |percentage|
+--------------+------+----------+
|White         |675979|22.53     |
|Black         |580148|19.34     |
|Other         |543638|18.12     |
|Silver        |384540|12.82     |
|Blue          |253263|8.44      |
|Red           |242331|8.08      |
|Gray          |231172|7.71      |
|Green         |23026 |0.77      |
|Mixed Colors  |19728 |0.66      |
|Brown         |12905 |0.43      |
|Orange        |11638 |0.39      |
|Gold          |10544 |0.35      |
|Beige         |5065  |0.17      |
|Yellow        |4855  |0.16      |
|Purple        |1208  |0.04      |
+--------------+------+----------+



In [None]:
# Interior color distribution, most frequent color first
print("Interior Color Distribution:")
interior_color_counts.orderBy(F.col("count").desc()).show(truncate=False)

Interior Color Distribution:
+--------------+-------+----------+
|interior_color|count  |percentage|
+--------------+-------+----------+
|Black         |1624033|54.13     |
|Other         |577578 |19.25     |
|Gray          |383966 |12.8      |
|Mixed Colors  |171212 |5.71      |
|White         |91545  |3.05      |
|Brown         |65943  |2.2       |
|Red           |34117  |1.14      |
|Silver        |24124  |0.8       |
|Blue          |22828  |0.76      |
|Green         |2048   |0.07      |
|Gold          |1193   |0.04      |
|Orange        |1133   |0.04      |
|Yellow        |134    |0.0       |
|Purple        |121    |0.0       |
|Beige         |65     |0.0       |
+--------------+-------+----------+



In [None]:
# Mean of the 'price' column over the whole DataFrame
avg_price = df.agg(F.avg("price")).first()[0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [None]:
# Remove the 'major_options' and 'mileage' columns, then discard rows containing any null
df = df.drop('major_options', 'mileage').dropna()
print(
    "DataFrame has "
    f"{df.count()} rows and {len(df.columns)} columns"
)

DataFrame has 3000040 rows and 40 columns




---



## **XGB Regressor**



### **df_sample_without_description**

In [None]:
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean as sql_mean
import pyspark.sql.functions as F

# Ignore warnings
warnings.filterwarnings('ignore')

print("Processing the data...")
with tqdm(total=6, desc="Progress") as pbar:

    # Step 1: seeded random sample (fraction 0.033 of the rows) without the
    # free-text 'description' column.
    df_sample = df.sample(fraction=0.033, seed=42)
    df_sample = df_sample.drop('description')
    pbar.update(1)

    # Step 2: mean-impute numeric columns BEFORE the feature vector is built,
    # so the imputed values are what ends up in 'features'.
    # Column types are matched on the names reported by DataFrame.dtypes
    # ("int" / "bigint" / "float" / "double"); DataType.typeName() reports
    # IntegerType as "integer", so a check against "int" there skips int columns.
    # This also covers the label column 'price' when it is numeric.
    numeric_type_names = {"double", "float", "int", "bigint"}
    numeric_columns = [name for name, dtype in df_sample.dtypes if dtype in numeric_type_names]
    if numeric_columns:
        # One aggregation for all means instead of one Spark job per column
        mean_row = df_sample.agg(
            *[sql_mean(F.col(name)).alias(name) for name in numeric_columns]
        ).first()
        # A column that is entirely null has no mean; it is left untouched
        mean_values = {
            name: mean_row[name] for name in numeric_columns if mean_row[name] is not None
        }
        if mean_values:
            df_sample = df_sample.na.fill(mean_values)
    pbar.update(1)

    # Step 3: index and one-hot encode every string column.
    # handleInvalid="keep" gives unseen/invalid labels their own index.
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Step 4: assemble all non-string columns except the label, plus the
    # one-hot vectors, into a single 'features' vector.
    num_columns = [name for name in df_sample.columns if name != 'price' and name not in cat_columns]
    encoded_columns = [f"{name}_encoded" for name in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Step 5: standardize the assembled vector, then fit and apply the pipeline.
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # NOTE(review): the pipeline is fitted on the whole sample, i.e. before the
    # train/test split below, so the indexers and the scaler also see the rows
    # that end up in test_df.
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Step 6: seeded 80/20 train/test split
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

print("\n\nData preprocessing and splitting completed!")

print(f"\nTrain_DF has {train_df.count()} rows and {len(train_df.columns)} columns")

Processing the data...


Progress: 100%|██████████| 6/6 [02:56<00:00, 29.36s/it]




Data preprocessing and splitting completed!

Train_DF has 79427 rows and 71 columns


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from sparkxgb import XGBoostRegressor
import time

# Fit the XGBoost regressor on the scaled feature vectors
print("Training XGBoost model...")

xgb_params = dict(
    featuresCol="scaled_features",  # scaled feature vector produced by the pipeline
    labelCol="price",               # regression target
    maxDepth=6,
    numRound=100,
    objective="reg:squarederror",   # squared-error regression objective
    treeMethod="hist",
)
xgb_regressor = XGBoostRegressor(**xgb_params)

# Timer starts right before fitting; the metrics cell reads start_time again
# to report the overall runtime.
start_time = time.time()
model = xgb_regressor.fit(train_df)

# Score the held-out rows
print("Making predictions...")
predictions = model.transform(test_df)

# R-squared of the predictions against the true price
print("Evaluating the model...")
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)

print(f"\nTrain size: {train_df.count()} samples")
print(f"Test size: {test_df.count()} samples")
print(f"\n\nR-Squared Score (Accuracy): {round(r2 * 100)}%\n")

Training XGBoost model...
Making predictions...
Evaluating the model...

Train size: 79427 samples
Test size: 19942 samples


R-Squared Score (Accuracy): 86%



In [None]:
# One evaluator per additional error metric, all comparing 'prediction' with 'price'
mae_evaluator, mse_evaluator, rmse_evaluator = (
    RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName=metric_name)
    for metric_name in ("mae", "mse", "rmse")
)

# Evaluate in the same order: MAE, MSE, RMSE
mae, mse, rmse = (
    metric_evaluator.evaluate(predictions)
    for metric_evaluator in (mae_evaluator, mse_evaluator, rmse_evaluator)
)

print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

# Minutes elapsed since start_time was taken in the training cell
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")

Additional Metrics:
Mean Absolute Error: 2994
Mean Squared Error: 48370628
Root Mean Squared Error: 6955

Overall runtime: 16 minutes.


## **With BERT**
#### **df_sample_with_description -> Trained on BERT**

In [None]:
# Disabled DistilBERT variant of the embedding step. The whole cell body is a
# single string literal, so none of the code inside it is executed.
'''
# Login to Hugging Face
from huggingface_hub import login
# login("<-Key here->")

# Load necessary libraries for BERT embedding generation
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from pyspark.sql.functions import col, udf
from pyspark.ml.linalg import Vectors, VectorUDT
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# Check if CUDA (GPU) is available for BERT processing
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained DistilBERT tokenizer and model from HuggingFace
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Define a function to get DistilBERT embeddings (CLS token equivalent)
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return Vectors.dense(cls_embedding)

# Create a PySpark UDF for generating DistilBERT embeddings
bert_embeddings_udf = udf(get_bert_embeddings, VectorUDT())

# Apply DistilBERT embeddings to the 'description' column and add it as the last column
df_with_bert = df_sample_with_description.select("*", bert_embeddings_udf(col("description")).alias("bert_embeddings"))

# Drop the original 'description' column as it's no longer needed
df_with_bert = df_with_bert.drop("description")

# Verify the new DataFrame
df_with_bert.show(5)
'''

In [None]:
# Same sampling call (fraction and seed) as the model without descriptions
df_sample = df.sample(fraction=0.033, seed=42)

# Clean-up steps applied to the lowercased 'description', in this order, as
# (regex pattern, replacement). Raw strings hand the backslashes to the regex
# engine unchanged; in a normal string "\w" and "\s" are invalid Python escape
# sequences that only work because Python currently passes them through with
# a warning.
description_cleanup_steps = [
    (r"[^\w\s]", ""),        # remove punctuation
    (r"\s+", " "),           # collapse runs of whitespace into one space
    (r"[^\x00-\x7F]+", ""),  # remove non-ASCII characters (optional)
]

cleaned_description = F.lower(F.col("description"))
for pattern, replacement in description_cleanup_steps:
    cleaned_description = F.regexp_replace(cleaned_description, pattern, replacement)

# Creating df_sample_with_description with the cleaned text in place of the original
df_sample_with_description = df_sample.withColumn("description", cleaned_description)

# Verifying the processed DataFrame on 3 rows with every column visible
pd.set_option('display.max_columns', None)
df_sample_with_description_pandas = df_sample_with_description.limit(3).toPandas()
display(df_sample_with_description_pandas)

Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,daysonmarket,dealer_zip,description,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listed_date,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,year,combined_fuel_economy,legroom,log_mileage,major_options_count
0,Gasoline,Coupe,Dallas,16.0,5,75235,check out this 2020 youll appreciate its safet...,5000.0,V8,White,True,16.0,54.3,25.0,460.0,Black,True,32.828899,188.5,2020-09-06,WHITE,-96.828903,Ford,4.0,Mustang,38690.0,0,4.5625,Park Cities Ford Lincoln,420.0,A,Automatic,Rear-Wheel Drive,107.1,81.9,2020,20.5,74.1,8.91,1
1,Gasoline,Sedan,Winston Salem,23.0,13,27103,discounts price includes cadillac bonus cash ...,2000.0,I4,White,True,17.4,57.2,32.0,237.0,Other,True,36.076302,193.8,2020-08-28,WHITE,-80.302803,Cadillac,5.0,CT5,50890.0,0,5.0,Flow Cadillac of Winston Salem,405.0,A,Automatic,Rear-Wheel Drive,116.0,80.3,2020,27.5,80.3,0.0,6
2,Gasoline,SUV / Crossover,Los Angeles,18.0,14,90007,additional info36l v6 transmission 9speed auto...,3600.0,V6,Other,True,19.4,70.7,27.0,310.0,Black,True,34.022701,204.3,2020-08-29,UNKNOWN,-118.278999,Chevrolet,7.0,Traverse,45120.0,0,5.0,Felix Chevrolet,266.0,A,Automatic,Front-Wheel Drive,120.9,78.6,2020,22.5,79.4,1.79,4


In [None]:
# Loading necessary libraries for BERT embedding generation
import torch
from transformers import BertTokenizer, BertModel
from pyspark.sql.functions import col, udf
from pyspark.ml.linalg import Vectors, VectorUDT
import warnings

# Logging to Hugging Face
from huggingface_hub import login
# login("<-Key here->")

# Suppressing warnings
warnings.filterwarnings("ignore")

# Run BERT on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and encoder are loaded from the same pre-trained checkpoint name
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint).to(device)

# Embed one text with BERT and return the hidden state of its first token
def get_bert_embeddings(text):
    """Encode ``text`` with BERT and return its first-token hidden state.

    The text is tokenized to a length of 512 (truncated or padded), moved to
    ``device``, and run through ``model`` without gradient tracking. The
    last-layer hidden state at position 0 is returned as a dense Spark ML vector.
    """
    encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 of the sequence; squeeze() drops the singleton dimensions
    first_token = hidden_states[:, 0, :].squeeze()
    return Vectors.dense(first_token.cpu().numpy())

# Wrap the embedding function as a Spark UDF that returns an ML vector
bert_embeddings_udf = udf(get_bert_embeddings, VectorUDT())

# Append the embedding as the last column, then drop the text it is computed from
df_with_bert = (
    df_sample_with_description
    .withColumn("bert_embeddings", bert_embeddings_udf(col("description")))
    .drop("description")
)

# Preview 3 rows of the new DataFrame with every column visible
print("\n")
pd.set_option('display.max_columns', None)
df_with_bert_pandas = df_with_bert.limit(3).toPandas()
display(df_with_bert_pandas)



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]





Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,daysonmarket,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listed_date,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,year,combined_fuel_economy,legroom,log_mileage,major_options_count,bert_embeddings
0,Gasoline,Coupe,Dallas,16.0,5,75235,5000.0,V8,White,True,16.0,54.3,25.0,460.0,Black,True,32.828899,188.5,2020-09-06,WHITE,-96.828903,Ford,4.0,Mustang,38690.0,0,4.5625,Park Cities Ford Lincoln,420.0,A,Automatic,Rear-Wheel Drive,107.1,81.9,2020,20.5,74.1,8.91,1,"[-0.7845814228057861, 0.13907955586910248, 0.2..."
1,Gasoline,Sedan,Winston Salem,23.0,13,27103,2000.0,I4,White,True,17.4,57.2,32.0,237.0,Other,True,36.076302,193.8,2020-08-28,WHITE,-80.302803,Cadillac,5.0,CT5,50890.0,0,5.0,Flow Cadillac of Winston Salem,405.0,A,Automatic,Rear-Wheel Drive,116.0,80.3,2020,27.5,80.3,0.0,6,"[-0.737446665763855, 0.24113748967647552, 0.19..."
2,Gasoline,SUV / Crossover,Los Angeles,18.0,14,90007,3600.0,V6,Other,True,19.4,70.7,27.0,310.0,Black,True,34.022701,204.3,2020-08-29,UNKNOWN,-118.278999,Chevrolet,7.0,Traverse,45120.0,0,5.0,Felix Chevrolet,266.0,A,Automatic,Front-Wheel Drive,120.9,78.6,2020,22.5,79.4,1.79,4,"[-0.3749138116836548, 0.17043828964233398, 0.4..."


In [None]:
# Row count triggers a Spark job; the column count is read from the schema
print(
    "The new 'df_with_bert' has "
    f"{df_with_bert.count()} rows and {len(df_with_bert.columns)} columns"
)

The new 'df_with_bert' has 99369 rows and 40 columns


In [None]:
# Continue under the name df_sample, which the preprocessing cell below reads
df_sample = df_with_bert
print(
    "The 'df_sample' also has same : "
    f"{df_sample.count()} rows and {len(df_sample.columns)} columns"
)
# Mark the frame for caching, then redistribute it over 100 partitions
df_sample = df_sample.cache().repartition(100)

The 'df_sample' also has same : 99369 rows and 40 columns




---



In [None]:
import warnings
import time
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean as sql_mean
import pyspark.sql.functions as F

# Ignore warnings
warnings.filterwarnings('ignore')

# Start the timer for the preprocessing steps in this cell
start_time = time.time()

print("Processing the data...")
with tqdm(total=5, desc="Progress") as pbar:

    # Step 1: mean-impute numeric columns BEFORE the feature vector is built,
    # so the imputed values are what ends up in 'features'.
    # Column types are matched on the names reported by DataFrame.dtypes
    # ("int" / "bigint" / "float" / "double"); DataType.typeName() reports
    # IntegerType as "integer", so a check against "int" there skips int columns.
    # This also covers the label column 'price' when it is numeric.
    numeric_type_names = {"double", "float", "int", "bigint"}
    numerical_columns = [name for name, dtype in df_sample.dtypes if dtype in numeric_type_names]
    if numerical_columns:
        # One aggregation for all means instead of one Spark job per column
        mean_row = df_sample.agg(
            *[sql_mean(F.col(name)).alias(name) for name in numerical_columns]
        ).first()
        # A column that is entirely null has no mean; it is left untouched
        mean_values = {
            name: mean_row[name] for name in numerical_columns if mean_row[name] is not None
        }
        if mean_values:
            df_sample = df_sample.na.fill(mean_values)
    pbar.update(1)

    # Step 2: index and one-hot encode every string column.
    # handleInvalid="keep" gives unseen/invalid labels their own index.
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Step 3: assemble the non-string columns (without the label and the
    # embedding), the one-hot vectors and finally the BERT embeddings.
    num_columns = [
        name for name in df_sample.columns
        if name != 'price' and name not in cat_columns and name != 'bert_embeddings'
    ]
    encoded_columns = [f"{name}_encoded" for name in cat_columns]
    feature_columns = num_columns + encoded_columns + ['bert_embeddings']  # Include the BERT embeddings
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Step 4: standardize the assembled vector, then fit and apply the pipeline.
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # NOTE(review): the pipeline is fitted on the whole sample, i.e. before the
    # train/test split below, so the indexers and the scaler also see the rows
    # that end up in test_df.
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df_sample)
    df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

    # Step 5: seeded 80/20 train/test split
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

print("\nData preprocessing and splitting completed!")

# Minutes spent in the preprocessing steps above
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")
print(f"\n\nTrain_DF has {train_df.count()} rows and {len(train_df.columns)} columns")

Processing the data...
Progress: 100%|██████████| 5/5 [2:37:47<00:00, 1893.48s/it]

Data preprocessing and splitting completed!

Overall runtime: 150 minutes.


Train_DF has 79477 rows and 72 columns


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Preview: lift the pandas column-display limit, then bring three randomly
# ordered rows of the Spark DataFrame to the driver as a pandas frame.
# NOTE(review): `df` is not assigned in this cell — confirm it is the frame you mean to preview.
pd.set_option('display.max_columns', None)
pandas_df = (
    df
    .orderBy(F.rand())
    .limit(3)
    .toPandas()
)
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,daysonmarket,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listed_date,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,year,combined_fuel_economy,legroom,log_mileage,major_options_count,bert_embeddings,fuel_type_indexed,fuel_type_encoded,body_type_indexed,body_type_encoded,city_indexed,city_encoded,dealer_zip_indexed,dealer_zip_encoded,engine_type_indexed,engine_type_encoded,exterior_color_indexed,exterior_color_encoded,interior_color_indexed,interior_color_encoded,listed_date_indexed,listed_date_encoded,listing_color_indexed,listing_color_encoded,make_name_indexed,make_name_encoded,model_name_indexed,model_name_encoded,sp_name_indexed,sp_name_encoded,transmission_indexed,transmission_encoded,transmission_display_indexed,transmission_display_encoded,wheel_system_display_indexed,wheel_system_display_encoded,features,scaled_features
0,Gasoline,Sedan,Lexington,22.690001,208,27295,2000.0,I4,White,False,12.0,53.3,29.469999,110.0,Gray,False,35.946098,174.7,2020-02-15,WHITE,-80.310699,Ford,5.0,Escort,3950.0,0,4.875,Creekside Automotive,125.0,A,Automatic,Front-Wheel Drive,98.4,67.0,1997,26.08,76.5,11.29,0,"[-0.3094957172870636, -0.03846015781164169, 0....",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",38.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1075.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",128.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",634.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",12513.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(22.690000534057617, 208.0, 2000.0, 0.0, 12.0,...","[0.0013759557325469349, 1.213406154648635, -0...."
1,Gasoline,Pickup Truck,Pell City,16.0,13,35125,3500.0,V6,White,True,26.0,77.2,22.0,375.0,Black,True,33.629601,231.9,2020-08-29,WHITE,-86.279297,Ford,6.0,F-150,55600.0,0,4.416667,Town & Country Ford Pell City,400.0,A,Automatic,Four-Wheel Drive,145.0,96.8,2020,19.0,87.5,8.91,0,"[-0.44042280316352844, 0.12084095180034637, -0...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1382.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1855.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3497.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(16.0, 13.0, 3500.0, 1.0, 26.0, 77.2, 22.0, 37...","[-0.8336560110820449, -0.5800079669950434, 0.4..."
2,Gasoline,Sedan,Westfield,18.0,38,1085,3000.0,V6,Other,False,18.0,56.1,25.0,155.0,Other,False,42.162701,197.6,2020-08-02,RED,-72.7714,Ford,6.0,Taurus,3000.0,0,5.0,Deals 4 Wheels Inc,185.0,A,Automatic,Front-Wheel Drive,108.5,73.0,2002,21.5,81.1,11.18,1,"[-0.5281291604042053, 0.03638622537255287, 0.3...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",933.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3540.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",39.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",137.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",15967.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(18.0, 38.0, 3000.0, 0.0, 18.0, 56.1, 25.0, 15...","[-0.5840201664917221, -0.3500830796048282, 0.0..."


In [None]:
# Setting spark.sql.broadcastTimeout to 600 for larger datasets
spark.conf.set("spark.sql.broadcastTimeout", "600")

# Destination folders in Google Drive for the two Parquet datasets
output_path_train = '/content/drive/MyDrive/Big Data Analytics - Project/Experimental EDA/Train_df'
output_path_test = '/content/drive/MyDrive/Big Data Analytics - Project/Experimental EDA/Test_df'

# One partition each, so every dataset is written as a single part file
train_df_partition = train_df.coalesce(1)
test_df_partition = test_df.coalesce(1)

# Train split: replace whatever is already stored at the destination
train_writer = train_df_partition.write.format("parquet").mode("overwrite")
train_writer.save(output_path_train)

print("Train DataFrame has been saved as a single Parquet file in Google Drive.")

# Test split: same write settings
test_writer = test_df_partition.write.format("parquet").mode("overwrite")
test_writer.save(output_path_test)

print("Test DataFrame has been saved as a single Parquet file in Google Drive.")

Train DataFrame has been saved as a single Parquet file in Google Drive.
Test DataFrame has been saved as a single Parquet file in Google Drive.


In [None]:
# Read back what was just written so it can be compared with the in-memory splits
loaded_train_df = spark.read.format("parquet").load(output_path_train)
loaded_test_df = spark.read.format("parquet").load(output_path_test)

# Row differences in both directions (in-memory minus loaded, and loaded minus in-memory)
train_diff_1 = train_df.subtract(loaded_train_df)
train_diff_2 = loaded_train_df.subtract(train_df)

test_diff_1 = test_df.subtract(loaded_test_df)
test_diff_2 = loaded_test_df.subtract(test_df)

# Train: any leftover row in either direction means the round trip changed something
if train_diff_1.count() != 0 or train_diff_2.count() != 0:
    print("There are differences in the Train DataFrame rows.")
else:
    print("Every row in the Train DataFrame is identical to the loaded Train DataFrame.")

# Test: same check
if test_diff_1.count() != 0 or test_diff_2.count() != 0:
    print("There are differences in the Test DataFrame rows.")
else:
    print("Every row in the Test DataFrame is identical to the loaded Test DataFrame.")

Every row in the Train DataFrame is identical to the loaded Train DataFrame.
Every row in the Test DataFrame is identical to the loaded Test DataFrame.




---



In [None]:
# Google Drive folders that hold the previously saved Parquet datasets
output_path_train = '/content/drive/MyDrive/Big Data Analytics - Project/Experimental EDA/Train_df'
output_path_test = '/content/drive/MyDrive/Big Data Analytics - Project/Experimental EDA/Test_df'

# Rebind train_df / test_df to the stored copies
train_df = spark.read.format("parquet").load(output_path_train)
test_df = spark.read.format("parquet").load(output_path_test)

print("Loaded train_df and test_df from the saved Parquet files.\n")

# Sanity check: report the shape of each loaded split
train_rows = train_df.count()
train_cols = len(train_df.columns)
print(f"Loaded Train DataFrame has {train_rows} rows and {train_cols} columns.")
test_rows = test_df.count()
test_cols = len(test_df.columns)
print(f"Loaded Test DataFrame has {test_rows} rows and {test_cols} columns.")


Loaded train_df and test_df from the saved Parquet files.

Loaded Train DataFrame has 79477 rows and 72 columns.
Loaded Test DataFrame has 19892 rows and 72 columns.


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Preview: lift the pandas column-display limit, then bring three randomly
# ordered rows of the Spark DataFrame to the driver as a pandas frame.
# NOTE(review): `df` is not assigned in this cell — confirm it is the frame you mean to preview.
pd.set_option('display.max_columns', None)
pandas_df = (
    df
    .orderBy(F.rand())
    .limit(3)
    .toPandas()
)
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,daysonmarket,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listed_date,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,year,combined_fuel_economy,legroom,log_mileage,major_options_count,bert_embeddings,fuel_type_indexed,fuel_type_encoded,body_type_indexed,body_type_encoded,city_indexed,city_encoded,dealer_zip_indexed,dealer_zip_encoded,engine_type_indexed,engine_type_encoded,exterior_color_indexed,exterior_color_encoded,interior_color_indexed,interior_color_encoded,listed_date_indexed,listed_date_encoded,listing_color_indexed,listing_color_encoded,make_name_indexed,make_name_encoded,model_name_indexed,model_name_encoded,sp_name_indexed,sp_name_encoded,transmission_indexed,transmission_encoded,transmission_display_indexed,transmission_display_encoded,wheel_system_display_indexed,wheel_system_display_encoded,features,scaled_features
0,Gasoline,SUV / Crossover,Hamilton,26.0,294,13346,1400.0,I4,Black,True,14.0,65.3,32.0,138.0,Black,False,42.8451,168.4,2019-11-21,BLACK,-75.558403,Buick,5.0,Encore,19490.0,982,4.0,Den Kelly Chevrolet Buick GMC,148.0,A,Automatic,All-Wheel Drive,100.6,69.9,2016,29.0,76.6,9.73,9,"[-0.4401452839374542, 0.33145177364349365, 0.3...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",595.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3305.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",225.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",13.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",51.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10154.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(26.0, 294.0, 1400.0, 1.0, 14.0, 65.3, 32.0, 1...","[0.4145232118695689, 2.0043477672709753, -1.17..."
1,Gasoline,Sedan,Picayune,28.0,91,39466,2000.0,I4,Gray,True,14.0,56.5,37.0,147.0,Black,False,30.510099,179.9,2020-06-12,GRAY,-89.665497,Hyundai,5.0,Elantra,16790.0,446,3.928571,Nissan of Picayune,132.0,A,6-Speed Automatic,Front-Wheel Drive,106.3,70.9,2017,32.5,77.9,10.15,11,"[-0.44044071435928345, 0.2960529327392578, 0.4...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1437.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1980.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",80.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",6.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",27.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3359.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0)",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(28.0, 91.0, 2000.0, 1.0, 14.0, 56.5, 37.0, 14...","[0.6641590564598917, 0.13735768166242798, -0.7..."
2,Gasoline,SUV / Crossover,North Charleston,15.0,68,29418,3600.0,V6,White,False,22.0,69.9,22.0,281.0,Other,False,32.887001,203.7,2020-07-04,WHITE,-80.067703,Chevrolet,8.0,Traverse,13900.0,2093,4.75,Burke Auto LLC,266.0,A,6-Speed Automatic,Front-Wheel Drive,118.9,78.5,2016,18.5,78.1,11.67,6,"[-0.558530330657959, -0.2980552613735199, 0.20...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",617.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1844.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",59.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",25.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",15427.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0)",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(15.0, 68.0, 3600.0, 0.0, 22.0, 69.9, 22.0, 28...","[-0.9584739333772062, -0.07417321473656999, 0...."


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from sparkxgb import XGBoostRegressor
import time

# Model training
print("Training XGBoost model...")

xgb_regressor = XGBoostRegressor(
    featuresCol="scaled_features",  # Use scaled features including BERT embeddings
    labelCol="price",               # Target column
    maxDepth=6,
    numRound=100,
    objective="reg:squarederror",   # Regression task
    treeMethod="hist",
)

# Start the timer right before fitting
start_time = time.time()

# Training the model
model = xgb_regressor.fit(train_df)

# Making predictions
print("Making predictions...")
# transform() is lazy: without caching, every evaluator call on `predictions`
# (R² here and any further metrics computed on it) re-runs the whole scoring plan.
predictions = model.transform(test_df).cache()

# Evaluating the model
print("Evaluating the model...")
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)

print(f"\nTrain size: {train_df.count()} samples")
print(f"Test size: {test_df.count()} samples")
print(f"\n\nR-Squared Score (Accuracy): {round(r2 * 100)}%\n")


Training XGBoost model...
Making predictions...
Evaluating the model...

Train size: 79477 samples
Test size: 19892 samples


R-Squared Score (Accuracy): 87%



In [None]:
# Additional error metrics on the same predictions: one evaluator per metric,
# all comparing the `price` label with the `prediction` column.
shared_columns = {"labelCol": "price", "predictionCol": "prediction"}
mae_evaluator = RegressionEvaluator(metricName="mae", **shared_columns)
mse_evaluator = RegressionEvaluator(metricName="mse", **shared_columns)
rmse_evaluator = RegressionEvaluator(metricName="rmse", **shared_columns)

mae = mae_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)
rmse = rmse_evaluator.evaluate(predictions)

print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

# Minutes elapsed since `start_time` was set
end_time = time.time()
elapsed_seconds = end_time - start_time
total_runtime = elapsed_seconds / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")

Additional Metrics:
Mean Absolute Error: 3007
Mean Squared Error: 47503276
Root Mean Squared Error: 6892

Overall runtime: 33 minutes.




---



# **Decision Trees Implementation**

### **df_sample_without_description**

In [None]:
# Dropping the 'description' column (this rebinds `df`; no splitting happens in this cell)
df = df.drop('description')
remaining_rows, remaining_columns = df.count(), len(df.columns)
print(f"After 'description' is removed, the DataFrame has {remaining_rows} rows and {remaining_columns} columns")

After 'description' is removed, the DataFrame has 3000040 rows and 39 columns


In [None]:
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean as sql_mean
import pyspark.sql.functions as F

# Ignore warnings
warnings.filterwarnings('ignore')

# Preprocessing for the model without the text description:
# sample -> split -> fit the feature pipeline on the TRAIN split only -> transform both splits.
# Fitting on the train split keeps the test rows out of the fitted string indexers
# and out of the mean/std used by the scaler (no train/test leakage).
print("Processing the data...")
with tqdm(total=5, desc="Progress") as pbar:

    # Sample 3.3% of the rows (about 100k records) and split before anything is fitted
    df_sample = df.sample(fraction=0.033, seed=42)
    train_raw, test_raw = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    # Handling categorical columns: index each string column, then one-hot encode the index.
    # handleInvalid="keep" lets the fitted indexer accept labels it did not see while fitting.
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Assembling features: every non-string column except the target, plus the encoded categoricals
    num_columns = [col for col in df_sample.columns if col != 'price' and col not in cat_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating the pipeline and fitting it on the training rows only
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(train_raw)
    pbar.update(1)

    # Applying the fitted pipeline to both splits
    train_df = pipeline_model.transform(train_raw)
    test_df = pipeline_model.transform(test_raw)
    # df_sample keeps its previous meaning (the transformed sample) for any cell that reads it
    df_sample = pipeline_model.transform(df_sample)
    pbar.update(1)

print("\n\nData preprocessing and splitting completed!")


Processing the data...
Progress: 100%|██████████| 5/5 [00:58<00:00, 11.69s/it]

Data preprocessing and splitting completed!


In [None]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import time

# Model training
print("Training GBTRegressor model...")

# Gradient-boosted trees on the scaled feature vector, with a fixed seed
gbt_regressor = GBTRegressor(
    featuresCol="scaled_features",
    labelCol="price",
    maxIter=100,
    maxDepth=5,
    seed=42,
)

# Training the model
model = gbt_regressor.fit(train_df)

# Making predictions
print("Making predictions...")
# transform() is lazy: without caching, every evaluator call on `predictions`
# (R² here and any further metrics computed on it) re-runs the whole scoring plan.
predictions = model.transform(test_df).cache()

# Evaluating the model
print("Evaluating the model...")
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)

print(f"\nTrain size: {train_df.count():,} samples")
print(f"Test size: {test_df.count():,} samples")

print(f"\n\nR-Squared Score (Accuracy): {round(r2 * 100)}%\n")

Training GBTRegressor model...
Making predictions...
Evaluating the model...

Train size: 79,427 samples
Test size: 19,942 samples

R-Squared Score (Accuracy): 79%


In [None]:
# Additional error metrics on the same predictions: one evaluator per metric,
# all comparing the `price` label with the `prediction` column.
shared_columns = {"labelCol": "price", "predictionCol": "prediction"}
mae_evaluator = RegressionEvaluator(metricName="mae", **shared_columns)
mse_evaluator = RegressionEvaluator(metricName="mse", **shared_columns)
rmse_evaluator = RegressionEvaluator(metricName="rmse", **shared_columns)

mae = mae_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)
rmse = rmse_evaluator.evaluate(predictions)

print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")


Additional Metrics:
Mean Absolute Error: 3471
Mean Squared Error: 73205957
Root Mean Squared Error: 8556


## **With BERT**
#### **df_sample_with_description -> Trained on BERT**

In [None]:
%pip install transformers torch



In [None]:
# Sample 3.3% of the rows (about 100k records)
# NOTE(review): this cell needs `df` to still contain 'description'; an earlier cell
# rebinds `df` without that column — confirm `df` was reloaded before running this.
df_sample = df.sample(fraction=0.033, seed=42)

# Normalise the free-text description in one projection. The patterns are raw strings
# so the backslashes reach the regex engine untouched and Python does not warn about
# invalid escape sequences such as "\w" and "\s".
cleaned_description = F.lower(F.col("description"))                                 # lowercase
cleaned_description = F.regexp_replace(cleaned_description, r"[^\w\s]", "")         # remove punctuation
cleaned_description = F.regexp_replace(cleaned_description, r"\s+", " ")            # collapse whitespace runs
cleaned_description = F.regexp_replace(cleaned_description, r"[^\x00-\x7F]+", "")   # remove non-ASCII characters

df_sample = df_sample.withColumn("description", cleaned_description)

In [None]:
from huggingface_hub import login

# Pasting my token here
# login("<-Key here->")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from pyspark.sql.functions import col, udf
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import warnings
import time

# Silence library warnings for the rest of the session
warnings.filterwarnings("ignore")

# Run DistilBERT on the GPU when CUDA is available, otherwise on the CPU
use_gpu = torch.cuda.is_available()
device = torch.device("cuda") if use_gpu else torch.device("cpu")

# Pre-trained DistilBERT tokenizer and encoder; the encoder is moved to the chosen device
# NOTE(review): a later cell rebinds `model` to a GBT model — re-run this cell before
# re-creating the embedding UDF.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model = model.to(device)

# Defining a function to get DistilBERT embeddings (CLS token equivalent)
def get_bert_embeddings(text):
    """Embed one description with DistilBERT and return it as a Spark dense vector.

    Args:
        text: The description to embed. ``None`` (a SQL NULL reaching the UDF)
            is embedded as the empty string instead of crashing the tokenizer.

    Returns:
        ``Vectors.dense`` built from the last hidden state at token position 0.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.
    """
    if text is None:
        text = ""
    # Inputs longer than 512 tokens are truncated. No padding is requested: each call
    # encodes a single text, so padding to 512 tokens only adds masked positions that
    # the encoder still has to process. The position-0 output is expected to match up
    # to floating-point rounding.
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    # Batch item 0, token position 0 -> 1-D array, moved to the CPU for Spark
    cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()
    return Vectors.dense(cls_embedding)

# Creating a PySpark UDF for generating DistilBERT embeddings
bert_embeddings_udf = udf(get_bert_embeddings, VectorUDT())

# Add the embedding as the last column, drop the raw text, and cache the result.
# The DataFrame is lazy, so without cache() each later action that needs
# `bert_embeddings` would run DistilBERT over the rows again.
df_with_bert = (
    df_sample
    .select("*", bert_embeddings_udf(col("description")).alias("bert_embeddings"))
    .drop("description")
    .cache()
)




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Displaying the first three rows of df_with_bert with every column visible
# (`pd` and `display` are not imported in this cell; they must already be in scope)
pd.set_option('display.max_columns', None)
first_three_rows = df_with_bert.limit(3)
df_with_bert_pandas = first_three_rows.toPandas()
display(df_with_bert_pandas)

Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,daysonmarket,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listed_date,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,year,combined_fuel_economy,legroom,log_mileage,major_options_count,bert_embeddings
0,Gasoline,Coupe,Dallas,16.0,5,75235,5000.0,V8,White,True,16.0,54.3,25.0,460.0,Black,True,32.828899,188.5,2020-09-06,WHITE,-96.828903,Ford,4.0,Mustang,38690.0,0,4.5625,Park Cities Ford Lincoln,420.0,A,Automatic,Rear-Wheel Drive,107.1,81.9,2020,20.5,74.1,8.91,1,"[-0.3277852535247803, -0.11871742457151413, 0...."
1,Gasoline,Sedan,Winston Salem,23.0,13,27103,2000.0,I4,White,True,17.4,57.2,32.0,237.0,Other,True,36.076302,193.8,2020-08-28,WHITE,-80.302803,Cadillac,5.0,CT5,50890.0,0,5.0,Flow Cadillac of Winston Salem,405.0,A,Automatic,Rear-Wheel Drive,116.0,80.3,2020,27.5,80.3,0.0,6,"[-0.14011572301387787, 0.012744560837745667, 0..."
2,Gasoline,SUV / Crossover,Los Angeles,18.0,14,90007,3600.0,V6,Other,True,19.4,70.7,27.0,310.0,Black,True,34.022701,204.3,2020-08-29,UNKNOWN,-118.278999,Chevrolet,7.0,Traverse,45120.0,0,5.0,Felix Chevrolet,266.0,A,Automatic,Front-Wheel Drive,120.9,78.6,2020,22.5,79.4,1.79,4,"[-0.6508010029792786, 0.03276926279067993, 0.0..."




---



In [None]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Identifying different types of columns.
# Columns are classified by Spark type class rather than by typeName() string:
# IntegerType reports "integer" (not "int"), so a string comparison against "int"
# silently leaves every integer column out of the feature vector.
from pyspark.sql.types import BooleanType, NumericType, StringType

print("Identifying column types...")
numeric_cols = []
categorical_cols = []
boolean_cols = []

for field in df_with_bert.schema.fields:
    if field.name == "price":  # Excluding the target variable
        continue
    if isinstance(field.dataType, NumericType):
        numeric_cols.append(field.name)
    elif isinstance(field.dataType, StringType):
        categorical_cols.append(field.name)
    elif isinstance(field.dataType, BooleanType):
        boolean_cols.append(field.name)
    # Other types (such as the bert_embeddings vector) are not classified here;
    # bert_embeddings is appended to the feature list explicitly below.

print(f"Numeric columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")
print(f"Boolean columns: {boolean_cols}")

print('\n\nPreparing features...')
stages = []

# Handling categorical columns
for cat_col in categorical_cols:
    # String Indexer (handleInvalid="keep" lets the fitted indexer accept unseen labels)
    indexer = StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_indexed", handleInvalid="keep")
    # One Hot Encoder
    encoder = OneHotEncoder(inputCol=f"{cat_col}_indexed", outputCol=f"{cat_col}_encoded")
    stages += [indexer, encoder]

# Collecting all feature columns
# NOTE(review): VectorAssembler rejects nulls by default, so the integer columns that are
# now included must be null-free — confirm against the upstream cleaning steps.
feature_cols = (
    numeric_cols +
    boolean_cols +
    [f"{cat_col}_encoded" for cat_col in categorical_cols] +
    ["bert_embeddings"]
)

# Adding VectorAssembler to the pipeline stages
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
stages += [assembler]

# Creating and fitting the pipeline
pipeline = Pipeline(stages=stages)
df_assembled = pipeline.fit(df_with_bert).transform(df_with_bert)

Identifying column types...
Numeric columns: ['city_fuel_economy', 'engine_displacement', 'fuel_tank_volume', 'height', 'highway_fuel_economy', 'horsepower', 'latitude', 'length', 'longitude', 'maximum_seating', 'mileage', 'seller_rating', 'torque', 'wheelbase', 'width', 'combined_fuel_economy', 'legroom', 'log_mileage']
Categorical columns: ['fuel_type', 'body_type', 'city', 'dealer_zip', 'engine_type', 'exterior_color', 'interior_color', 'listed_date', 'listing_color', 'major_options', 'make_name', 'model_name', 'sp_name', 'transmission', 'transmission_display', 'wheel_system_display']
Boolean columns: ['franchise_dealer', 'is_new']


Preparing features...


In [None]:
# Splitting the data into training and testing sets
print("Splitting data into train and test sets...")
train_data, test_data = df_assembled.randomSplit([0.8, 0.2], seed=42)

# Starting tracking overall run time
start_time = time.time()

# Training GBTRegressor model
print("Training GBTRegressor model...")

# Initializing GBTRegressor with tuned parameters
gbt_regressor = GBTRegressor(
    featuresCol="features",  # Using the combined features from VectorAssembler
    labelCol="price",        # The target column
    maxIter=100,             # Number of iterations (trees)
    maxDepth=5,              # Maximum tree depth
    seed=42                  # For reproducibility
)

# Training the model
model = gbt_regressor.fit(train_data)

# Making predictions on the test set
print("Making predictions...")
# transform() is lazy: without caching, every evaluator call on `predictions`
# re-runs the upstream plan for the test rows, which here presumably includes
# the embedding UDF behind `bert_embeddings`.
predictions = model.transform(test_data).cache()

Splitting data into train and test sets...
Training GBTRegressor model...
Making predictions...


In [None]:
new_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/BERT_experimental_feature_model"

# Saving the trained model to the new path.
# The overwrite writer is used so re-running this cell replaces the stored model
# instead of failing because the path already exists.
model.write().overwrite().save(new_model_path)

print(f"Model saved successfully at {new_model_path}")


Model saved successfully at /content/drive/MyDrive/Big Data Analytics - Project/models/BERT_experimental_feature_model


In [None]:
# R-squared of the predictions against the true price, reported as a percentage
print("Evaluating the model...")
evaluator = RegressionEvaluator(metricName="r2", labelCol="price", predictionCol="prediction")
r2 = evaluator.evaluate(predictions)
r2_percent = round(r2 * 100, 2)
print(f"\n\nR-Squared Score: {r2_percent}%")

# Sizes of the two splits (each count runs a Spark job)
train_size = train_data.count()
print(f"\nTrain size: {train_size:,} samples")
test_size = test_data.count()
print(f"Test size: {test_size:,} samples")

# Minutes elapsed since `start_time` was set
end_time = time.time()
elapsed_seconds = end_time - start_time
total_runtime = elapsed_seconds / 60
print(f"\nOverall runtime: {round(total_runtime, 2)} minutes.")

Evaluating the model...


R-Squared Score: 83.8%

Train size: 80,535 samples
Test size: 19,902 samples

Overall runtime: 343.43 minutes.


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
import time

# Timer for this cell only: it covers the metric evaluation below (nothing is trained here)
start_time = time.time()

# One evaluator per metric, all comparing the `price` label with the `prediction` column
shared_columns = {"labelCol": "price", "predictionCol": "prediction"}
mae_evaluator = RegressionEvaluator(metricName="mae", **shared_columns)    # Mean Absolute Error
rmse_evaluator = RegressionEvaluator(metricName="rmse", **shared_columns)  # Root Mean Squared Error
mse_evaluator = RegressionEvaluator(metricName="mse", **shared_columns)    # Mean Squared Error

mae = mae_evaluator.evaluate(predictions)
rmse = rmse_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)

print("Additional Metrics:")
print(f"\nMean Absolute Error (MAE): {round(mae, 2)}")
print(f"Root Mean Squared Error (RMSE): {round(rmse, 2)}")
print(f"Mean Squared Error (MSE): {round(mse, 2)}")

# Minutes spent in this cell
end_time = time.time()
elapsed_seconds = end_time - start_time
total_runtime = elapsed_seconds / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")

Additional Metrics:
Mean Absolute Error (MAE): 3948.79
Root Mean Squared Error (RMSE): 7257.57
Mean Squared Error (MSE): 52672255.29

Overall runtime: 126 minutes.




---



In [None]:
# Disabled code: the whole cell is one bare triple-quoted string, so nothing inside it runs.
# The text holds three alternative ways of loading the intermediate dataset from Azure:
# reading Parquet over abfss://, listing the blobs of a container, and downloading one
# blob to /content before reading it with Spark. It refers to `azure_account_key`, which
# is not defined anywhere in this string.
'''
# Path to the Parquet file in ADLS (Azure Data Lake Storage) using abfs:// protocol
parquet_file_path = "abfss://usedcar-dataset@usedcarstorageacct.dfs.core.windows.net/Intermediate_D"

# Load the Parquet file into a Spark DataFrame
df = spark.read.parquet(parquet_file_path)

# Display the number of rows and some sample data
print(f"Number of rows: {df.count()}")
df.show(5)

from azure.storage.blob import BlobServiceClient

# Set up your Azure Storage account credentials
account_name = "usedcarstorageacct"
account_key = azure_account_key
container_name = "usedcar-dataset"

# Create the BlobServiceClient object
connection_string = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Get the ContainerClient to list blobs
container_client = blob_service_client.get_container_client(container_name)

# List blobs in the container
print("Listing blobs in the container...")
blobs_list = container_client.list_blobs()
for blob in blobs_list:
    print(blob.name)

from azure.storage.blob import BlobServiceClient
import os

# Azure Storage account credentials
account_name = "usedcarstorageacct"
account_key = azure_account_key
container_name = "usedcar-dataset"
parquet_file_name = "Intermediate_DF/intermediate_processed_used_cars_data"  # Path to the file in Azure Blob
local_file_name = "/content/Intermediate_DF"

# Create the BlobServiceClient object
connection_string = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Get the BlobClient for the specific file
blob_client = blob_service_client.get_blob_client(container=container_name, blob=parquet_file_name)

# Download the blob to a local file
print(f"Downloading {parquet_file_name} from Azure Blob Storage to {local_file_name}...")

# Write the file to the local path
with open(local_file_name, "wb") as file:
    file.write(blob_client.download_blob().readall())

print(f"File {local_file_name} downloaded successfully.")

# Load the Parquet file into a Spark DataFrame
df = spark.read.parquet("/content/Intermediate_DF")

# Show a few rows to verify the DataFrame
df.show(5)
'''