In [None]:
import importlib
import subprocess
import sys
import gc

# Pip distribution names whose importable module name differs from the pip name.
_IMPORT_NAME_OVERRIDES = {
    "scikit-learn": "sklearn",
}


def check_and_install_package(package_name, import_name=None):
    """Make sure a package is importable, installing it with pip when it is not.

    The name used with ``pip install`` is not always the name used with
    ``import`` (``scikit-learn`` is imported as ``sklearn``). Probing the pip
    name directly makes such packages look missing on every run, so the import
    name is resolved separately from the pip name.

    Args:
        package_name: Distribution name passed to ``pip install``.
        import_name: Module name to probe with ``importlib.import_module``.
            When omitted, it is looked up in ``_IMPORT_NAME_OVERRIDES`` and
            otherwise derived from ``package_name`` by replacing ``-`` with ``_``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    if import_name is None:
        import_name = _IMPORT_NAME_OVERRIDES.get(
            package_name, package_name.replace("-", "_")
        )

    try:
        importlib.import_module(import_name)
        print(f"\n{package_name} is already installed.")
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        # Let the import system notice modules that were installed while this
        # process was already running.
        importlib.invalidate_caches()
        print(f"{package_name} installation completed.")

# Pip distribution names this notebook depends on
packages = ["tqdm", "dask", "nltk", "scikit-learn", "numpy", "pyspark", "gdown"]

# Make sure every dependency is available, installing whichever ones are not
for package in packages:
    check_and_install_package(package)



tqdm is already installed.

dask is already installed.

nltk is already installed.

scikit-learn is NOT installed. Installing now...
scikit-learn installation completed.

numpy is already installed.

pyspark is already installed.

gdown is already installed.


In [None]:
# Mount Google Drive at /content/drive; the dataset is copied from under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session; each .config() call sets one option
spark = (
    SparkSession.builder
    .appName("RandomForestModel")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.maxResultSize", "8g")
    .config("spark.executor.memoryOverhead", "12g")
    .config("spark.executor.cores", "5")
    .config("spark.kryoserializer.buffer.max", "2047m")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4 -XX:ParallelGCThreads=4")
    .getOrCreate()
)

# Confirm the session is up by reporting the Spark version it runs
print(f"Spark session started with version: {spark.version}")

Spark session started with version: 3.5.3


In [None]:
# Copy the feature-engineered Parquet dataset from the mounted Drive folder into /content
!cp '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet' /content/

# Despite its name, `output_path` is the location the DataFrame is read from
output_path = '/content/Feature_Engineered_DF.parquet'
df = spark.read.parquet(output_path)
print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [None]:
# Report the DataFrame's dimensions as (row count, column count)
total_rows, total_columns = df.count(), len(df.columns)

print(f"The shape of the loaded DataFrame is: ({total_rows}, {total_columns})")

The shape of the loaded DataFrame is: (3000040, 47)


In [None]:
# Mean of the `price` column, read from the single row the aggregation returns
avg_price = df.agg({"price": "avg"}).first()[0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Show every column when rendering the preview
pd.set_option("display.max_columns", None)

# Pull 5 rows in random order into pandas for rich display
pandas_df = (
    df.orderBy(F.rand())
    .limit(5)
    .toPandas()
)
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,manufactured_year,combined_fuel_economy,legroom,log_mileage,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,Sedan,Ridgeland,24.0,40,39157,2000.0,I4,Silver,True,17.4,56.3,35.0,255.0,Black,True,32.3755,184.5,SILVER,-90.148399,Mercedes-Benz,5.0,C-Class,48315.0,0,4.21875,Mercedes-Benz of Jackson,273.0,A,Automatic,Rear-Wheel Drive,111.8,79.4,2020,29.5,76.9,1.79,8,-0.06,0.00603,2,8,2020,0,33,41,35
1,Gasoline,Sedan,Costa Mesa,24.0,41,92626,2000.0,I4,White,False,18.0,58.2,34.0,248.0,Black,False,33.687,194.6,WHITE,-117.918999,BMW,5.0,5 Series,30998.0,317,5.0,CarMax Costa Mesa - Now offering Curbside Pick...,258.0,A,Automatic,Rear-Wheel Drive,117.1,83.7,2017,29.0,77.9,10.51,7,0.0,-0.00031,2,8,2020,3,24,38,33
2,Gasoline,Coupe,Westfield,16.0,21,46074,5000.0,V8,Other,False,16.0,54.3,25.0,460.0,Black,False,40.0373,188.5,GRAY,-86.165901,Ford,4.0,Mustang,33500.0,766,4.271429,Unlimited Motors - Westfield,420.0,A,Automatic,Rear-Wheel Drive,107.1,81.9,2019,20.5,74.1,10.19,4,3.79,3.43979,21,8,2020,1,28,46,29
3,Gasoline,SUV / Crossover,Hornell,26.0,50,14843,1500.0,I3,Black,True,14.7,66.1,31.0,180.0,Other,True,42.3661,180.5,BLACK,-77.678703,Ford,5.0,Escape,26499.0,0,3.571429,"Simmons Rockwell Ford, Inc.",265.22,A,Automatic,All-Wheel Drive,106.7,85.6,2020,28.5,83.1,1.39,8,0.84,-1e-05,22,7,2020,0,32,39,34
4,Gasoline,SUV / Crossover,Bay Minette,18.0,21,36507,3000.0,V6,Gray,True,21.5,73.6,22.0,260.0,Black,True,30.884501,188.4,UNKNOWN,-87.770699,Jeep,5.0,Wrangler Unlimited,38984.0,0,3.3,Chuck Stevens Dodge Chrysler Jeep Ram,442.0,A,8-Speed Automatic,Four-Wheel Drive,118.4,73.8,2020,20.0,79.5,2.48,5,0.01,0.22947,20,8,2020,0,31,41,36


## **Random Forest Regressor**

## **Predicting on DF with a size of `~100k records`**

In [None]:
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder, Imputer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql.functions import mean as sql_mean, log
import pyspark.sql.functions as F
import time

# Ignore warnings
warnings.filterwarnings('ignore')

# Starting to track overall runtime
start_time = time.time()

# Every stage below re-raises after reporting its error. Continuing past a failed stage
# would let later steps pick up names such as `model` or `predictions` still bound from
# an earlier run in this kernel and report metrics that do not belong to this sample.
with tqdm(total=7, desc="Processing and Training") as pbar:

    df_sample = df.sample(fraction=0.033, seed=42)  # Random sampling ~100k records
    pbar.update(1)

    # Removing rows where 'price' is <= 0 (to avoid issues with log transformation)
    df_sample = df_sample.filter(F.col("price") > 0)

    # Log transforming the target variable
    df_sample = df_sample.withColumn("log_price", log("price"))
    pbar.update(1)

    # Handling categorical columns: index each string column, then one-hot encode the index
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features (ensure all columns used in 'VectorAssembler' are numeric).
    # The target columns and the raw string columns are left out; the string columns
    # enter through their one-hot encoded counterparts instead.
    excluded_columns = {"price", "log_price", *cat_columns}
    num_columns = [col for col in df_sample.columns if col not in excluded_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline (to scale the assembled feature vectors)
    # NOTE(review): withMean=True yields dense vectors (see the Spark StandardScaler docs),
    # which is costly on one-hot encoded input; confirm scaling is needed for a tree model.
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    try:
        pipeline_model = pipeline.fit(df_sample)
        df_sample = pipeline_model.transform(df_sample)
    except Exception as e:
        print(f"Error during pipeline fit: {e}")
        raise
    pbar.update(1)

    # Splitting the data into training and test sets
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    # Defining the RandomForestRegressor model (trained on the log-transformed price)
    rf = RandomForestRegressor(
      featuresCol="scaled_features",
      labelCol="log_price",
      numTrees=50,
      maxDepth=10,
      minInstancesPerNode=10,
      seed=42
    )

    # Fitting the model to the training data
    try:
        model = rf.fit(train_df)
    except Exception as e:
        print(f"Error during model training: {e}")
        raise
    pbar.update(1)

# Making predictions; exp() maps the log-price prediction back to the price scale
print("Making predictions...")
try:
    predictions = model.transform(test_df)
    predictions = predictions.withColumn("exp_prediction", F.exp("prediction"))
except Exception as e:
    print(f"Error during prediction: {e}")
    raise

# Evaluating the model against the actual (untransformed) price
print("Evaluating the model...")
try:
    evaluator = RegressionEvaluator(labelCol="price", predictionCol="exp_prediction", metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")
except Exception as e:
    print(f"Error during evaluation: {e}")
    raise

# Displaying results
print(f"\n\nTrain size: {train_df.count():,} samples")
print(f"Test size: {test_df.count():,} samples")

# Calculating total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training: 100%|██████████| 7/7 [50:48<00:00, 435.51s/it]


Making predictions...
Evaluating the model...


R-Squared Score (Accuracy): 75.25%


Train size: 79,346 samples
Test size: 19,771 samples

Overall runtime: 52 minutes.


In [None]:
# Calculating additional metrics: one evaluator per metric, each comparing the
# `exp_prediction` column against the actual `price`
mae_evaluator, mse_evaluator, rmse_evaluator = (
    RegressionEvaluator(labelCol="price", predictionCol="exp_prediction", metricName=metric)
    for metric in ("mae", "mse", "rmse")
)

mae = mae_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)
rmse = rmse_evaluator.evaluate(predictions)

# Values are rounded to whole numbers for display only
print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

Additional Metrics:
Mean Absolute Error: 3517
Mean Squared Error: 88738726
Root Mean Squared Error: 9420




---



## **Predicting on DF with a size of `~200k records`**

In [None]:
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder, Imputer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql.functions import mean as sql_mean, log
import pyspark.sql.functions as F
import time

# Ignore warnings
warnings.filterwarnings('ignore')

# Starting to track overall runtime
start_time = time.time()

# Every stage below re-raises after reporting its error. Continuing past a failed stage
# would let later steps pick up names such as `model` or `predictions` still bound from
# an earlier run in this kernel and report metrics that do not belong to this sample.
with tqdm(total=7, desc="Processing and Training") as pbar:

    df_sample = df.sample(fraction=0.066, seed=42)  # Random sampling ~200k records
    pbar.update(1)

    # Removing rows where 'price' is <= 0 (to avoid issues with log transformation)
    df_sample = df_sample.filter(F.col("price") > 0)

    # Log transforming the target variable
    df_sample = df_sample.withColumn("log_price", log("price"))
    pbar.update(1)

    # Handling categorical columns: index each string column, then one-hot encode the index
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features (ensure all columns used in 'VectorAssembler' are numeric).
    # The target columns and the raw string columns are left out; the string columns
    # enter through their one-hot encoded counterparts instead.
    excluded_columns = {"price", "log_price", *cat_columns}
    num_columns = [col for col in df_sample.columns if col not in excluded_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline (to scale the assembled feature vectors)
    # NOTE(review): withMean=True yields dense vectors (see the Spark StandardScaler docs),
    # which is costly on one-hot encoded input; confirm scaling is needed for a tree model.
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    try:
        pipeline_model = pipeline.fit(df_sample)
        df_sample = pipeline_model.transform(df_sample)
    except Exception as e:
        print(f"Error during pipeline fit: {e}")
        raise
    pbar.update(1)

    # Splitting the data into training and test sets
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    # Defining the RandomForestRegressor model (trained on the log-transformed price)
    rf = RandomForestRegressor(
      featuresCol="scaled_features",
      labelCol="log_price",
      numTrees=50,
      maxDepth=10,
      minInstancesPerNode=10,
      seed=42
    )

    # Fitting the model to the training data
    try:
        model = rf.fit(train_df)
    except Exception as e:
        print(f"Error during model training: {e}")
        raise
    pbar.update(1)

# Making predictions; exp() maps the log-price prediction back to the price scale
print("Making predictions...")
try:
    predictions = model.transform(test_df)
    predictions = predictions.withColumn("exp_prediction", F.exp("prediction"))
except Exception as e:
    print(f"Error during prediction: {e}")
    raise

# Evaluating the model against the actual (untransformed) price
print("Evaluating the model...")
try:
    evaluator = RegressionEvaluator(labelCol="price", predictionCol="exp_prediction", metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")
except Exception as e:
    print(f"Error during evaluation: {e}")
    raise

# Displaying results
print(f"\n\nTrain size: {train_df.count():,} samples")
print(f"Test size: {test_df.count():,} samples")

# Calculating total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training: 100%|██████████| 7/7 [2:05:24<00:00, 1074.99s/it]


Making predictions...
Evaluating the model...


R-Squared Score (Accuracy): 82.96%


Train size: 158,449 samples
Test size: 39,727 samples

Overall runtime: 130 minutes.


In [None]:
# Calculating additional metrics: one evaluator per metric, each comparing the
# `exp_prediction` column against the actual `price`
mae_evaluator, mse_evaluator, rmse_evaluator = (
    RegressionEvaluator(labelCol="price", predictionCol="exp_prediction", metricName=metric)
    for metric in ("mae", "mse", "rmse")
)

mae = mae_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)
rmse = rmse_evaluator.evaluate(predictions)

# Values are rounded to whole numbers for display only
print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

Additional Metrics:
Mean Absolute Error: 3545
Mean Squared Error: 59091248
Root Mean Squared Error: 7687




---



## **Predicting on DF with a size of `~300k records`**

In [None]:
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder, Imputer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql.functions import mean as sql_mean, log
import pyspark.sql.functions as F
import time

# Ignore warnings
warnings.filterwarnings('ignore')

# Starting to track overall runtime
start_time = time.time()

# Every stage below re-raises after reporting its error. Continuing past a failed stage
# would let later steps pick up names such as `model` or `predictions` still bound from
# an earlier run in this kernel and report metrics that do not belong to this sample.
with tqdm(total=7, desc="Processing and Training") as pbar:

    df_sample = df.sample(fraction=0.1, seed=42)  # Random sampling ~300k records
    pbar.update(1)

    # Removing rows where 'price' is <= 0 (to avoid issues with log transformation)
    df_sample = df_sample.filter(F.col("price") > 0)

    # Log transforming the target variable
    df_sample = df_sample.withColumn("log_price", log("price"))
    pbar.update(1)

    # Handling categorical columns: index each string column, then one-hot encode the index
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features (ensure all columns used in 'VectorAssembler' are numeric).
    # The target columns and the raw string columns are left out; the string columns
    # enter through their one-hot encoded counterparts instead.
    excluded_columns = {"price", "log_price", *cat_columns}
    num_columns = [col for col in df_sample.columns if col not in excluded_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline (to scale the assembled feature vectors)
    # NOTE(review): withMean=True yields dense vectors (see the Spark StandardScaler docs),
    # which is costly on one-hot encoded input; confirm scaling is needed for a tree model.
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    try:
        pipeline_model = pipeline.fit(df_sample)
        df_sample = pipeline_model.transform(df_sample)
    except Exception as e:
        print(f"Error during pipeline fit: {e}")
        raise
    pbar.update(1)

    # Splitting the data into training and test sets
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    # Defining the RandomForestRegressor model (trained on the log-transformed price)
    rf = RandomForestRegressor(
      featuresCol="scaled_features",
      labelCol="log_price",
      numTrees=50,
      maxDepth=10,
      minInstancesPerNode=10,
      seed=42
    )

    # Fitting the model to the training data
    try:
        model = rf.fit(train_df)
    except Exception as e:
        print(f"Error during model training: {e}")
        raise
    pbar.update(1)

# Making predictions; exp() maps the log-price prediction back to the price scale
print("Making predictions...")
try:
    predictions = model.transform(test_df)
    predictions = predictions.withColumn("exp_prediction", F.exp("prediction"))
except Exception as e:
    print(f"Error during prediction: {e}")
    raise

# Evaluating the model against the actual (untransformed) price
print("Evaluating the model...")
try:
    evaluator = RegressionEvaluator(labelCol="price", predictionCol="exp_prediction", metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")
except Exception as e:
    print(f"Error during evaluation: {e}")
    raise

# Displaying results
print(f"\n\nTrain size: {train_df.count():,} samples")
print(f"Test size: {test_df.count():,} samples")

# Calculating total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training: 100%|██████████| 7/7 [3:20:14<00:00, 1716.31s/it]
Making predictions...
Evaluating the model...

R-Squared Score (Accuracy): 84.61%

Train size: 240,222 samples
Test size: 60,280 samples

Overall runtime: 206 minutes.


In [None]:
# Calculating additional metrics: one evaluator per metric, each comparing the
# `exp_prediction` column against the actual `price`
mae_evaluator, mse_evaluator, rmse_evaluator = (
    RegressionEvaluator(labelCol="price", predictionCol="exp_prediction", metricName=metric)
    for metric in ("mae", "mse", "rmse")
)

mae = mae_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)
rmse = rmse_evaluator.evaluate(predictions)

# Values are rounded to whole numbers for display only
print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

Additional Metrics:
Mean Absolute Error: 3594
Mean Squared Error: 48734361
Root Mean Squared Error: 6981




---



## **Predicting on DF with a size of `~600k records`**

In [None]:
import warnings
from tqdm import tqdm
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder, Imputer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql.functions import mean as sql_mean, log
import pyspark.sql.functions as F
import time

# Ignore warnings
warnings.filterwarnings('ignore')

# Starting to track overall runtime
start_time = time.time()

# Every stage below re-raises after reporting its error. Continuing past a failed stage
# would let later steps pick up names such as `model` or `predictions` still bound from
# an earlier run in this kernel and report metrics that do not belong to this sample.
with tqdm(total=7, desc="Processing and Training") as pbar:

    df_sample = df.sample(fraction=0.2, seed=42)  # Random sampling ~600k records
    pbar.update(1)

    # Removing rows where 'price' is <= 0 (to avoid issues with log transformation)
    df_sample = df_sample.filter(F.col("price") > 0)

    # Log transforming the target variable
    df_sample = df_sample.withColumn("log_price", log("price"))
    pbar.update(1)

    # Handling categorical columns: index each string column, then one-hot encode the index
    cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]
    stages = []
    for col_name in cat_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
        encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
        stages += [indexer, encoder]
    pbar.update(1)

    # Converting 'franchise_dealer' to numeric
    df_sample = df_sample.withColumn("franchise_dealer", F.col("franchise_dealer").cast("int"))

    # Assembling features (ensure all columns used in 'VectorAssembler' are numeric).
    # The target columns and the raw string columns are left out; the string columns
    # enter through their one-hot encoded counterparts instead.
    excluded_columns = {"price", "log_price", *cat_columns}
    num_columns = [col for col in df_sample.columns if col not in excluded_columns]
    encoded_columns = [f"{col}_encoded" for col in cat_columns]
    feature_columns = num_columns + encoded_columns
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    stages += [assembler]
    pbar.update(1)

    # Adding scaling to the pipeline (to scale the assembled feature vectors)
    # NOTE(review): withMean=True yields dense vectors (see the Spark StandardScaler docs),
    # which is costly on one-hot encoded input; confirm scaling is needed for a tree model.
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
    stages += [scaler]

    # Creating and applying the pipeline
    pipeline = Pipeline(stages=stages)
    try:
        pipeline_model = pipeline.fit(df_sample)
        df_sample = pipeline_model.transform(df_sample)
    except Exception as e:
        print(f"Error during pipeline fit: {e}")
        raise
    pbar.update(1)

    # Splitting the data into training and test sets
    train_df, test_df = df_sample.randomSplit([0.8, 0.2], seed=42)
    pbar.update(1)

    # Defining the RandomForestRegressor model (trained on the log-transformed price)
    rf = RandomForestRegressor(
      featuresCol="scaled_features",
      labelCol="log_price",
      numTrees=50,
      maxDepth=10,
      minInstancesPerNode=10,
      seed=42
    )

    # Fitting the model to the training data
    try:
        model = rf.fit(train_df)
    except Exception as e:
        print(f"Error during model training: {e}")
        raise
    pbar.update(1)

# Making predictions; exp() maps the log-price prediction back to the price scale
print("Making predictions...")
try:
    predictions = model.transform(test_df)
    predictions = predictions.withColumn("exp_prediction", F.exp("prediction"))
except Exception as e:
    print(f"Error during prediction: {e}")
    raise

# Evaluating the model against the actual (untransformed) price
print("Evaluating the model...")
try:
    evaluator = RegressionEvaluator(labelCol="price", predictionCol="exp_prediction", metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print(f"\n\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")
except Exception as e:
    print(f"Error during evaluation: {e}")
    raise

# Displaying results
print(f"\n\nTrain size: {train_df.count():,} samples")
print(f"Test size: {test_df.count():,} samples")

# Calculating total runtime
end_time = time.time()
total_runtime = (end_time - start_time) / 60
print(f"\nOverall runtime: {round(total_runtime)} minutes.")


Processing and Training: 100%|██████████| 7/7 [8:22:00<00:00, 4302.89s/it]
Making predictions...
Evaluating the model...

R-Squared Score (Accuracy): 86.35%

Train size: 480,411 samples
Test size: 120,366 samples

Overall runtime: 515 minutes.


In [None]:
# Calculating additional metrics: one evaluator per metric, each comparing the
# `exp_prediction` column against the actual `price`
mae_evaluator, mse_evaluator, rmse_evaluator = (
    RegressionEvaluator(labelCol="price", predictionCol="exp_prediction", metricName=metric)
    for metric in ("mae", "mse", "rmse")
)

mae = mae_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)
rmse = rmse_evaluator.evaluate(predictions)

# Values are rounded to whole numbers for display only
print("Additional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")

Additional Metrics:
Mean Absolute Error: 3488
Mean Squared Error: 44980420
Root Mean Squared Error: 6707
