In [None]:
import importlib
import subprocess
import sys
import gc

def check_and_install_package(package_name, version=None):
    """Make sure a package is importable and, if pinned, at the requested version.

    Args:
        package_name: Name used both for the import check and for ``pip install``.
        version: Optional exact version. When given, an importable package whose
            installed version differs is re-installed at ``version`` instead of
            being reported as already installed.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    # Local import: leaves the cell's import block untouched.
    from importlib import metadata

    requirement = f"{package_name}=={version}" if version else package_name
    replaced_loaded_version = False

    try:
        importlib.import_module(package_name)
    except ImportError:
        print(f"\n{package_name} is NOT installed. Installing now...")
    else:
        installed_version = None
        if version:
            try:
                installed_version = metadata.version(package_name)
            except metadata.PackageNotFoundError:
                # Importable but without distribution metadata under this name:
                # the version cannot be verified, so keep the previous behaviour.
                installed_version = None

        if not version or installed_version is None or installed_version == str(version):
            print(f"\n{package_name} is already installed.")
            return

        print(
            f"\n{package_name} {installed_version} is installed but {version} is required. "
            "Installing now..."
        )
        replaced_loaded_version = True

    subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])
    print(f"{package_name} installation completed.")
    if replaced_loaded_version:
        # The import check above already loaded the old version into this process.
        print(f"Restart the runtime so that {package_name} {version} is the version that gets imported.")

# Dependencies of this notebook as (package name, pinned version or None) pairs
package_specs = [
    ("tqdm", None),
    ("pyspark", "3.1.1"),
    ("gdown", None),
    ("numpy", "1.22.4"),
    ("xgboost", None),
    ("sparkxgb", None),
]
packages = [{"name": name, "version": pinned} for name, pinned in package_specs]

# Walk the list and make sure each dependency is available
for package in packages:
    check_and_install_package(package_name=package["name"], version=package["version"])


tqdm is already installed.

pyspark is already installed.

gdown is already installed.

numpy is already installed.

xgboost is already installed.

sparkxgb is already installed.


In [None]:
!pip install xgboost==1.5.0

Collecting xgboost==1.5.0
  Downloading xgboost-1.5.0-py3-none-manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading xgboost-1.5.0-py3-none-manylinux2014_x86_64.whl (173.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.5/173.5 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.1.2
    Uninstalling xgboost-2.1.2:
      Successfully uninstalled xgboost-2.1.2
Successfully installed xgboost-1.5.0


In [None]:
import xgboost as xgb
import sparkxgb

# Report which versions of the two XGBoost packages the kernel actually imported
xgboost_version = xgb.__version__
sparkxgb_version = sparkxgb.__version__
print(f"XGBoost version: {xgboost_version}")
print(f"SparkXGB version: {sparkxgb_version}")


XGBoost version: 1.5.0
SparkXGB version: 0.90


In [None]:
!pip install numpy==1.22.4



In [None]:
# Confirm which NumPy version the kernel imports
import numpy

numpy_version = numpy.__version__
print(numpy_version)

1.22.4


In [None]:
!pip install sparkxgb



In [None]:
from google.colab import drive

# Mount Google Drive; the jars, models and datasets used below are read from this mount point
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import os
import shutil

# Directory on the instance's own file system that will hold the jars
local_resources_path = "/resources"
os.makedirs(local_resources_path, exist_ok=True)

# Where the two XGBoost4J jars live on the mounted Google Drive
xgboost4j_source = "/content/drive/MyDrive/Big Data Analytics - Project/resources/xgboost4j_2.12-1.7.6.jar"
xgboost4j_spark_source = "/content/drive/MyDrive/Big Data Analytics - Project/resources/xgboost4j-spark_2.12-1.7.6.jar"

# Matching target paths inside the local resources directory
xgboost4j_dest = os.path.join(local_resources_path, "xgboost4j_2.12-1.7.6.jar")
xgboost4j_spark_dest = os.path.join(local_resources_path, "xgboost4j-spark_2.12-1.7.6.jar")

# Copy each jar from Drive to its local target, in the same order as before
jar_copy_plan = (
    (xgboost4j_source, xgboost4j_dest),
    (xgboost4j_spark_source, xgboost4j_spark_dest),
)
for jar_source, jar_dest in jar_copy_plan:
    shutil.copyfile(jar_source, jar_dest)

# Show the directory content so the copy can be checked by eye
print(f"Jar Files copied to: {local_resources_path}")
print(os.listdir(local_resources_path))


Jar Files copied to: /resources
['xgboost4j-spark_2.12-1.7.6.jar', 'xgboost4j_2.12-1.7.6.jar']


In [None]:
# DISABLED DRAFT: everything between the triple quotes below is a single string
# literal, so none of this code runs. It is a function-based version of the Spark
# session setup and the JVM-based XGBoost model loading that the following cells
# perform inline.
# NOTE(review): the output recorded under this cell presumably comes from a run
# made before the code was wrapped in quotes - confirm, then clear it.
'''
from pyspark.sql import SparkSession
from pyspark import SparkConf
import os

def create_spark_session():
    # Get absolute paths to the jar files
    current_dir = os.getcwd()
    xgboost4j_path = os.path.abspath("/resources/xgboost4j_2.12-1.7.6.jar")
    xgboost4j_spark_path = os.path.abspath("/resources/xgboost4j-spark_2.12-1.7.6.jar")

    # Verify jar files exist
    if not os.path.exists(xgboost4j_path) or not os.path.exists(xgboost4j_spark_path):
        raise FileNotFoundError(f"XGBoost jar files not found in {current_dir}/resources/")

    # Create SparkConf with necessary configurations
    conf = SparkConf()
    conf.set("spark.jars", f"{xgboost4j_path},{xgboost4j_spark_path}")

    # Additional configurations
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.executor.memory", "4g")
    conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    conf.set("spark.driver.maxResultSize", "2g")

    # Initialize Spark session
    spark = SparkSession.builder \
        .appName("XGBoost Example") \
        .config(conf=conf) \
        .getOrCreate()

    return spark

def load_xgboost_model(spark, model_path):
    try:
        # Import XGBoost classes only after Spark session is created
        from pyspark.ml.util import MLReader

        # Get the Java gateway
        gateway = spark.sparkContext._gateway
        jvm = gateway.jvm

        # Load the XGBoost classes through Java gateway
        XGBoostRegressionModel = jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel

        # Create a Java object reader
        reader = XGBoostRegressionModel.read()

        # Load the model using the Java reader
        java_model = reader.load(model_path)

        # Wrap the Java model in a Python object
        from pyspark.ml.wrapper import JavaModel
        model = JavaModel(java_model)

        print(f"XGBoost model loaded successfully from {model_path}")
        return model

    except Exception as e:
        print(f"Detailed error loading XGBoost model: {str(e)}")
        print("\nTroubleshooting steps:")
        print("1. Check your PySpark version:")
        print(f"   Current PySpark version: {spark.version}")
        print("2. Verify XGBoost4J-Spark version matches:")
        print("   Required: xgboost4j-spark_2.12-1.7.6")
        print("3. Check jar files:")
        print(f"   Looking in: {os.getcwd()}/resources/")
        print("4. Verify model path exists:")
        print(f"   Path: {model_path}")
        print("5. Environment variables:")
        print(f"   SPARK_HOME: {os.environ.get('SPARK_HOME', 'Not set')}")
        print(f"   JAVA_HOME: {os.environ.get('JAVA_HOME', 'Not set')}")
        return None

def verify_model(model_path):
    """Verify the model file exists and is accessible"""
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model not found at: {model_path}")
    if not os.access(model_path, os.R_OK):
        raise PermissionError(f"No read permission for model at: {model_path}")
    return True

def main():
    try:
        # Initialize Spark
        spark = create_spark_session()

        # Model path
        xgb_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE"

        # Verify model path
        verify_model(xgb_model_path)

        # Load model
        model = load_xgboost_model(spark, xgb_model_path)

        if model is not None:
            # Example usage
            print("Model successfully loaded and ready for predictions")
            return spark, model
        else:
            raise Exception("Model loading failed")

    except Exception as e:
        print(f"Setup failed: {str(e)}")
        return None, None

if __name__ == "__main__":
    spark, model = main()

'''

XGBoost model loaded successfully from /content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE
Model successfully loaded and ready for predictions


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.wrapper import JavaModel

# Comma-separated list of the XGBoost4J jars copied to the local file system
jar_files = "/resources/xgboost4j_2.12-1.7.6.jar,/resources/xgboost4j-spark_2.12-1.7.6.jar"

# Spark settings, applied to the builder in exactly this order
spark_settings = {
    "spark.driver.memory": "150g",
    "spark.executor.memory": "150g",
    "spark.driver.maxResultSize": "50g",
    "spark.executor.memoryOverhead": "50g",
    "spark.executor.cores": "5",
    "spark.kryoserializer.buffer.max": "2047m",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.dynamicAllocation.enabled": "true",
    "spark.sql.shuffle.partitions": "200",
    "spark.hadoop.fs.file.impl": "org.apache.hadoop.fs.LocalFileSystem",
    "spark.executor.extraJavaOptions": "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4 -XX:ParallelGCThreads=4",
    "spark.jars": jar_files,
}

# Build (or reuse) the session with the settings above
session_builder = SparkSession.builder.appName("BoostingModel")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Report the Spark version of the running session
print(f"Spark session started with version: {spark.version}")


Spark session started with version: 3.1.1


In [None]:
from pyspark.ml.wrapper import JavaModel

# Load the previously trained XGBoost regression model through the JVM gateway
xgb_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE"

try:
    # Reach the Scala model class through the py4j gateway of the running session
    jvm = spark.sparkContext._gateway.jvm
    XGBoostRegressionModel = jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel

    # Read the model on the JVM side and wrap the Java object for use from Python
    java_model = XGBoostRegressionModel.read().load(xgb_model_path)
    xgb_model = JavaModel(java_model)
except Exception as e:
    print(f"Error loading XGBoost model: {e}")
    # Re-raise: otherwise `xgb_model` stays undefined (or stale) and a later cell
    # fails with an error that no longer points at the real cause.
    raise
else:
    print(f"XGBoost model loaded successfully from {xgb_model_path}")

XGBoost model loaded successfully from /content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE


In [None]:
# Smoke test: sparkxgb must import and its regressor must be constructible
try:
    from sparkxgb import XGBoostRegressor

    model = XGBoostRegressor()
except Exception as load_error:
    print(f"Error loading sparkxgb: {load_error}")
else:
    print("sparkxgb loaded successfully!")


sparkxgb loaded successfully!


In [None]:
import os
import shutil

# Copy the dataset from the Drive mount to the local disk before reading it.
# shutil raises if the copy fails, whereas a failing `!cp` would let the cell
# continue and read a stale local copy or fail later with a less clear error.
source_path = '/content/drive/MyDrive/Big Data Analytics - Project/Datasets/Feature_Engineered_DF.parquet'
output_path = '/content/Feature_Engineered_DF.parquet'

if os.path.isdir(source_path):
    # A parquet dataset may be a directory of part files
    shutil.copytree(source_path, output_path, dirs_exist_ok=True)
else:
    shutil.copy(source_path, output_path)

df = spark.read.parquet(output_path)
print("The Feature Engineered DataFrame has been loaded successfully.")


The Feature Engineered DataFrame has been loaded successfully.


In [None]:
# Shape of the DataFrame: columns come from the schema, rows need a full count
total_columns = len(df.columns)
total_rows = df.count()

print(f"The shape of the loaded DataFrame is: ({total_rows}, {total_columns})")

The shape of the loaded DataFrame is: (3000040, 47)


In [None]:
# Average of the `price` column, taken from the single row of the aggregation
price_summary = df.agg({"price": "avg"})
avg_price = price_summary.first()[0]
print(f"Average price of a car: {round(avg_price)}")

Average price of a car: 29933


In [None]:
import pandas as pd
from IPython.display import display
import pyspark.sql.functions as F

# Show every column when the pandas frame is rendered
pd.set_option('display.max_columns', None)

# Order the rows randomly, keep five of them and bring them to the driver as pandas
shuffled_df = df.orderBy(F.rand())
pandas_df = shuffled_df.limit(5).toPandas()
display(pandas_df)


Unnamed: 0,fuel_type,body_type,city,city_fuel_economy,days_in_market,dealer_zip,engine_displacement,engine_type,exterior_color,franchise_dealer,fuel_tank_volume,height,highway_fuel_economy,horsepower,interior_color,is_new,latitude,length,listing_color,longitude,make_name,maximum_seating,model_name,price,savings_amount,seller_rating,sp_name,torque,transmission,transmission_display,wheel_system_display,wheelbase,width,manufactured_year,combined_fuel_economy,legroom,log_mileage,major_options_count,hp_x_engine_disp,hp_x_torque,listed_day,listed_month,listed_year,age,resale_value_score,maintenance_cost,luxury_score
0,Gasoline,SUV / Crossover,Morrisville,16.0,33,19067,3500.0,V6,Black,False,18.6,71.0,23.0,290.0,Black,False,40.1964,198.3,BLACK,-74.807701,Ford,7.0,Explorer,24495.0,2297,4.512195,Grace Auto Group,255.0,A,Automatic,All-Wheel Drive,112.8,90.2,2017,19.5,82.4,11.07,22,0.21,-0.04534,8,8,2020,3,22,38,39
1,Gasoline,Sedan,Chillicothe,24.0,168,45601,2400.0,I4,White,True,18.5,57.7,32.0,185.0,Black,True,39.351002,191.1,WHITE,-82.972,Kia,5.0,Optima,25130.0,0,3.625,Herrnstein Chrysler Dodge Jeep Kia,178.0,A,6-Speed Automatic,Front-Wheel Drive,110.4,73.2,2020,28.0,81.1,8.91,4,0.29,0.5712,26,3,2020,0,22,37,31
2,Gasoline,SUV / Crossover,Omaha,16.0,1,68154,5300.0,V8,Blue,True,24.0,75.9,20.0,355.0,Black,True,41.264301,210.7,BLUE,-96.087196,Chevrolet,8.0,Tahoe,73355.0,0,5.0,Huber Chevrolet,460.0,A,Automatic,Four-Wheel Drive,120.9,81.0,2020,18.0,86.5,8.91,12,2.19,2.18888,9,9,2020,0,31,44,38
3,Gasoline,SUV / Crossover,Carrollton,14.0,10,75006,4700.0,V8,Silver,False,26.0,73.2,17.0,282.0,Other,False,32.977901,203.9,SILVER,-96.8451,Toyota,8.0,Sequoia,7995.0,0,4.338235,Dallas Autos Direct,325.0,A,Automatic,Rear-Wheel Drive,118.1,76.4,2005,15.5,80.0,11.55,1,0.52,0.2152,1,9,2020,15,17,31,28
4,Gasoline,Sedan,Rochester Hills,20.0,40,48307,2000.0,I4,Blue,True,18.0,58.1,29.0,245.0,Black,True,42.6381,191.7,BLUE,-83.131897,Ford,5.0,Fusion,20689.0,0,4.5,Serra Ford Rochester Hills,265.22,A,6-Speed Automatic,All-Wheel Drive,112.2,83.5,2020,24.5,82.6,1.1,8,0.02,0.0,31,7,2020,0,32,40,34




---



# **Boosting**

I have successfully implemented a traditional Gradient Boosting approach using multiple decision trees as weak learners, where each tree is trained to predict the residual errors of the ensemble built so far. This iterative process, with a specified number of trees, gradually improves the predictions by reducing the errors in each step.

Now, I am moving on to an alternative approach that combines XGBoost and GBT, where I will first use GBT to generate initial predictions and then leverage XGBoost to refine these predictions by focusing on the residual errors of the GBT model.

In this combined approach, I will first train the GBT Regressor on the data to generate initial predictions. I will then calculate the residuals (the differences between the actual price and the price predicted by the GBT model) and use them as the target for training the XGBoost model. This way, XGBoost will focus on minimizing the residual errors made by the GBT model.

In [None]:
from pyspark.ml.regression import GBTRegressor
from sparkxgb import XGBoostRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from tqdm import tqdm
import time
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

print("Processing the data...")

# Sample data: about 3.3% of the rows (roughly 100k records) for GBT training
df_sample = df.sample(fraction=0.033, seed=42)
cat_columns = [field for (field, dtype) in df_sample.dtypes if dtype == "string"]

# Preprocessing pipeline: index and one-hot encode every string column.
# handleInvalid="keep" lets categories that only occur in the test split pass
# through the indexer instead of raising.
stages = []
for col_name in cat_columns:
    indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed", handleInvalid="keep")
    encoder = OneHotEncoder(inputCol=f"{col_name}_indexed", outputCol=f"{col_name}_encoded")
    stages += [indexer, encoder]

# Assemble features: every non-string column except the target, plus the encoded columns
num_columns = [
    column_name for column_name in df_sample.columns
    if column_name != 'price' and column_name not in cat_columns
]
encoded_columns = [f"{column_name}_encoded" for column_name in cat_columns]
feature_columns = num_columns + encoded_columns
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
stages += [assembler]

# Scale features
# NOTE(review): withMean=True produces dense vectors; with high-cardinality
# one-hot columns this can take a lot of memory - confirm that it is needed.
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
stages += [scaler]

pipeline = Pipeline(stages=stages)

# Split first, then fit the pipeline on the training split only, so that category
# indices and scaling statistics are not learned from the test rows.
train_raw_df, test_raw_df = df_sample.randomSplit([0.8, 0.2], seed=42)
pipeline_model = pipeline.fit(train_raw_df)
train_df = pipeline_model.transform(train_raw_df)
test_df = pipeline_model.transform(test_raw_df)
print("\nData Processing completed !")

Processing the data...

Data Processing completed !


In [None]:
# Train the GBT Regressor on the scaled feature vector with `price` as the label
print("Training GBT Regressor model...")
gbt_regressor = GBTRegressor(
    featuresCol="scaled_features",
    labelCol="price",
    maxIter=100,
    maxDepth=5,
    seed=42,
    stepSize=0.1,
    minInstancesPerNode=10,
    maxBins=50
)


gbt_model = gbt_regressor.fit(train_df)

# Save the trained GBT model. overwrite() replaces a model left by an earlier run;
# a plain save() raises when the target path already exists, which breaks re-runs.
new_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/DecisionTree_Boosting"
gbt_model.write().overwrite().save(new_model_path)

print(f"Model saved successfully at {new_model_path}")

Training GBT Regressor model...
Model saved successfully at /content/drive/MyDrive/Big Data Analytics - Project/models/DecisionTree_Boosting


In [None]:
from pyspark.ml.regression import GBTRegressionModel
from sparkxgb import XGBoostRegressor
import pyspark.sql.functions as F
from pyspark.ml.evaluation import RegressionEvaluator
import gc

# Load the saved GBT model
new_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/DecisionTree_Boosting"
gbt_model = GBTRegressionModel.load(new_model_path)

# Compute the GBT residual for every training row directly on the transformed frame.
# Joining predictions back on `price` (a non-unique value) would pair each row with
# the residuals of all rows sharing its price, multiplying rows and corrupting labels.
train_df_xgb = (
    gbt_model.transform(train_df)
    .withColumn("residual", F.col("price") - F.col("prediction"))
    .drop("prediction")
)
train_df_xgb = train_df_xgb.repartition(20)  # Reduced number of partitions

# Trigger garbage collection to free memory
gc.collect()

# Train XGBoost on residuals with optimized parameters
print("Training XGBoost model on GBT residuals...")

xgb_regressor = XGBoostRegressor(
    featuresCol="scaled_features",
    labelCol="residual",
    maxDepth=6,
    eta=0.05,
    numRound=50,
    subsample=0.8,
    objective="reg:squarederror",
    treeMethod="hist"            # More memory-efficient method
)

xgb_model = xgb_regressor.fit(train_df_xgb)

# train_df_xgb is never persisted, so there is nothing to unpersist here
gc.collect()

# Make predictions with GBT on test data
print("Making predictions with GBT model on test data...")
gbt_predictions_test = (
    gbt_model.transform(test_df)
    .select("price", "scaled_features", "prediction")
    .withColumnRenamed("prediction", "gbt_prediction")
)

# Predict the residual with XGBoost on the same rows. transform() keeps the input
# columns, so `price` and `gbt_prediction` stay aligned with the new prediction.
print("Refining predictions with XGBoost model...")
xgb_predictions = (
    xgb_model.transform(gbt_predictions_test)
    .withColumnRenamed("prediction", "xgb_prediction")
)

# Combine GBT and XGBoost predictions row by row. A join without a condition would
# build the cartesian product of the two frames.
final_predictions = (
    xgb_predictions
    .withColumn("final_prediction", F.col("gbt_prediction") + F.col("xgb_prediction"))
    .select("price", "gbt_prediction", "xgb_prediction", "final_prediction")
    .cache()  # four metrics are computed below; avoid re-running both models each time
)


def _evaluate_final(metric_name):
    """Return one regression metric of `final_prediction` against `price`."""
    metric_evaluator = RegressionEvaluator(
        labelCol="price", predictionCol="final_prediction", metricName=metric_name
    )
    return metric_evaluator.evaluate(final_predictions)


# Evaluate combined model
print("Evaluating the combined model...")
r2 = _evaluate_final("r2")
print(f"\nR-Squared Score (Accuracy): {r2 * 100:.2f}%")

# Additional metrics
mae = _evaluate_final("mae")
mse = _evaluate_final("mse")
rmse = _evaluate_final("rmse")

print("\nAdditional Metrics:")
print(f"Mean Absolute Error: {round(mae)}")
print(f"Mean Squared Error: {round(mse)}")
print(f"Root Mean Squared Error: {round(rmse)}")


Training XGBoost model on GBT residuals...


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1211, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while receiving


Py4JError: An error occurred while calling o2203.fit

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1211, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42241)
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 977, in _get_connection
    connection = self.deque.po

In [None]:
# Inert string literal (nothing here is executed): a scratch log of the
# XGBoostRegressor configurations that were tried. The headings say "METRICS",
# but every entry lists constructor parameters, not evaluation results.
'''
NEW METRICS:
xgb_regressor = XGBoostRegressor(
    featuresCol="scaled_features",
    labelCol="residual",
    maxDepth=6,                               # Reduced depth
    eta=0.05,
    numRound=100,                             # Reduced rounds
    objective="reg:squarederror",
    treeMethod="approx",                      # Approximate method to save memory
)

OLD METRICS:
xgb_regressor = XGBoostRegressor(
    featuresCol="scaled_features",
    labelCol="residual",
    maxDepth=8,
    eta=0.05,
    numRound=200,
    objective="reg:squarederror",
    treeMethod="hist",
)

NEWEST METRICS:
xgb_regressor = XGBoostRegressor(
    featuresCol="scaled_features",
    labelCol="residual",
    maxDepth=6,                           # Reduced depth
    eta=0.05,
    numRound=100,                         # Reduced rounds
    subsample=0.8,                        # Sample 80% of data
    objective="reg:squarederror",
    treeMethod="approx",                  # Approximate method to save memory
)

'''



---



In [None]:
from pyspark.ml.regression import GBTRegressionModel
from sparkxgb import XGBoostRegressionModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F

# Reload the GBT model that was saved earlier
new_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/DecisionTree_Boosting"
gbt_model = GBTRegressionModel.load(new_model_path)

# Score the test split, keeping only the label and the prediction under a GBT-specific name
print("Making predictions with GBT model on test data...")
gbt_predictions_test = gbt_model.transform(test_df).select(
    "price",
    F.col("prediction").alias("gbt_prediction"),
)

# R-squared of the GBT model on its own
evaluator_gbt = RegressionEvaluator(
    metricName="r2",
    labelCol="price",
    predictionCol="gbt_prediction",
)
r2_gbt = evaluator_gbt.evaluate(gbt_predictions_test)
print(f"GBT R-Squared Score: {r2_gbt * 100:.2f}%")

Making predictions with GBT model on test data...
GBT R-Squared Score: 84.76%


In [None]:
from pyspark.ml.wrapper import JavaModel

# Load the pre-trained XGBoost model through the JVM gateway
xgb_model_path = "/content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE"
try:
    # Use a dedicated name for the JVM class handle so that it does not overwrite
    # the `XGBoostRegressionModel` class imported from sparkxgb.
    jvm = spark.sparkContext._gateway.jvm
    jvm_xgb_regression_model_cls = jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel

    # Read the model on the JVM side and wrap the Java object for use from Python
    java_model = jvm_xgb_regression_model_cls.read().load(xgb_model_path)
    xgb_model = JavaModel(java_model)
except Exception as e:
    print(f"Error loading XGBoost model: {e}")
    # Re-raise: continuing would evaluate a stale or undefined `xgb_model`.
    raise
else:
    print(f"XGBoost model loaded successfully from {xgb_model_path}")

# Score the full test frame with GBT so the feature vector is still available
gbt_predictions_test_with_features = gbt_model.transform(test_df)

# Keep the label, the GBT prediction under its own name (so it cannot clash with
# the `prediction` column XGBoost adds) and the feature vector
xgb_input_df = gbt_predictions_test_with_features.select(
    "price",
    F.col("prediction").alias("gbt_prediction"),
    "scaled_features",
)

# Make predictions with XGBoost
xgb_predictions = (
    xgb_model.transform(xgb_input_df)
    .select("price", "gbt_prediction", "prediction")
    .withColumnRenamed("prediction", "xgb_prediction")
)

# Evaluate XGBoost model
# NOTE(review): a strongly negative R-squared here would suggest that the loaded
# model was trained on a different feature vector or target than
# `scaled_features` / `price` - verify how XGB_Regression_model_FE was produced.
xgb_evaluator = RegressionEvaluator(labelCol="price", predictionCol="xgb_prediction", metricName="r2")
xgb_r2 = xgb_evaluator.evaluate(xgb_predictions)
print(f"XGBoost R-Squared Score: {xgb_r2 * 100:.2f}%")


XGBoost model loaded successfully from /content/drive/MyDrive/Big Data Analytics - Project/models/XGB_Regression_model_FE
XGBoost R-Squared Score: -71597.91%


In [None]:
# Combine GBT and XGBoost predictions with weights
weight_gbt = 0.6
weight_xgb = 0.4

# `xgb_predictions` already holds price, gbt_prediction and xgb_prediction for each
# test row, so no join is needed. Joining on `price` (a non-unique value) would
# duplicate rows and leave two `gbt_prediction` columns, making the reference ambiguous.
final_predictions = xgb_predictions.select("price", "gbt_prediction", "xgb_prediction").withColumn(
    "final_prediction",
    F.col("gbt_prediction") * weight_gbt + F.col("xgb_prediction") * weight_xgb
)


def _combined_metric(metric_name):
    """Return one regression metric of the blended prediction against `price`."""
    metric_evaluator = RegressionEvaluator(
        labelCol="price", predictionCol="final_prediction", metricName=metric_name
    )
    return metric_evaluator.evaluate(final_predictions)


# Evaluate combined model
combined_evaluator = RegressionEvaluator(labelCol="price", predictionCol="final_prediction", metricName="r2")
combined_r2 = combined_evaluator.evaluate(final_predictions)
print(f"Combined Model R-Squared Score: {combined_r2 * 100:.2f}%")

# Additional metrics
mae_combined = _combined_metric("mae")
mse_combined = _combined_metric("mse")
rmse_combined = _combined_metric("rmse")

print("\nCombined Model Additional Metrics:")
print(f"Mean Absolute Error: {round(mae_combined)}")
print(f"Mean Squared Error: {round(mse_combined)}")
print(f"Root Mean Squared Error: {round(rmse_combined)}")



---

