In [0]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Initialize Spark session
import os

from pyspark.sql import SparkSession

# S3 credentials must never be written into the notebook source: they would
# end up in version control and in every shared export. They are read from
# the environment instead (populate these variables from your secret store).
S3A_CREDENTIAL_ENV_VARS = {
    "spark.hadoop.fs.s3a.access.key": "AWS_ACCESS_KEY_ID",
    "spark.hadoop.fs.s3a.secret.key": "AWS_SECRET_ACCESS_KEY",
}

builder = (
    SparkSession.builder
    .appName("S3 Integration with Spark")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

# Only set a credential when it is actually provided, so a missing variable
# does not overwrite the cluster's own configuration with an empty value.
for conf_key, env_var in S3A_CREDENTIAL_ENV_VARS.items():
    env_value = os.environ.get(env_var)
    if env_value:
        builder = builder.config(conf_key, env_value)

spark = builder.getOrCreate()

In [0]:
# List the contents of the cleaned taxi table directory before loading it.
dbutils.fs.ls("/dbfs/FileStore/tables/taxi_final_df_cleaned/")

[FileInfo(path='dbfs:/dbfs/FileStore/tables/taxi_final_df_cleaned/_delta_log/', name='_delta_log/', size=0, modificationTime=1732678768304),
 FileInfo(path='dbfs:/dbfs/FileStore/tables/taxi_final_df_cleaned/part-00000-f7cfb25c-248e-4e7a-8318-878e63e93215.c000.snappy.parquet', name='part-00000-f7cfb25c-248e-4e7a-8318-878e63e93215.c000.snappy.parquet', size=13528168, modificationTime=1732631873000),
 FileInfo(path='dbfs:/dbfs/FileStore/tables/taxi_final_df_cleaned/part-00001-a3dd5f50-cb6c-45b9-912f-f00038c89bfd.c000.snappy.parquet', name='part-00001-a3dd5f50-cb6c-45b9-912f-f00038c89bfd.c000.snappy.parquet', size=17000175, modificationTime=1732631873000),
 FileInfo(path='dbfs:/dbfs/FileStore/tables/taxi_final_df_cleaned/part-00002-9397e5ba-9485-49a3-aa1c-ddd2268def29.c000.snappy.parquet', name='part-00002-9397e5ba-9485-49a3-aa1c-ddd2268def29.c000.snappy.parquet', size=17090344, modificationTime=1732631873000),
 FileInfo(path='dbfs:/dbfs/FileStore/tables/taxi_final_df_cleaned/part-00003-03

In [0]:
# Load the cleaned taxi data
# The directory is read with the Delta reader (the listing of this path
# shows a `_delta_log/` folder next to the Parquet part files).
taxi_df_cleaned = spark.read.format("delta").load("/dbfs/FileStore/tables/taxi_final_df_cleaned/")

In [0]:
from pyspark.sql.functions import col
# Cast payment_type to an integer column; it is later listed among the
# numerical features handed to VectorAssembler.
taxi_df_cleaned = taxi_df_cleaned.withColumn("payment_type", col("payment_type").cast("int"))

In [0]:
# Preview the first five rows and print the column types of the loaded table.
display(taxi_df_cleaned.limit(5))
taxi_df_cleaned.printSchema()

passenger_count,payment_type,total_amount,trip_duration,pickup_day_of_week,pickup_hour,pickup_month,pickup_borough,dropoff_borough,is_holiday,distance_bin,time_of_day_bin,near_airport
3,2,14.7,9.866666793823242,2,9,10,Manhattan,Manhattan,1,Short,Morning,0
1,1,30.94,22.61666679382324,2,11,10,Manhattan,Manhattan,1,Short,Morning,0
1,1,26.85,23.13333320617676,2,12,10,Manhattan,Manhattan,1,Medium,Afternoon,0
1,1,21.88,12.333333015441896,2,13,10,Manhattan,Manhattan,1,Short,Afternoon,0
1,2,7.7,1.2999999523162842,2,16,10,Manhattan,Manhattan,1,Short,Afternoon,0


root
 |-- passenger_count: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)
 |-- is_holiday: integer (nullable = true)
 |-- distance_bin: string (nullable = true)
 |-- time_of_day_bin: string (nullable = true)
 |-- near_airport: integer (nullable = true)



In [0]:
# Sample data to a smaller fraction to speed up processing
# A 10% sample with a fixed seed is cached, then split 80/20 into train and
# test sets (again with a fixed seed) so every experiment below sees the
# same rows.
sample_data = taxi_df_cleaned.sample(fraction=0.1, seed=42).cache()
train_data, test_data = sample_data.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Column roles used by every pipeline below:
# string-valued categoricals, numeric features, and the regression label.
categorical_cols = [
    'pickup_borough',
    'dropoff_borough',
    'distance_bin',
    'time_of_day_bin',
]
numerical_cols = [
    'pickup_day_of_week',
    'payment_type',
    'trip_duration',
    'pickup_hour',
    'pickup_month',
    'is_holiday',
    'passenger_count',
    'near_airport',
]
target_col = "total_amount"

## One_Hot_Encoding

In [0]:
# Feature stages for one-hot encoding:
# category string -> index -> one-hot vector, then assemble with the
# numerical columns and scale the assembled vector.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index")
    for name in categorical_cols
]
encoders = [
    OneHotEncoder(
        inputCol=f"{name}_index",
        outputCol=f"{name}_encoded",
        handleInvalid="error",
        dropLast=False,
    )
    for name in categorical_cols
]
assembler = VectorAssembler(
    inputCols=[f"{name}_encoded" for name in categorical_cols] + numerical_cols,
    outputCol="features",
)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

In [0]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
# Candidate regressors, keyed by a short label; all read the scaled feature
# vector and predict the target column.
models = {
    label: estimator_cls(featuresCol="scaledFeatures", labelCol=target_col)
    for label, estimator_cls in (
        ('linear_reg', LinearRegression),
        ('decision_tree', DecisionTreeRegressor),
        ('random_forest', RandomForestRegressor),
        ('gradient_boosting', GBTRegressor),
    )
}

In [0]:
# Evaluation metrics: R² and mean absolute error, each comparing the target
# column with the "prediction" column.
r2_evaluator, mae_evaluator = (
    RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName=metric)
    for metric in ("r2", "mae")
)

In [0]:
# K-fold cross-validation helper for the one-hot encoded pipeline
def cross_validate_model(model_name, model, train_data, test_data):
    """Cross-validate one regressor and score it on the held-out split.

    The pipeline is built from the module-level ``indexers``, ``encoders``,
    ``assembler`` and ``scaler`` followed by ``model``. It is fitted with a
    3-fold ``CrossValidator`` (empty parameter grid, seed 42) that uses
    ``r2_evaluator``.

    Args:
        model_name: Label returned unchanged as the first tuple element.
        model: Estimator appended as the last pipeline stage.
        train_data: DataFrame the cross-validator is fitted on.
        test_data: DataFrame the fitted model is applied to for the MAE.

    Returns:
        ``(model_name, r2, mae)`` where ``r2`` is the cross-validator's first
        average metric (computed on the folds of ``train_data``) and ``mae``
        is ``mae_evaluator`` applied to the predictions on ``test_data``.
        Note that the two numbers therefore come from different data.
    """
    stages = indexers + encoders + [assembler, scaler, model]
    validator = CrossValidator(
        estimator=Pipeline(stages=stages),
        estimatorParamMaps=ParamGridBuilder().build(),
        evaluator=r2_evaluator,
        numFolds=3,
        seed=42,
    )

    fitted = validator.fit(train_data)

    cv_r2 = fitted.avgMetrics[0]
    test_mae = mae_evaluator.evaluate(fitted.transform(test_data))

    return (model_name, float(cv_r2), float(test_mae))

# Evaluate models sequentially
results = []
for name, model in models.items():
    result = cross_validate_model(name, model, train_data, test_data)
    results.append(result)
    print(f"Model: {result[0]}, R²: {result[1]}, MAE: {result[2]}")

# Convert results to a DataFrame and display
schema = StructType([
    StructField("model_name", StringType(), True),
    StructField("r2_score", FloatType(), True),
    StructField("mae", FloatType(), True)
])
results_df = spark.createDataFrame(results, schema=schema)
results_df.show()

# NOTE: `sample_data` is deliberately left cached here. `train_data` and
# `test_data` are splits of it and are reused by the later encoding and
# tuning sections; unpersisting at this point would make every later fit
# re-read and re-sample the source table. Call `sample_data.unpersist()`
# once, after the last experiment in the notebook.

Model: linear_reg, R²: 0.8290222630088123, MAE: 4.006805069400227
Model: decision_tree, R²: 0.840119377749878, MAE: 4.15625552005428
Model: random_forest, R²: 0.8393684764711263, MAE: 3.953835751536056
Model: gradient_boosting, R²: 0.8666322263220215, MAE: 3.273234513119726
+-----------------+----------+---------+
|       model_name|  r2_score|      mae|
+-----------------+----------+---------+
|       linear_reg| 0.8290223| 4.006805|
|    decision_tree|0.84011936|4.1562557|
|    random_forest|0.83936846|3.9538357|
|gradient_boosting| 0.8666322|3.2732346|
+-----------------+----------+---------+



DataFrame[passenger_count: int, payment_type: int, total_amount: float, trip_duration: double, pickup_day_of_week: int, pickup_hour: int, pickup_month: int, pickup_borough: string, dropoff_borough: string, is_holiday: int, distance_bin: string, time_of_day_bin: string, near_airport: int]

## Ordinal_Encoding

In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler

# Define feature transformations
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index") for col in categorical_cols]
assembler = VectorAssembler(inputCols=[col + "_index" for col in categorical_cols] + numerical_cols, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")


In [0]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
# Candidate regressors for the ordinal-encoding experiment, keyed by label
models = dict(
    linear_reg=LinearRegression(featuresCol="scaledFeatures", labelCol=target_col),
    decision_tree=DecisionTreeRegressor(featuresCol="scaledFeatures", labelCol=target_col),
    random_forest=RandomForestRegressor(featuresCol="scaledFeatures", labelCol=target_col),
    gradient_boosting=GBTRegressor(featuresCol="scaledFeatures", labelCol=target_col),
)

In [0]:
# Evaluation metrics (R² and MAE) on the target vs. "prediction" columns
r2_evaluator = RegressionEvaluator(
    labelCol=target_col,
    predictionCol="prediction",
    metricName="r2",
)
mae_evaluator = RegressionEvaluator(
    labelCol=target_col,
    predictionCol="prediction",
    metricName="mae",
)

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# K-fold cross-validation helper for the ordinal-encoded pipeline
def cross_validate_model(model_name, model, train_data, test_data):
    """Cross-validate one regressor on the ordinal-encoded features.

    Stages are the module-level ``indexers``, ``assembler`` and ``scaler``
    followed by ``model``. A 3-fold ``CrossValidator`` with an empty
    parameter grid (no tuning), seed 42 and ``r2_evaluator`` is fitted on
    ``train_data``.

    Returns:
        ``(model_name, r2, mae)``: ``r2`` is the cross-validator's first
        average metric (from the training folds); ``mae`` is
        ``mae_evaluator`` on the predictions for ``test_data``.
    """
    stages = list(indexers)
    stages.extend([assembler, scaler, model])
    ordinal_pipeline = Pipeline(stages=stages)

    # An empty builder yields a single parameter map, i.e. no grid search
    no_tuning_grid = ParamGridBuilder().build()
    validator = CrossValidator(
        estimator=ordinal_pipeline,
        estimatorParamMaps=no_tuning_grid,
        evaluator=r2_evaluator,
        numFolds=3,
        seed=42,
    )

    fitted_cv = validator.fit(train_data)

    test_predictions = fitted_cv.transform(test_data)
    fold_avg_r2 = float(fitted_cv.avgMetrics[0])
    test_mae = float(mae_evaluator.evaluate(test_predictions))

    return (model_name, fold_avg_r2, test_mae)

# Run every candidate through cross-validation, logging progress per model
results = []
for name in models:
    model = models[name]
    print(f"Starting cross-validation for model: {name}")
    result = cross_validate_model(name, model, train_data, test_data)
    print(f"Model: {result[0]}, R²: {result[1]}, MAE: {result[2]}")
    results.append(result)

# Tabulate the (model_name, r2_score, mae) tuples as a Spark DataFrame
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("model_name", StringType()),
        ("r2_score", FloatType()),
        ("mae", FloatType()),
    )
])
results_df = spark.createDataFrame(results, schema=schema)
results_df.show()

# Release any cached copies of the two splits
train_data.unpersist()
test_data.unpersist()

Starting cross-validation for model: linear_reg
Model: linear_reg, R²: 0.81458539050846, MAE: 4.340034381914302
Starting cross-validation for model: decision_tree
Model: decision_tree, R²: 0.8393737547555196, MAE: 4.167838610600192
Starting cross-validation for model: random_forest
Model: random_forest, R²: 0.8400047138865658, MAE: 4.050909483271984
Starting cross-validation for model: gradient_boosting
Model: gradient_boosting, R²: 0.8654929855700676, MAE: 3.315907733149578
+-----------------+----------+---------+
|       model_name|  r2_score|      mae|
+-----------------+----------+---------+
|       linear_reg| 0.8145854|4.3400345|
|    decision_tree|0.83937377|4.1678386|
|    random_forest|0.84000474|4.0509095|
|gradient_boosting|  0.865493|3.3159077|
+-----------------+----------+---------+



DataFrame[passenger_count: int, payment_type: int, total_amount: float, trip_duration: double, pickup_day_of_week: int, pickup_hour: int, pickup_month: int, pickup_borough: string, dropoff_borough: string, is_holiday: int, distance_bin: string, time_of_day_bin: string, near_airport: int]

## Target_Encoding

In [0]:
# Inspect the column types of both splits before adding target-encoded columns.
train_data.printSchema()
test_data.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)
 |-- is_holiday: integer (nullable = true)
 |-- distance_bin: string (nullable = true)
 |-- time_of_day_bin: string (nullable = true)
 |-- near_airport: integer (nullable = true)

root
 |-- passenger_count: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string 

In [0]:
# Inspect the column types of the sampled DataFrame the splits were taken from.
sample_data.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)
 |-- is_holiday: integer (nullable = true)
 |-- distance_bin: string (nullable = true)
 |-- time_of_day_bin: string (nullable = true)
 |-- near_airport: integer (nullable = true)



In [0]:
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# List of categorical columns for target mean encoding
categorical_cols = ['pickup_borough', 'dropoff_borough', 'distance_bin', 'time_of_day_bin']

# Target column
target_col = "total_amount"

# Overall training mean of the target. Used as the encoding for rows whose
# category has no per-category mean (a category absent from the training
# split, or a null category that the join cannot match); otherwise those rows
# would carry nulls into VectorAssembler.
global_target_mean = train_data.agg(F.mean(target_col)).first()[0]

# Dynamically build feature columns list by performing target mean encoding
feature_cols = []

# NOTE(review): the per-category means are computed on the whole training
# split, and the cross-validator later re-splits that same data into folds,
# so each validation fold has contributed to its own encoding. Expect the
# cross-validated score to be somewhat optimistic for this experiment.
#
# The loop variable is named `cat_col` on purpose: a module-level
# `for col in ...` would rebind the `col` function imported from
# pyspark.sql.functions earlier in the notebook.
for cat_col in categorical_cols:
    encoded_col = f"{cat_col}_target_mean"

    # Calculate mean target for each category (training rows only)
    category_means = train_data.groupBy(cat_col).agg(F.mean(target_col).alias(encoded_col))

    # Join the means to train and test datasets. Any earlier copy of the
    # encoded column is dropped first so that re-running this cell does not
    # produce duplicate column names.
    train_data = train_data.drop(encoded_col).join(category_means, on=cat_col, how="left")
    test_data = test_data.drop(encoded_col).join(category_means, on=cat_col, how="left")

    # Fall back to the global mean where no per-category mean was found
    if global_target_mean is not None:
        train_data = train_data.fillna({encoded_col: global_target_mean})
        test_data = test_data.fillna({encoded_col: global_target_mean})

    # Add the new encoded column to the list of feature columns
    feature_cols.append(encoded_col)

numerical_cols = ['pickup_day_of_week','payment_type','trip_duration', 'pickup_hour', 'pickup_month','is_holiday','passenger_count', 'near_airport']
feature_cols.extend(numerical_cols)

# Cache the data for efficiency
train_data.cache()
test_data.cache()

DataFrame[time_of_day_bin: string, distance_bin: string, dropoff_borough: string, pickup_borough: string, passenger_count: int, payment_type: int, total_amount: float, trip_duration: double, pickup_day_of_week: int, pickup_hour: int, pickup_month: int, is_holiday: int, near_airport: int, pickup_borough_target_mean: double, dropoff_borough_target_mean: double, distance_bin_target_mean: double, time_of_day_bin_target_mean: double]

In [0]:
# Assemble the target-mean columns plus the numerical columns (feature_cols)
# into one vector, then scale it. No indexer/encoder stages are needed here
# because the categorical columns were already replaced by numeric means.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

In [0]:
# Candidate regressors for the target-encoding experiment, added one by one
# so the evaluation order is explicit.
models = {}
models['linear_reg'] = LinearRegression(featuresCol="scaledFeatures", labelCol=target_col)
models['decision_tree'] = DecisionTreeRegressor(featuresCol="scaledFeatures", labelCol=target_col)
models['random_forest'] = RandomForestRegressor(featuresCol="scaledFeatures", labelCol=target_col)
models['gradient_boosting'] = GBTRegressor(featuresCol="scaledFeatures", labelCol=target_col)

In [0]:
# R² and MAE evaluators, built from the same label/prediction column pair
r2_evaluator, mae_evaluator = [
    RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName=metric_name)
    for metric_name in ("r2", "mae")
]

In [0]:
# Cross-validation helper for the target-encoded features
def cross_validate_model(model_name, model, train_data, test_data):
    """Cross-validate one regressor on pre-encoded (target-mean) features.

    The pipeline contains only the module-level ``assembler`` and ``scaler``
    followed by ``model``; the inputs are expected to already carry the
    columns the assembler reads. Fitting uses a 3-fold ``CrossValidator``
    with an empty parameter grid, seed 42 and ``r2_evaluator``.

    Returns:
        ``(model_name, r2, mae)``: ``r2`` is the cross-validator's first
        average metric over the folds of ``train_data``; ``mae`` is
        ``mae_evaluator`` on the predictions for ``test_data``.
    """
    validator = CrossValidator(
        estimator=Pipeline(stages=[assembler, scaler, model]),
        estimatorParamMaps=ParamGridBuilder().build(),  # single empty map: no tuning
        evaluator=r2_evaluator,
        numFolds=3,
        seed=42,
    )
    fitted = validator.fit(train_data)

    scored_test = fitted.transform(test_data)
    summary = (
        model_name,
        float(fitted.avgMetrics[0]),
        float(mae_evaluator.evaluate(scored_test)),
    )
    return summary

# Cross-validate each candidate on the target-encoded data
results = []
for name, model in models.items():
    print(f"Starting cross-validation for model: {name}")
    result = cross_validate_model(name, model, train_data, test_data)
    results += [result]
    print(f"Model: {result[0]}, R²: {result[1]}, MAE: {result[2]}")

# Tabulate the collected (model_name, r2_score, mae) tuples
schema = (
    StructType()
    .add(StructField("model_name", StringType(), True))
    .add(StructField("r2_score", FloatType(), True))
    .add(StructField("mae", FloatType(), True))
)
results_df = spark.createDataFrame(results, schema=schema)
results_df.show()

# Drop the cached target-encoded splits
train_data.unpersist()
test_data.unpersist()

Starting cross-validation for model: linear_reg
Model: linear_reg, R²: 0.8254043745319345, MAE: 4.093160624739086
Starting cross-validation for model: decision_tree
Model: decision_tree, R²: 0.8401205134193733, MAE: 4.156255520054801
Starting cross-validation for model: random_forest
Model: random_forest, R²: 0.8416565814772712, MAE: 4.0155203402921895
Starting cross-validation for model: gradient_boosting
Model: gradient_boosting, R²: 0.8667986160797772, MAE: 3.262267981062468
+-----------------+----------+---------+
|       model_name|  r2_score|      mae|
+-----------------+----------+---------+
|       linear_reg|0.82540435|4.0931606|
|    decision_tree| 0.8401205|4.1562557|
|    random_forest|0.84165657|4.0155206|
|gradient_boosting|0.86679864| 3.262268|
+-----------------+----------+---------+



DataFrame[time_of_day_bin: string, distance_bin: string, dropoff_borough: string, pickup_borough: string, passenger_count: int, payment_type: int, total_amount: float, trip_duration: double, pickup_day_of_week: int, pickup_hour: int, pickup_month: int, is_holiday: int, near_airport: int, pickup_borough_target_mean: double, dropoff_borough_target_mean: double, distance_bin_target_mean: double, time_of_day_bin_target_mean: double]

## One_Hot_Encoding_with_PCA

In [0]:
# Rebuild the one-hot feature stages for the PCA experiment:
# index each categorical column, one-hot encode the index, assemble the
# encoded vectors with the numerical columns, and scale the result.
indexers = [
    StringIndexer(inputCol=c, outputCol="{}_index".format(c))
    for c in categorical_cols
]
encoders = [
    OneHotEncoder(
        inputCol="{}_index".format(c),
        outputCol="{}_encoded".format(c),
        handleInvalid="error",
        dropLast=False,
    )
    for c in categorical_cols
]
assembler = VectorAssembler(
    inputCols=["{}_encoded".format(c) for c in categorical_cols] + numerical_cols,
    outputCol="features",
)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, PCA

# Apply PCA after scaling
# Reads the scaled feature vector and writes a 10-component projection to
# "pcaFeatures", which the models in this section use as their feature column.
pca = PCA(k=10, inputCol="scaledFeatures", outputCol="pcaFeatures")  # Adjust k to the desired number of principal components

In [0]:
# Candidate regressors for the PCA experiment; every one reads `pcaFeatures`
models = {
    label: estimator_cls(featuresCol="pcaFeatures", labelCol=target_col)
    for label, estimator_cls in {
        'linear_reg': LinearRegression,
        'decision_tree': DecisionTreeRegressor,
        'random_forest': RandomForestRegressor,
        'gradient_boosting': GBTRegressor,
    }.items()
}

In [0]:
# Metrics used below: R² drives cross-validation, MAE is reported on the test split
mae_evaluator = RegressionEvaluator(
    metricName="mae", labelCol=target_col, predictionCol="prediction"
)
r2_evaluator = RegressionEvaluator(
    metricName="r2", labelCol=target_col, predictionCol="prediction"
)

In [0]:
# K-fold cross-validation helper for the one-hot + PCA pipeline
def cross_validate_model(model_name, model, train_data, test_data):
    """Cross-validate one regressor on PCA-reduced one-hot features.

    Stages, in order: the module-level ``indexers``, ``encoders``,
    ``assembler``, ``scaler`` and ``pca``, then ``model``. The pipeline is
    fitted with a 3-fold ``CrossValidator`` (empty parameter grid, seed 42)
    scored by ``r2_evaluator``.

    Returns:
        ``(model_name, r2, mae)``: ``r2`` is the cross-validator's first
        average metric over the folds of ``train_data``; ``mae`` is
        ``mae_evaluator`` on the predictions for ``test_data``.
    """
    pca_pipeline = Pipeline(
        stages=[*indexers, *encoders, assembler, scaler, pca, model]
    )

    # No hyperparameter tuning: the grid holds one empty parameter map
    grid = ParamGridBuilder().build()
    validator = CrossValidator(
        estimator=pca_pipeline,
        estimatorParamMaps=grid,
        evaluator=r2_evaluator,
        numFolds=3,
        seed=42,
    )
    fitted_cv = validator.fit(train_data)

    avg_fold_r2 = fitted_cv.avgMetrics[0]
    holdout_mae = mae_evaluator.evaluate(fitted_cv.transform(test_data))
    return (model_name, float(avg_fold_r2), float(holdout_mae))

# Cross-validate each PCA-based candidate and collect its scores
results = []
for name, model in models.items():
    print(f"Starting cross-validation for model: {name}")
    result = cross_validate_model(name, model, train_data, test_data)
    results.append(result)
    print(f"Model: {result[0]}, R²: {result[1]}, MAE: {result[2]}")

# Present the (model_name, r2_score, mae) tuples as a DataFrame
schema = StructType(
    [
        StructField("model_name", StringType(), True),
        StructField("r2_score", FloatType(), True),
        StructField("mae", FloatType(), True),
    ]
)
results_df = spark.createDataFrame(results, schema=schema)
results_df.show()

# Release any cached copies of the two splits
train_data.unpersist()
test_data.unpersist()

Starting cross-validation for model: linear_reg
Model: linear_reg, R²: 0.7267091456133968, MAE: 5.4649227481979885
Starting cross-validation for model: decision_tree
Model: decision_tree, R²: 0.7985795569206094, MAE: 4.454893242366173
Starting cross-validation for model: random_forest
Model: random_forest, R²: 0.8015923178604, MAE: 4.475081132878776
Starting cross-validation for model: gradient_boosting
Model: gradient_boosting, R²: 0.841954093304136, MAE: 3.7819059343806676
+-----------------+---------+--------+
|       model_name| r2_score|     mae|
+-----------------+---------+--------+
|       linear_reg|0.7267091|5.464923|
|    decision_tree|0.7985796|4.454893|
|    random_forest|0.8015923|4.475081|
|gradient_boosting|0.8419541|3.781906|
+-----------------+---------+--------+



DataFrame[time_of_day_bin: string, distance_bin: string, dropoff_borough: string, pickup_borough: string, passenger_count: int, payment_type: int, total_amount: float, trip_duration: double, pickup_day_of_week: int, pickup_hour: int, pickup_month: int, is_holiday: int, near_airport: int, pickup_borough_target_mean: double, dropoff_borough_target_mean: double, distance_bin_target_mean: double, time_of_day_bin_target_mean: double]

## Hyperparameter Tuning

In [0]:
from pyspark.ml.regression import LinearRegression

# Linear Regression estimator whose regParam / elasticNetParam are tuned below
linear_reg_model = LinearRegression(featuresCol="scaledFeatures", labelCol=target_col)

print("Processing Linear Regression...")

# Index -> one-hot -> assemble -> scale -> regress. The setters re-apply the
# column names the downstream stages read, on the shared assembler/scaler.
linear_reg_pipeline = Pipeline(
    stages=indexers
    + encoders
    + [
        assembler.setOutputCol("features"),
        scaler.setInputCol("features").setOutputCol("scaledFeatures"),
        linear_reg_model,
    ]
)

# 3 x 3 grid over regularisation strength and elastic-net mixing
param_grid_lr = (
    ParamGridBuilder()
    .addGrid(linear_reg_model.regParam, [0.1, 0.01, 0.001])
    .addGrid(linear_reg_model.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# 3-fold cross-validation scored by R²
cv_lr = CrossValidator(
    estimator=linear_reg_pipeline,
    estimatorParamMaps=param_grid_lr,
    evaluator=r2_evaluator,
    numFolds=3,
    seed=42,
)
cv_lr_model = cv_lr.fit(train_data)

# Score the winning pipeline on the held-out split
lr_best_model = cv_lr_model.bestModel
lr_predictions = lr_best_model.transform(test_data)
lr_r2, lr_mae = (
    evaluator.evaluate(lr_predictions)
    for evaluator in (r2_evaluator, mae_evaluator)
)

# Record the test-set scores alongside the earlier results
results.append(("Linear Regression", lr_r2, lr_mae))


Processing Linear Regression...


In [0]:
# Access the best Linear Regression model from the pipeline
# (the regressor is the last stage of the fitted pipeline).
best_lr_model = lr_best_model.stages[-1]

# Retrieve the best parameters for Linear Regression
# These are the two hyperparameters that were searched in the grid.
best_reg_param_lr = best_lr_model.getRegParam()
best_elastic_net_param_lr = best_lr_model.getElasticNetParam()

# Print the best Linear Regression parameters
print(f"Best Linear Regression regParam: {best_reg_param_lr}")
print(f"Best Linear Regression elasticNetParam: {best_elastic_net_param_lr}")


Best Linear Regression regParam: 0.001
Best Linear Regression elasticNetParam: 0.0


In [0]:
from pyspark.ml.regression import DecisionTreeRegressor

# Decision Tree estimator whose maxDepth / maxBins are tuned below
decision_tree_model = DecisionTreeRegressor(featuresCol="scaledFeatures", labelCol=target_col)

print("Processing Decision Tree...")

# Pipeline without the PCA stage: index -> one-hot -> assemble -> scale -> tree
decision_tree_pipeline = Pipeline(
    stages=indexers
    + encoders
    + [
        assembler.setOutputCol("features"),
        scaler.setInputCol("features").setOutputCol("scaledFeatures"),
        decision_tree_model,
    ]
)

# 3 x 2 grid over tree depth and number of bins
param_grid_dt = (
    ParamGridBuilder()
    .addGrid(decision_tree_model.maxDepth, [5, 10, 20])
    .addGrid(decision_tree_model.maxBins, [32, 64])
    .build()
)

# 2-fold cross-validation scored by R²
cv_dt = CrossValidator(
    estimator=decision_tree_pipeline,
    estimatorParamMaps=param_grid_dt,
    evaluator=r2_evaluator,
    numFolds=2,
    seed=42,
)
cv_dt_model = cv_dt.fit(train_data)

# Score the winning pipeline on the held-out split
dt_best_model = cv_dt_model.bestModel
dt_predictions = dt_best_model.transform(test_data)
dt_r2 = r2_evaluator.evaluate(dt_predictions)
dt_mae = mae_evaluator.evaluate(dt_predictions)

# Record and report the test-set scores
results.append(("Decision Tree", dt_r2, dt_mae))
print(f"Decision Tree - R²: {dt_r2:.4f}, MAE: {dt_mae:.4f}")

# Release any cached copies of the two splits
train_data.unpersist()
test_data.unpersist()

Processing Decision Tree...
Decision Tree - R²: 0.8744, MAE: 3.0984


DataFrame[time_of_day_bin: string, distance_bin: string, dropoff_borough: string, pickup_borough: string, passenger_count: int, payment_type: int, total_amount: float, trip_duration: double, pickup_day_of_week: int, pickup_hour: int, pickup_month: int, is_holiday: int, near_airport: int, pickup_borough_target_mean: double, dropoff_borough_target_mean: double, distance_bin_target_mean: double, time_of_day_bin_target_mean: double]

In [0]:
# Access the best hyperparameters from the cross-validation.
# The fitted tree is the last stage of the best pipeline; its parameters are
# read through the public getters (the same way the Linear Regression and
# Random Forest cells do) instead of the private `_java_obj` handle.
best_dt_model = dt_best_model.stages[-1]
best_max_depth = best_dt_model.getMaxDepth()
best_max_bins = best_dt_model.getMaxBins()

# Print the best hyperparameters
print(f"Best maxDepth: {best_max_depth}")
print(f"Best maxBins: {best_max_bins}")

Best maxDepth: 10
Best maxBins: 64


In [0]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml import Pipeline

# Random Forest estimator (fixed seed) whose maxDepth / numTrees are tuned below
random_forest_model = RandomForestRegressor(featuresCol="scaledFeatures", labelCol=target_col, seed=42)

print("Processing Random Forest Regressor...")

# Pipeline without the PCA stage: index -> one-hot -> assemble -> scale -> forest
random_forest_pipeline = Pipeline(
    stages=indexers
    + encoders
    + [
        assembler.setOutputCol("features"),
        scaler.setInputCol("features").setOutputCol("scaledFeatures"),
        random_forest_model,
    ]
)

# Small 2 x 2 grid over tree depth and ensemble size
param_grid_rf = (
    ParamGridBuilder()
    .addGrid(random_forest_model.maxDepth, [5, 10])
    .addGrid(random_forest_model.numTrees, [10, 25])
    .build()
)

# A single 80/20 train/validation split (instead of k-fold), scored by R²
train_val_rf = TrainValidationSplit(
    estimator=random_forest_pipeline,
    estimatorParamMaps=param_grid_rf,
    evaluator=r2_evaluator,
    trainRatio=0.8,
    seed=42,
)
cv_rf_model = train_val_rf.fit(train_data)

# Score the winning pipeline on the held-out split
rf_best_model = cv_rf_model.bestModel
rf_predictions = rf_best_model.transform(test_data)
rf_r2, rf_mae = (
    evaluator.evaluate(rf_predictions)
    for evaluator in (r2_evaluator, mae_evaluator)
)

# Report, then record, the test-set scores
print(f"Random Forest Regressor - R²: {rf_r2:.4f}, MAE: {rf_mae:.4f}")
results.append(("Random Forest Regressor", rf_r2, rf_mae))

# Release any cached copies of the two splits
train_data.unpersist()
test_data.unpersist()

Processing Random Forest Regressor...
Random Forest Regressor - R²: 0.8760, MAE: 3.1299


DataFrame[passenger_count: int, payment_type: int, total_amount: float, trip_duration: double, pickup_day_of_week: int, pickup_hour: int, pickup_month: int, pickup_borough: string, dropoff_borough: string, is_holiday: int, distance_bin: string, time_of_day_bin: string, near_airport: int]

In [0]:
import os

# Define the model save path in Databricks' file system
# NOTE(review): earlier in this notebook, dbutils.fs.ls("/dbfs/FileStore/...")
# resolved to "dbfs:/dbfs/FileStore/...". If the ML writer resolves paths the
# same way, this model is stored under dbfs:/dbfs/tmp/... rather than on the
# driver's local disk, despite the "saved locally" message below — confirm.
local_path = "/dbfs/tmp/random_forest_model"

# Save the best Random Forest model
# `rf_best_model` is the whole fitted pipeline, so the preprocessing stages
# are saved together with the forest; any existing save at the path is overwritten.
rf_best_model.write().overwrite().save(local_path)

print(f"Model saved locally at {local_path}")

Model saved locally at /dbfs/tmp/random_forest_model


In [0]:
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel

# Example single query input data
# One row with twelve values, in the same order as `columns` below.
single_query = [
    (1, 1, 20.37, 2, 8, 10, "Manhattan", "Brooklyn", 0, "Medium", "Morning", 0)
]

# Define the schema of your input data based on the model's feature columns
# The target column (total_amount) is intentionally absent: this row is an
# input to score, not a labelled example.
columns = [
    "passenger_count", "payment_type", "trip_duration", 
    "pickup_day_of_week", "pickup_hour", "pickup_month", "pickup_borough", 
    "dropoff_borough", "is_holiday", "distance_bin", "time_of_day_bin", "near_airport"
]

# Initialize SparkSession
# NOTE(review): this rebinds the notebook-wide `spark` name; getOrCreate()
# is expected to hand back the already-running session — confirm.
spark = SparkSession.builder.appName("ModelPrediction").getOrCreate()

# Convert the input query to a DataFrame
# Column types are inferred from the Python values in the tuple.
test_data_df = spark.createDataFrame(single_query, columns)

# Show the test data to verify the schema
# This cell only builds and displays the row; no prediction is made here.
print("Test Data:")
test_data_df.show()

Test Data:
+---------------+------------+-------------+------------------+-----------+------------+--------------+---------------+----------+------------+---------------+------------+
|passenger_count|payment_type|trip_duration|pickup_day_of_week|pickup_hour|pickup_month|pickup_borough|dropoff_borough|is_holiday|distance_bin|time_of_day_bin|near_airport|
+---------------+------------+-------------+------------------+-----------+------------+--------------+---------------+----------+------------+---------------+------------+
|              1|           1|        20.37|                 2|          8|          10|     Manhattan|       Brooklyn|         0|      Medium|        Morning|           0|
+---------------+------------+-------------+------------------+-----------+------------+--------------+---------------+----------+------------+---------------+------------+



In [0]:
# Access the best Random Forest model from the pipeline
# (the forest is the last stage of the fitted pipeline).
best_rf_model = rf_best_model.stages[-1]

# Retrieve the best parameters for Random Forest
# getMaxDepth is a regular getter and is called with parentheses.
best_max_depth_rf = best_rf_model.getMaxDepth()
# getNumTrees is read WITHOUT parentheses here.
# NOTE(review): this relies on PySpark exposing getNumTrees as a property on
# fitted tree-ensemble models — confirm against the installed PySpark version.
best_num_trees_rf = best_rf_model.getNumTrees

# Print the best Random Forest parameters
print(f"Best Random Forest maxDepth: {best_max_depth_rf}")
print(f"Best Random Forest numTrees: {best_num_trees_rf}")
print(f"Best Random Forest: {best_rf_model}")

In [0]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# Gradient Boosted Tree regressor: reads the scaled feature vector and
# predicts `target_col`; the seed is fixed for repeatable runs.
gradient_boosting_model = GBTRegressor(
    featuresCol="scaledFeatures",
    labelCol=target_col,
    seed=42,
)

print("Processing Gradient Boosted Tree Regressor...")

# Keep both splits cached while they are reused during tuning and evaluation.
train_data.cache()
test_data.cache()

# Wire the shared assembler and scaler: raw vector -> "features" ->
# "scaledFeatures". These setters modify the shared stage objects in place.
assembler.setOutputCol("features")
scaler.setInputCol("features").setOutputCol("scaledFeatures")

# Pipeline order: indexers, encoders, assembler, scaler, then the regressor.
gbt_stages = [*indexers, *encoders, assembler, scaler, gradient_boosting_model]
gradient_boosting_pipeline = Pipeline(stages=gbt_stages)

# Small search grid: 2 values of maxIter x 2 values of maxDepth.
param_grid_gbt = (
    ParamGridBuilder()
    .addGrid(gradient_boosting_model.maxIter, [10, 25])
    .addGrid(gradient_boosting_model.maxDepth, [3, 5])
    .build()
)

# Single train/validation split (80% train), scored with the R² evaluator.
tvs_settings = {
    "estimator": gradient_boosting_pipeline,
    "estimatorParamMaps": param_grid_gbt,
    "evaluator": r2_evaluator,
    "trainRatio": 0.8,
    "seed": 42,
}
train_val_gbt = TrainValidationSplit(**tvs_settings)

# Fit every grid point. Despite the `cv_` prefix, this object comes from
# TrainValidationSplit, not from CrossValidator.
cv_gbt_model = train_val_gbt.fit(train_data)

# Best fitted pipeline found by the search.
gbt_best_model = cv_gbt_model.bestModel

# Score the held-out test split with the best pipeline.
gbt_predictions = gbt_best_model.transform(test_data)

# Compute R² and MAE on the test predictions.
gbt_r2 = r2_evaluator.evaluate(gbt_predictions)
gbt_mae = mae_evaluator.evaluate(gbt_predictions)

print(f"Gradient Boosted Tree Regressor - R²: {gbt_r2:.4f}, MAE: {gbt_mae:.4f}")

# Record the metrics alongside the results collected for the other models.
results.append(("Gradient Boosted Tree Regressor", gbt_r2, gbt_mae))

# Release the cached splits now that training and evaluation are finished.
train_data.unpersist()
test_data.unpersist()

Processing Gradient Boosted Tree Regressor...
Gradient Boosted Tree Regressor - R²: 0.8723, MAE: 3.2086


DataFrame[passenger_count: int, payment_type: int, total_amount: float, trip_duration: double, pickup_day_of_week: int, pickup_hour: int, pickup_month: int, pickup_borough: string, dropoff_borough: string, is_holiday: int, distance_bin: string, time_of_day_bin: string, near_airport: int]

In [0]:
# The fitted GBT regressor is the final stage of the best pipeline chosen
# by the train/validation search.
best_gbt_model = cv_gbt_model.bestModel.stages[-1]

# Read back the two hyperparameters that were part of the search grid.
best_max_iter_gbt, best_max_depth_gbt = (
    best_gbt_model.getMaxIter(),
    best_gbt_model.getMaxDepth(),
)

# Report the selected values, one per line.
print(
    f"Best Gradient Boosted Tree maxIter: {best_max_iter_gbt}",
    f"Best Gradient Boosted Tree maxDepth: {best_max_depth_gbt}",
    sep="\n",
)

In [0]:
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, FloatType

import json

# Best hyperparameters selected for each model, keyed by display name.
best_params = {
    "Linear Regression": {
        "regParam": best_reg_param_lr,
        "elasticNetParam": best_elastic_net_param_lr
    },
    "Decision Tree": {
        "maxDepth": best_max_depth,
        "maxBins": best_max_bins
    },
    "Random Forest": {
        "maxDepth": best_max_depth_rf,
        "numTrees": best_num_trees_rf
    },
    "Gradient Boosted Tree": {
        "maxIter": best_max_iter_gbt,
        "maxDepth": best_max_depth_gbt
    }
}

# Test-set metrics per model as (R², MAE); keys match `best_params`.
model_metrics = {
    "Linear Regression": (lr_r2, lr_mae),
    "Decision Tree": (dt_r2, dt_mae),
    "Random Forest": (rf_r2, rf_mae),
    "Gradient Boosted Tree": (gbt_r2, gbt_mae),
}

# One Row per model. The `hyperparameters` column is declared as a string in
# the schema below, so each dict is serialised to JSON text explicitly rather
# than handing Spark a raw dict and depending on how it stringifies objects.
# `default=str` keeps serialisation from failing on a non-JSON-native value.
results = [
    Row(
        model_name=model_name,
        hyperparameters=json.dumps(best_params[model_name], default=str),
        r2_score=r2_value,
        mae=mae_value,
    )
    for model_name, (r2_value, mae_value) in model_metrics.items()
]

# Schema of the comparison table: two string columns and two float metrics.
schema = StructType([
    StructField("model_name", StringType(), True),
    StructField("hyperparameters", StringType(), True),
    StructField("r2_score", FloatType(), True),
    StructField("mae", FloatType(), True)
])

# Convert the rows into a DataFrame using the explicit schema.
results_df = spark.createDataFrame(results, schema=schema)

# Show the full comparison table without truncating the JSON strings.
results_df.show(truncate=False)

# Release any cached copies of the splits; this cell itself does not cache them.
train_data.unpersist()
test_data.unpersist()

In [0]:
import matplotlib.pyplot as plt

# Test-set metrics for every trained model, keyed by display name.
models_results = {
    "Linear Regression": {"r2": lr_r2, "mae": lr_mae},
    "Decision Tree": {"r2": dt_r2, "mae": dt_mae},
    "Random Forest": {"r2": rf_r2, "mae": rf_mae},
    "Gradient Boosted Tree": {"r2": gbt_r2, "mae": gbt_mae}
}

# Split the dictionary into parallel sequences: labels, R² values, MAE values.
models = list(models_results)
r2_scores = [scores["r2"] for scores in models_results.values()]
mae_scores = [scores["mae"] for scores in models_results.values()]

# Two panels side by side: R² on the left, MAE on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Each entry: (target axes, bar heights, bar colour, panel title, y-axis label).
panel_specs = (
    (ax[0], r2_scores, 'green', "R² Scores", "R²"),
    (ax[1], mae_scores, 'red', "MAE Scores", "MAE"),
)

for panel_ax, bar_heights, bar_color, panel_title, y_label in panel_specs:
    panel_ax.bar(models, bar_heights, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(y_label)
    # Dashed horizontal gridlines make the bar heights easier to compare.
    panel_ax.grid(True, axis='y', linestyle='--', alpha=0.7)

# Tighten spacing so titles and labels do not overlap.
plt.tight_layout()

# Render the figure.
plt.show()