# Importing libraries and initializing spark session

### Creating spark session and reading data from previous pipeline

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
import os

# Reuse the active Spark session if there is one; otherwise start a new one
# named "Models". Shuffle stages are configured to use 20 partitions.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", 20)
    .appName("Models")
    .getOrCreate()
)

# GCS locations of the train / test Parquet datasets read in the next cell.
train, test = (
    "gs://bucket121024/pipeline2/train_data.parquet",
    "gs://bucket121024/pipeline2/test_data.parquet",
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/21 21:27:11 INFO SparkEnv: Registering MapOutputTracker
24/12/21 21:27:11 INFO SparkEnv: Registering BlockManagerMaster
24/12/21 21:27:11 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/12/21 21:27:11 INFO SparkEnv: Registering OutputCommitCoordinator


# Reading citibike combined data

### Efficient Parallelism:

### 80 partitions for training data distribute the workload evenly across 20 cores.
### 40 partitions for testing data minimize task scheduling overhead.

In [2]:

# Load the train / test Parquet datasets, keep a 60% random sample of each
# (without replacement), then repartition to 80 (train) and 40 (test) partitions.
#
# The sampling seed is pinned: without it every run draws a different subset,
# so the metrics printed further down could not be reproduced. With a fixed
# seed the same rows are selected as long as the input files are unchanged.
train_data = (
    spark.read.parquet(train)
    .sample(withReplacement=False, fraction=0.6, seed=42)
    .repartition(80)
)
test_data = (
    spark.read.parquet(test)
    .sample(withReplacement=False, fraction=0.6, seed=42)
    .repartition(40)
)

                                                                                

In [3]:
# Input columns fed to the VectorAssembler, in the order they appear in the
# assembled feature vector. The line grouping below follows the apparent
# meaning of the column names and has no effect on behaviour.
feature_cols = [
    "day_of_week", "month", "hour", "year",
    "temp", "humidity", "precip", "windspeed", "visibility",
    "is_weekend", "is_lockdown",
    "startstationname_indexed",
    "humidity_is_weekend", "temp_is_weekend",
    "hour_bucket_indexed",
    "rolling_avg_demand", "lag_demand_1",
]

* `VectorAssembler`
Combines multiple input features into a single vector column (features), required for ML models.
* `StandardScaler`
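Rescales each feature in the vector to unit standard deviation. Note: with PySpark's default settings (`withMean=False`, `withStd=True`), which this notebook uses, the mean is **not** removed.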

In [4]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

# Stage 1: pack the columns listed in `feature_cols` into one vector column, "features".
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# Stage 2: scale "features" into "scaledFeatures".
# withMean / withStd are spelled out at their PySpark default values so it is
# visible that features are scaled to unit standard deviation but NOT centred.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)


In [5]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import TrainValidationSplit

# Decision tree regressor: reads the scaled feature vector and predicts `demand`.
dt = DecisionTreeRegressor(labelCol="demand", featuresCol="scaledFeatures")

# Estimator chain fitted as one unit: assemble features -> scale -> decision tree.
pipeline = Pipeline(stages=[assembler, scaler, dt])

### Hyperparameter Tuning with `TrainValidationSplit`

To optimize the performance of the Decision Tree Regressor, we implemented hyperparameter tuning using `TrainValidationSplit`.

#### **1. Define the Parameter Grid**
We specify the hyperparameters to tune:
- **`maxDepth`:** Maximum depth of the tree (values: 3, 5).
- **`minInstancesPerNode`:** Minimum number of samples per tree node (values: 1, 2).
- **`maxBins`:** Number of bins for continuous feature discretization (value: 16; only a single value is included in the grid below).



In [6]:
# Mark the training DataFrame for caching: it is passed to every
# TrainValidationSplit.fit(...) call below, so it is reused many times.
# cache() is lazy; the data is materialized by the first action that uses it.
train_data.cache()

DataFrame[date: date, day_of_week: int, month: int, hour: int, year: int, startstationname: string, demand: bigint, datetime: timestamp, temp: double, humidity: double, precip: double, windspeed: double, visibility: double, conditions: string, is_weekend: int, is_lockdown: int, conditions_indexed: double, startstationname_indexed: double, lag_demand_1: bigint, rolling_avg_demand: double, hour_bucket: string, hour_bucket_indexed: double, temp_is_weekend: double, humidity_is_weekend: double]

In [7]:
%%time

# Hyperparameter grid for the decision tree: 2 x 2 x 1 = 4 candidate settings.
param_grid = (ParamGridBuilder()
              .addGrid(dt.maxDepth, [3, 5])
              .addGrid(dt.minInstancesPerNode, [1, 2])
              .addGrid(dt.maxBins, [16])
              .build())

# Model selection metric: RMSE on the held-out validation part of the split.
evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# 80% of train_data is used for fitting, 20% for validation.
# The seed is pinned so the split (and therefore the selected model and the
# metrics printed below) is the same on every Restart & Run All; PySpark's
# default seed is not stable across Python sessions.
train_val_split = TrainValidationSplit(estimator=pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=evaluator,
                                       trainRatio=0.8,
                                       seed=42)

tv_model = train_val_split.fit(train_data)

# Best pipeline according to validation RMSE, scored on the test set.
best_model = tv_model.bestModel
predictions = best_model.transform(test_data)

# Test-set metrics for the selected model.
rmse_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)

r2_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2")
r2 = r2_evaluator.evaluate(predictions)

mae_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(predictions)

mse_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse")
mse = mse_evaluator.evaluate(predictions)

# Print all metrics
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-Squared (R2): {r2}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")


24/12/21 19:41:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

Root Mean Squared Error (RMSE): 2.3371475155200025
R-Squared (R2): 0.7243980320844088
Mean Absolute Error (MAE): 1.5108403626289202
Mean Squared Error (MSE): 5.462258509301319
CPU times: user 766 ms, sys: 218 ms, total: 983 ms
Wall time: 6min 11s


                                                                                

In [8]:
#predictions.write.mode("overwrite").parquet("gs://bucket121024/pipeline3/5_DT.parquet")

# Random Forest Regressor
## Hyperparameter Tuning with `ParamGridBuilder`

The `ParamGridBuilder` is used to define a grid of hyperparameters for tuning the Random Forest Regressor. In this case, two hyperparameters are tuned:

### Explanation of Parameters:
1. **`numTrees`**:
   - The number of decision trees in the Random Forest.
   - Tested values: `50`, `100`.

2. **`maxDepth`**:
   - The maximum depth of each tree, which controls the complexity of the model.
   - Tested values: `5`, `10`.

### Total Combinations:
The total combinations of hyperparameters are:
$$
\text{Total Combinations} = \text{len(numTrees)} \times \text{len(maxDepth)} = 2 \times 2 = 4
$$

Each combination is evaluated during training to select the best-performing model based on validation metrics.


In [9]:
%%time

from pyspark.ml.regression import RandomForestRegressor

# Random forest on the scaled feature vector, predicting `demand`.
# The seed is pinned so the forest's internal randomness is the same on every
# run; without it the fitted model changes between kernel sessions.
rf = RandomForestRegressor(featuresCol="scaledFeatures", labelCol="demand", seed=42)

# Same preprocessing stages as the other models, with the forest as the final stage.
rf_pipeline = Pipeline(stages=[assembler, scaler, rf])

# 2 x 2 = 4 candidate settings.
param_grid = (ParamGridBuilder()
              .addGrid(rf.numTrees, [50, 100])  # Tune number of trees
              .addGrid(rf.maxDepth, [5, 10])    # Tune maximum depth of trees
              .build())

# Model selection metric: RMSE on the held-out validation part of the split.
evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# 80/20 train/validation split; seeded so the split is reproducible across runs.
train_val_split = TrainValidationSplit(estimator=rf_pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=evaluator,
                                       trainRatio=0.8,
                                       seed=42)

rf_model = train_val_split.fit(train_data)

# Get the best model
best_rf_model = rf_model.bestModel

rf_predictions = best_rf_model.transform(test_data)

# Evaluate the model on the test set
rf_rmse = evaluator.evaluate(rf_predictions)  # RMSE
rf_r2 = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2").evaluate(rf_predictions)  # R2
rf_mae = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae").evaluate(rf_predictions)  # MAE
rf_mse = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse").evaluate(rf_predictions)  # MSE

# Print metrics
print(f"Random Forest Regressor - RMSE: {rf_rmse}")
print(f"Random Forest Regressor - R2: {rf_r2}")
print(f"Random Forest Regressor - MAE: {rf_mae}")
print(f"Random Forest Regressor - MSE: {rf_mse}")



24/12/21 19:52:27 WARN DAGScheduler: Broadcasting large task binary with size 1180.8 KiB
24/12/21 19:53:19 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
24/12/21 19:54:35 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/12/21 19:55:44 WARN DAGScheduler: Broadcasting large task binary with size 1231.1 KiB
24/12/21 19:55:47 WARN DAGScheduler: Broadcasting large task binary with size 8.1 MiB
24/12/21 19:57:08 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
24/12/21 20:05:59 WARN DAGScheduler: Broadcasting large task binary with size 1170.3 KiB
24/12/21 20:07:33 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
24/12/21 20:09:36 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/12/21 20:11:52 WARN DAGScheduler: Broadcasting large task binary with size 1237.2 KiB
24/12/21 20:11:55 WARN DAGScheduler: Broadcasting large task binary with size 8.1 MiB
24/12/21 20:14:27 WARN DAGScheduler: Broad

Random Forest Regressor - RMSE: 2.1155644320309652
Random Forest Regressor - R2: 0.7741799090215069
Random Forest Regressor - MAE: 1.3778026324541734
Random Forest Regressor - MSE: 4.475612866074501
CPU times: user 2.84 s, sys: 623 ms, total: 3.47 s
Wall time: 54min 7s


                                                                                

In [10]:
#rf_predictions.write.mode("overwrite").parquet("gs://bucket121024/pipeline3/5_RF.parquet")

- **`regParam`**: Regularization parameter (lambda) to prevent overfitting.
  - Values tested: `[0.1, 0.3]`
- **`elasticNetParam`**: Mixing parameter to balance L1 (Lasso) and L2 (Ridge) regularization.
  - Values tested: `[0.0, 0.5]`
  - `0.0`: Pure L2 (Ridge) regularization.
  - `0.5`: A mix of L1 and L2 regularization.

In [11]:
%%time

from pyspark.ml.regression import LinearRegression
# Linear regression on the scaled feature vector, predicting `demand`.
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="demand")

# Same preprocessing stages as the other models, with the regression as the final stage.
lr_pipeline = Pipeline(stages=[assembler, scaler, lr])

# 2 x 2 = 4 candidate settings (regularization strength x L1/L2 mix).
param_grid = (ParamGridBuilder()
              .addGrid(lr.regParam, [0.1, 0.3])
              .addGrid(lr.elasticNetParam, [0.0, 0.5])
              .build())

# Model selection metric: RMSE on the held-out validation part of the split.
rmse_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# 80/20 train/validation split. The seed is pinned so the split (and hence the
# chosen hyperparameters and the metrics printed below) is reproducible; the
# default seed is not stable across Python sessions.
train_val_split = TrainValidationSplit(estimator=lr_pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=rmse_evaluator,
                                       trainRatio=0.8,
                                       seed=42)

lr_model = train_val_split.fit(train_data)

# Best pipeline according to validation RMSE, scored on the test set.
best_lr_model = lr_model.bestModel

lr_predictions = best_lr_model.transform(test_data)

lr_rmse = rmse_evaluator.evaluate(lr_predictions)  # RMSE
lr_r2 = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2").evaluate(lr_predictions)  # R²
lr_mae = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae").evaluate(lr_predictions)  # MAE
lr_mse = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse").evaluate(lr_predictions)  # MSE

# Print all evaluation metrics
print(f"Linear Regression - Best RMSE: {lr_rmse}")
print(f"Linear Regression - Best R2: {lr_r2}")
print(f"Linear Regression - Best MAE: {lr_mae}")
print(f"Linear Regression - Best MSE: {lr_mse}")




Linear Regression - Best RMSE: 2.440002208602954
Linear Regression - Best R2: 0.6996065192039672
Linear Regression - Best MAE: 1.6498225305436467
Linear Regression - Best MSE: 5.9536107779872935
CPU times: user 525 ms, sys: 161 ms, total: 685 ms
Wall time: 3min 31s


                                                                                

In [12]:
#lr_predictions.write.mode("overwrite").parquet("gs://bucket121024/pipeline3/5_LR.parquet")

### Hyperparameter Tuning for Gradient Boosted Trees (GBT)

To optimize the performance of the Gradient Boosted Trees (GBT) model, a minimal hyperparameter tuning process was implemented using the following key parameters:

1. **`maxDepth`**:
   - Controls the maximum depth of the trees.
   - A deeper tree can model more complex relationships but risks overfitting.
   - Values tested: `[3, 5]`.

2. **`maxIter`**:
   - Determines the number of boosting iterations.
   - More iterations allow the model to refine predictions but increase computation time.
   - Values tested: `[10, 20]`.

3. **`stepSize`**:
   - Represents the learning rate for gradient boosting.
   - Smaller step sizes make the model converge more slowly but can improve generalization.
   - Values tested: `[0.05, 0.1]`.

The chosen hyperparameter ranges strike a balance between computational efficiency and model performance, ensuring scalability for large datasets.


In [7]:
%%time

from pyspark.ml.regression import GBTRegressor

# Initialize GBT Regressor on the scaled feature vector, predicting `demand`.
# The seed is pinned so the fitted model is the same on every run; the default
# seed is not stable across Python sessions.
gbt = GBTRegressor(featuresCol="scaledFeatures", labelCol="demand", seed=42)

# Same preprocessing stages as the other models, with GBT as the final stage.
gbt_pipeline = Pipeline(stages=[assembler, scaler, gbt])

# hyperparameter grid for tuning: 2 x 2 x 2 = 8 candidate settings
param_grid = (ParamGridBuilder()
              .addGrid(gbt.maxDepth, [3, 5])
              .addGrid(gbt.maxIter, [10, 20])
              .addGrid(gbt.stepSize, [0.05, 0.1])
              .build())

# Model selection metric: RMSE on the held-out validation part of the split.
evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# 80/20 train/validation split; seeded so the split is reproducible across runs.
train_val_split = TrainValidationSplit(estimator=gbt_pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=evaluator,
                                       trainRatio=0.8,
                                       seed=42)
gbt_model = train_val_split.fit(train_data)

# Best pipeline according to validation RMSE, scored on the test set.
best_gbt_model = gbt_model.bestModel

gbt_predictions = best_gbt_model.transform(test_data)

gbt_rmse = evaluator.evaluate(gbt_predictions)  # RMSE
gbt_r2 = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2").evaluate(gbt_predictions)  # R²
gbt_mae = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae").evaluate(gbt_predictions)  # MAE
gbt_mse = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse").evaluate(gbt_predictions)  # MSE

# Print evaluation metrics
print(f"GBT Regressor - Best RMSE: {gbt_rmse}")
print(f"GBT Regressor - Best R2: {gbt_r2}")
print(f"GBT Regressor - Best MAE: {gbt_mae}")
print(f"GBT Regressor - Best MSE: {gbt_mse}")


24/12/21 21:27:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

GBT Regressor - Best RMSE: 2.0771982051303164
GBT Regressor - Best R2: 0.7819135375634417
GBT Regressor - Best MAE: 1.3487031977958273
GBT Regressor - Best MSE: 4.314752383396607
CPU times: user 3.13 s, sys: 669 ms, total: 3.8 s
Wall time: 22min 29s


                                                                                