# Importing libraries and initializing spark session

### Creating spark session and reading data from previous pipeline

In [20]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
import os

# Build (or reuse) the Spark session for the modelling notebook.
# Settings are applied one by one so each key/value pair is easy to scan.
spark_builder = SparkSession.builder.appName("Models")
for conf_key, conf_value in (
    ("spark.executor.instances", "3"),
    ("spark.executor.memory", "8g"),
    ("spark.sql.shuffle.partitions", 20),
):
    spark_builder = spark_builder.config(conf_key, conf_value)
spark = spark_builder.getOrCreate()


# Parquet files written by the previous pipeline stage (train / test splits).
train = "gs://newbucketforabhishek/pipeline2_train_data.parquet_part-00000-cae65e05-711e-4164-b976-9bbecbe426e8-c000.snappy.parquet"
test = "gs://newbucketforabhishek/pipeline2_test_data.parquet_part-00000-cee56338-7d78-490c-93da-5eb185bd7b3e-c000.snappy.parquet"

# Reading citibike combined data

### Efficient Parallelism:

### Both the training and the test data are repartitioned into 20 partitions (the same value as `spark.sql.shuffle.partitions`).
### This spreads the workload evenly across the executors while keeping task-scheduling overhead low.

In [24]:

# Read each split from parquet and spread it over 20 partitions in one chained expression.
train_data = spark.read.parquet(train).repartition(20)
test_data = spark.read.parquet(test).repartition(20)

In [10]:
# Input columns fed to the VectorAssembler. The order is significant: it fixes
# the position of each feature inside the assembled vector.
feature_cols = [
    # calendar fields
    "day_of_week", "month", "hour", "year",
    # weather readings
    "temp", "humidity", "precip", "windspeed", "visibility",
    # binary flags
    "is_weekend", "is_lockdown",
    # indexed station name
    "startstationname_indexed",
    # interaction terms
    "humidity_is_weekend", "temp_is_weekend",
    # indexed hour bucket
    "hour_bucket_indexed",
    # demand history
    "rolling_avg_demand", "lag_demand_1",
]



* `VectorAssembler`
Combines multiple input features into a single vector column (features), required for ML models.
* `StandardScaler`
Scales each feature in the vector to unit standard deviation. With Spark's default settings (`withMean=False`, `withStd=True`), as used here, the mean is **not** removed.

In [11]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

# Stage 1: pack the selected columns into a single vector column named "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_cols,
)

# Stage 2: rescale the assembled vector into "scaledFeatures".
# withMean / withStd are written out with their default values, so behaviour is
# unchanged: features are divided by their standard deviation but not centred.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)


In [12]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import TrainValidationSplit

# Decision tree regressor predicting "demand" from the scaled feature vector.
dt = DecisionTreeRegressor(labelCol="demand", featuresCol="scaledFeatures")

# End-to-end pipeline: assemble -> scale -> fit the tree.
pipeline = Pipeline(
    stages=[
        assembler,
        scaler,
        dt,
    ]
)



### Hyperparameter Tuning with `TrainValidationSplit`

To optimize the performance of the Decision Tree Regressor, we implemented hyperparameter tuning using `TrainValidationSplit`.

#### **1. Define the Parameter Grid**
We specify the hyperparameters to tune:
- **`maxDepth`:** Maximum depth of the tree (values: 3, 5).
- **`minInstancesPerNode`:** Minimum number of samples per tree node (values: 1, 2).
- **`maxBins`:** Number of bins for continuous feature discretization (value: 16).



In [13]:
# Mark the training DataFrame for caching; the tuning cell that follows fits
# several candidate models on it. As the last expression of the cell, the
# returned DataFrame (and therefore its schema) is displayed as the cell output.
train_data.cache()

24/12/21 19:58:47 WARN CacheManager: Asked to cache already cached data.


DataFrame[date: date, day_of_week: int, month: int, hour: int, year: int, startstationname: string, demand: bigint, datetime: timestamp, temp: double, humidity: double, precip: double, windspeed: double, visibility: double, conditions: string, is_weekend: int, is_lockdown: int, conditions_indexed: double, startstationname_indexed: double, lag_demand_1: bigint, rolling_avg_demand: double, hour_bucket: string, hour_bucket_indexed: double, temp_is_weekend: double, humidity_is_weekend: double]

In [15]:
%%time

# Hyperparameter grid for the decision tree:
# 2 (maxDepth) x 2 (minInstancesPerNode) x 1 (maxBins) = 4 candidate models.
param_grid = (ParamGridBuilder()
              .addGrid(dt.maxDepth, [3, 5])
              .addGrid(dt.minInstancesPerNode, [1, 2])
              .addGrid(dt.maxBins, [16])
              .build())

# RMSE on the validation split is the model-selection criterion.
evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# 80% of train_data is used for fitting and 20% for validation.
# The explicit seed makes that random split (and therefore the selected model)
# repeatable from run to run.
train_val_split = TrainValidationSplit(estimator=pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=evaluator,
                                       trainRatio=0.8,
                                       seed=42)

# Fit every candidate and keep the one with the best validation RMSE.
tv_model = train_val_split.fit(train_data)

# Score the held-out test data with the winning pipeline.
best_model = tv_model.bestModel
predictions = best_model.transform(test_data)

# Evaluate RMSE
rmse_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)

# Evaluate R2
r2_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2")
r2 = r2_evaluator.evaluate(predictions)

# Evaluate Mean Absolute Error (MAE)
mae_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(predictions)

# Evaluate Mean Squared Error (MSE)
mse_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse")
mse = mse_evaluator.evaluate(predictions)

# Print all metrics
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-Squared (R2): {r2}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")




Root Mean Squared Error (RMSE): 2.3258849854856116
R-Squared (R2): 0.726728053874677
Mean Absolute Error (MAE): 1.5019751713865255
Mean Squared Error (MSE): 5.409740965707404
CPU times: user 711 ms, sys: 158 ms, total: 869 ms
Wall time: 9min 8s


                                                                                

In [21]:

# Reload both splits, repartition them, and keep roughly 80% of the rows of each.
# sample() is Bernoulli sampling, so the fraction is approximate; the fixed seed
# makes the same subsample come out on every run, keeping metrics comparable.
train_data = spark.read.parquet(train)
test_data = spark.read.parquet(test)
train_data = train_data.repartition(20)
test_data = test_data.repartition(20)
train_data = train_data.sample(withReplacement=False, fraction=0.8, seed=42)
test_data = test_data.sample(withReplacement=False, fraction=0.8, seed=42)


In [22]:
%%time

# Hyperparameter grid for the decision tree:
# 2 (maxDepth) x 2 (minInstancesPerNode) x 1 (maxBins) = 4 candidate models.
param_grid = (ParamGridBuilder()
              .addGrid(dt.maxDepth, [3, 5])
              .addGrid(dt.minInstancesPerNode, [1, 2])
              .addGrid(dt.maxBins, [16])
              .build())

# RMSE on the validation split is the model-selection criterion.
evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# 80/20 train/validation split; the explicit seed keeps the split repeatable
# so this run can be compared fairly with other runs.
train_val_split = TrainValidationSplit(estimator=pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=evaluator,
                                       trainRatio=0.8,
                                       seed=42)

# Fit every candidate on the current train_data.
tv_model = train_val_split.fit(train_data)

# Score the current test_data with the winning pipeline.
best_model = tv_model.bestModel
predictions = best_model.transform(test_data)

# Evaluate RMSE
rmse_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)

# Evaluate R2
r2_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2")
r2 = r2_evaluator.evaluate(predictions)

# Evaluate Mean Absolute Error (MAE)
mae_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(predictions)

# Evaluate Mean Squared Error (MSE)
mse_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse")
mse = mse_evaluator.evaluate(predictions)

# Print all metrics
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-Squared (R2): {r2}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")



Root Mean Squared Error (RMSE): 2.3176595142714795
R-Squared (R2): 0.7284985001214339
Mean Absolute Error (MAE): 1.4939340854071734
Mean Squared Error (MSE): 5.37154562409311
CPU times: user 658 ms, sys: 149 ms, total: 808 ms
Wall time: 8min 1s


                                                                                

In [9]:
#predictions.write.mode("overwrite").parquet("gs://bucket121024/pipeline3/5_DT.parquet")

                                                                                

# Random Forest Regressor
## Hyperparameter Tuning with `ParamGridBuilder`

The `ParamGridBuilder` is used to define a grid of hyperparameters for tuning the Random Forest Regressor. The following parameters are tuned:

### Explanation of Parameters:
1. **`numTrees`**:
   - The number of decision trees in the Random Forest.
   - Tested values: `50`, `100`.

2. **`maxDepth`**:
   - The maximum depth of each tree, which controls the complexity of the model.
   - Tested values: `5`, `10`.

### Total Combinations:
The total combinations of hyperparameters are:
$$
\text{Total Combinations} = |\text{numTrees}| \times |\text{maxDepth}| = 2 \times 2 = 4
$$

Each combination is evaluated during training to select the best-performing model based on validation metrics.


In [23]:
%%time

from pyspark.ml.regression import RandomForestRegressor

# Initialize Random Forest Regressor.
# The seed fixes the forest's random sampling so repeated runs build the same trees.
rf = RandomForestRegressor(featuresCol="scaledFeatures", labelCol="demand", seed=42)

# Create a pipeline: assemble -> scale -> random forest
rf_pipeline = Pipeline(stages=[assembler, scaler, rf])

# Define hyperparameter grid: 2 (numTrees) x 2 (maxDepth) = 4 candidate models
param_grid = (ParamGridBuilder()
              .addGrid(rf.numTrees, [50, 100])  # Tune number of trees
              .addGrid(rf.maxDepth, [5, 10])    # Tune maximum depth of trees
              .build())

# Define evaluator: validation RMSE selects the best candidate
evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# TrainValidationSplit for tuning (80% fit / 20% validation).
# The explicit seed keeps the random split repeatable across runs.
train_val_split = TrainValidationSplit(estimator=rf_pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=evaluator,
                                       trainRatio=0.8,
                                       seed=42)

# Train the model with hyperparameter tuning
rf_model = train_val_split.fit(train_data)

# Get the best model
best_rf_model = rf_model.bestModel

# Make predictions on the test data
rf_predictions = best_rf_model.transform(test_data)

# Evaluate the model on the test predictions
rf_rmse = evaluator.evaluate(rf_predictions)  # RMSE
rf_r2 = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2").evaluate(rf_predictions)  # R2
rf_mae = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae").evaluate(rf_predictions)  # MAE
rf_mse = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse").evaluate(rf_predictions)  # MSE

# Print metrics
print(f"Random Forest Regressor - RMSE: {rf_rmse}")
print(f"Random Forest Regressor - R2: {rf_r2}")
print(f"Random Forest Regressor - MAE: {rf_mae}")
print(f"Random Forest Regressor - MSE: {rf_mse}")



24/12/21 21:03:23 WARN DAGScheduler: Broadcasting large task binary with size 1239.5 KiB
24/12/21 21:05:20 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
24/12/21 21:07:28 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/12/21 21:10:28 WARN DAGScheduler: Broadcasting large task binary with size 1230.9 KiB
24/12/21 21:10:30 WARN DAGScheduler: Broadcasting large task binary with size 8.0 MiB
24/12/21 21:13:50 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
24/12/21 21:33:43 WARN DAGScheduler: Broadcasting large task binary with size 1228.3 KiB
24/12/21 21:37:28 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
24/12/21 21:41:48 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/12/21 21:47:30 WARN DAGScheduler: Broadcasting large task binary with size 1237.3 KiB
24/12/21 21:47:32 WARN DAGScheduler: Broadcasting large task binary with size 8.0 MiB
24/12/21 21:54:27 WARN DAGScheduler: Broad

Random Forest Regressor - RMSE: 2.1096479372849815
Random Forest Regressor - R2: 0.7750464066061753
Random Forest Regressor - MAE: 1.3746830139552535
Random Forest Regressor - MSE: 4.450614419290777
CPU times: user 2.17 s, sys: 523 ms, total: 2.69 s
Wall time: 2h 3min 55s


                                                                                

In [12]:
# Persist the Random Forest test predictions as parquet, replacing any earlier output.
(
    rf_predictions.write
    .mode("overwrite")
    .parquet("gs://bucket121024/pipeline3/5_RF.parquet")
)

                                                                                

The Linear Regression model is tuned over the following hyperparameters:

- **`regParam`**: Regularization parameter (lambda) to prevent overfitting.
  - Values tested: `[0.1, 0.3]`
- **`elasticNetParam`**: Mixing parameter to balance L1 (Lasso) and L2 (Ridge) regularization.
  - Values tested: `[0.0, 0.5]`
  - `0.0`: Pure L2 (Ridge) regularization.
  - `0.5`: A mix of L1 and L2 regularization.

In [25]:
%%time

from pyspark.ml.regression import LinearRegression
# Linear regression predicting "demand" from the scaled feature vector.
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="demand")

# Pipeline: assemble -> scale -> linear regression
lr_pipeline = Pipeline(stages=[assembler, scaler, lr])

# Grid: 2 (regParam) x 2 (elasticNetParam) = 4 candidate models
param_grid = (ParamGridBuilder()
              .addGrid(lr.regParam, [0.1, 0.3])
              .addGrid(lr.elasticNetParam, [0.0, 0.5])
              .build())

# Validation RMSE selects the best candidate; the same evaluator is reused for test RMSE.
rmse_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# 80/20 train/validation split; the explicit seed keeps the random split
# repeatable across runs.
train_val_split = TrainValidationSplit(estimator=lr_pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=rmse_evaluator,
                                       trainRatio=0.8,
                                       seed=42)

# Fit all candidates and keep the best pipeline.
lr_model = train_val_split.fit(train_data)

best_lr_model = lr_model.bestModel

# Score the test data with the best pipeline.
lr_predictions = best_lr_model.transform(test_data)

lr_rmse = rmse_evaluator.evaluate(lr_predictions)  # RMSE
lr_r2 = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2").evaluate(lr_predictions)  # R²
lr_mae = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae").evaluate(lr_predictions)  # MAE
lr_mse = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse").evaluate(lr_predictions)  # MSE

# Print all evaluation metrics (computed on the test predictions of the best model)
print(f"Linear Regression - Best RMSE: {lr_rmse}")
print(f"Linear Regression - Best R2: {lr_r2}")
print(f"Linear Regression - Best MAE: {lr_mae}")
print(f"Linear Regression - Best MSE: {lr_mse}")




Linear Regression - Best RMSE: 2.4394544559764983
Linear Regression - Best R2: 0.6993895954480251
Linear Regression - Best MAE: 1.6496203030622447
Linear Regression - Best MSE: 5.950938042783593
CPU times: user 540 ms, sys: 111 ms, total: 652 ms
Wall time: 7min 9s


                                                                                

In [26]:

# Reload both splits, repartition them, and keep roughly 80% of the rows of each.
# sample() is Bernoulli sampling, so the fraction is approximate; the fixed seed
# makes the same subsample come out on every run, keeping metrics comparable.
train_data = spark.read.parquet(train)
test_data = spark.read.parquet(test)
train_data = train_data.repartition(20)
test_data = test_data.repartition(20)
train_data = train_data.sample(withReplacement=False, fraction=0.8, seed=42)
test_data = test_data.sample(withReplacement=False, fraction=0.8, seed=42)


In [27]:
%%time

from pyspark.ml.regression import LinearRegression
# Linear regression predicting "demand" from the scaled feature vector.
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="demand")

# Pipeline: assemble -> scale -> linear regression
lr_pipeline = Pipeline(stages=[assembler, scaler, lr])

# Grid: 2 (regParam) x 2 (elasticNetParam) = 4 candidate models
param_grid = (ParamGridBuilder()
              .addGrid(lr.regParam, [0.1, 0.3])
              .addGrid(lr.elasticNetParam, [0.0, 0.5])
              .build())

# Validation RMSE selects the best candidate; the same evaluator is reused for test RMSE.
rmse_evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# 80/20 train/validation split; the explicit seed keeps the split repeatable
# so this run can be compared fairly with other runs.
train_val_split = TrainValidationSplit(estimator=lr_pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=rmse_evaluator,
                                       trainRatio=0.8,
                                       seed=42)

# Fit all candidates on the current train_data and keep the best pipeline.
lr_model = train_val_split.fit(train_data)

best_lr_model = lr_model.bestModel

# Score the current test_data with the best pipeline.
lr_predictions = best_lr_model.transform(test_data)

lr_rmse = rmse_evaluator.evaluate(lr_predictions)  # RMSE
lr_r2 = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2").evaluate(lr_predictions)  # R²
lr_mae = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae").evaluate(lr_predictions)  # MAE
lr_mse = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse").evaluate(lr_predictions)  # MSE

# Print all evaluation metrics (computed on the test predictions of the best model)
print(f"Linear Regression - Best RMSE: {lr_rmse}")
print(f"Linear Regression - Best R2: {lr_r2}")
print(f"Linear Regression - Best MAE: {lr_mae}")
print(f"Linear Regression - Best MSE: {lr_mse}")




Linear Regression - Best RMSE: 2.439849197930256
Linear Regression - Best R2: 0.6993260639022114
Linear Regression - Best MAE: 1.64979592541034
Linear Regression - Best MSE: 5.952864108640914
CPU times: user 492 ms, sys: 145 ms, total: 637 ms
Wall time: 6min 3s


                                                                                

In [14]:
# Persist the Linear Regression test predictions as parquet, replacing any earlier output.
(
    lr_predictions.write
    .mode("overwrite")
    .parquet("gs://bucket121024/pipeline3/5_LR.parquet")
)

                                                                                

### Hyperparameter Tuning for Gradient Boosted Trees (GBT)

To optimize the performance of the Gradient Boosted Trees (GBT) model, a minimal hyperparameter tuning process was implemented using the following key parameters:

1. **`maxDepth`**:
   - Controls the maximum depth of the trees.
   - A deeper tree can model more complex relationships but risks overfitting.
   - Values tested: `[3, 5]`.

2. **`maxIter`**:
   - Determines the number of boosting iterations.
   - More iterations allow the model to refine predictions but increase computation time.
   - Values tested: `[10, 20]`.

3. **`stepSize`**:
   - Represents the learning rate for gradient boosting.
   - Smaller step sizes make the model converge more slowly but can improve generalization.
   - Values tested: `[0.05, 0.1]`.

The chosen hyperparameter ranges strike a balance between computational efficiency and model performance, ensuring scalability for large datasets.


In [28]:
# Reload the full (unsampled) splits and spread each over 20 partitions.
train_data = spark.read.parquet(train).repartition(20)
test_data = spark.read.parquet(test).repartition(20)

In [29]:
%%time

from pyspark.ml.regression import GBTRegressor

# Initialize GBT Regressor.
# The seed pins any randomness inside the boosting procedure for repeatable runs.
gbt = GBTRegressor(featuresCol="scaledFeatures", labelCol="demand", seed=42)

# Pipeline: assemble -> scale -> gradient-boosted trees
gbt_pipeline = Pipeline(stages=[assembler, scaler, gbt])

# hyperparameter grid for tuning: 2 (maxDepth) x 2 (maxIter) x 2 (stepSize) = 8 candidates
param_grid = (ParamGridBuilder()
              .addGrid(gbt.maxDepth, [3, 5])
              .addGrid(gbt.maxIter, [10, 20])
              .addGrid(gbt.stepSize, [0.05, 0.1])
              .build())

# Validation RMSE selects the best candidate.
evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# 80/20 train/validation split; the explicit seed keeps the random split
# repeatable across runs.
train_val_split = TrainValidationSplit(estimator=gbt_pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=evaluator,
                                       trainRatio=0.8,
                                       seed=42)
gbt_model = train_val_split.fit(train_data)
best_gbt_model = gbt_model.bestModel

# Score the test data with the best pipeline.
gbt_predictions = best_gbt_model.transform(test_data)

gbt_rmse = evaluator.evaluate(gbt_predictions)  # RMSE
gbt_r2 = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2").evaluate(gbt_predictions)  # R²
gbt_mae = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae").evaluate(gbt_predictions)  # MAE
gbt_mse = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse").evaluate(gbt_predictions)  # MSE

# Print evaluation metrics (computed on the test predictions of the best model)
print(f"GBT Regressor - Best RMSE: {gbt_rmse}")
print(f"GBT Regressor - Best R2: {gbt_r2}")
print(f"GBT Regressor - Best MAE: {gbt_mae}")
print(f"GBT Regressor - Best MSE: {gbt_mse}")


24/12/22 01:02:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_785_19 !
24/12/22 01:02:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_11_9 !
24/12/22 01:02:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_11_19 !
24/12/22 01:02:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_827_9 !
24/12/22 01:02:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_827_19 !
24/12/22 01:02:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_11_7 !
24/12/22 01:02:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_785_9 !
24/12/22 01:02:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_785_7 !
24/12/22 01:02:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_827_7 !
24/12/22 01:02:06 WARN YarnAllocator: Container from a bad node: container_1734808064864_0002_01_000005 on host: cluster-20f0-w-2.us-central1-c.c.wired-as

GBT Regressor - Best RMSE: 2.063421182984908
GBT Regressor - Best R2: 0.7849228124984564
GBT Regressor - Best MAE: 1.3436615856728804
GBT Regressor - Best MSE: 4.2577069783908374
CPU times: user 4.51 s, sys: 1.09 s, total: 5.61 s
Wall time: 52min 30s


                                                                                

In [30]:

# Reload both splits, repartition them, and keep roughly 80% of the rows of each.
# sample() is Bernoulli sampling, so the fraction is approximate; the fixed seed
# makes the same subsample come out on every run, keeping metrics comparable.
train_data = spark.read.parquet(train)
test_data = spark.read.parquet(test)
train_data = train_data.repartition(20)
test_data = test_data.repartition(20)
train_data = train_data.sample(withReplacement=False, fraction=0.8, seed=42)
test_data = test_data.sample(withReplacement=False, fraction=0.8, seed=42)


In [None]:
%%time

from pyspark.ml.regression import GBTRegressor

# Initialize GBT Regressor.
# The seed pins any randomness inside the boosting procedure for repeatable runs.
gbt = GBTRegressor(featuresCol="scaledFeatures", labelCol="demand", seed=42)

# Pipeline: assemble -> scale -> gradient-boosted trees
gbt_pipeline = Pipeline(stages=[assembler, scaler, gbt])

# hyperparameter grid for tuning: 2 (maxDepth) x 2 (maxIter) x 2 (stepSize) = 8 candidates
param_grid = (ParamGridBuilder()
              .addGrid(gbt.maxDepth, [3, 5])
              .addGrid(gbt.maxIter, [10, 20])
              .addGrid(gbt.stepSize, [0.05, 0.1])
              .build())

# Validation RMSE selects the best candidate.
evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# 80/20 train/validation split; the explicit seed keeps the split repeatable
# so this run can be compared fairly with other runs.
train_val_split = TrainValidationSplit(estimator=gbt_pipeline,
                                       estimatorParamMaps=param_grid,
                                       evaluator=evaluator,
                                       trainRatio=0.8,
                                       seed=42)
gbt_model = train_val_split.fit(train_data)
best_gbt_model = gbt_model.bestModel

# Score the current test_data with the best pipeline.
gbt_predictions = best_gbt_model.transform(test_data)

gbt_rmse = evaluator.evaluate(gbt_predictions)  # RMSE
gbt_r2 = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2").evaluate(gbt_predictions)  # R²
gbt_mae = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mae").evaluate(gbt_predictions)  # MAE
gbt_mse = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="mse").evaluate(gbt_predictions)  # MSE

# Print evaluation metrics (computed on the test predictions of the best model)
print(f"GBT Regressor - Best RMSE: {gbt_rmse}")
print(f"GBT Regressor - Best R2: {gbt_r2}")
print(f"GBT Regressor - Best MAE: {gbt_mae}")
print(f"GBT Regressor - Best MSE: {gbt_mse}")


