In [0]:
# Read the raw yellow-taxi trips table from the catalog.
df = (
    spark.read
    .table("nyc_taxi.idk.yellow_trips_csv_v")
)

# Render a preview of the rows, then print the column names and types.
display(df)
df.printSchema()

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import radians, sin, cos, atan2, sqrt
# Earth radius in miles; scales the haversine central angle into a distance.
R = 3959

# Stage 1: keep trips with a fare in (0, 500) and a metered distance in
# (0, 100), derive the time-based columns, then keep durations in (0, 300) min.
df_clean = (
    df
    .filter((F.col("fare_amount") > 0) & (F.col("fare_amount") < 500))
    .filter((F.col("trip_distance") > 0) & (F.col("trip_distance") < 100))
    .withColumn("pickup_hour", F.hour(F.col("tpep_pickup_datetime")))
    .withColumn("pickup_dow", F.dayofweek(F.col("tpep_pickup_datetime")))
    .withColumn("pickup_ts", F.unix_timestamp(F.col("tpep_pickup_datetime")))
    .withColumn("dropoff_ts", F.unix_timestamp(F.col("tpep_dropoff_datetime")))
    .withColumn("trip_duration_min", (F.col("dropoff_ts") - F.col("pickup_ts")) / 60.0)
    .filter((F.col("trip_duration_min") > 0) & (F.col("trip_duration_min") < 300))
)

# Stage 2: haversine distance between pickup and dropoff. The intermediate
# columns exist only while the formula is evaluated and are dropped afterwards.
df_clean = (
    df_clean
    # Coordinates converted to radians.
    .withColumn("pick_lat", F.radians(F.col("pickup_latitude")))
    .withColumn("pick_long", F.radians(F.col("pickup_longitude")))
    .withColumn("drop_lat", F.radians(F.col("dropoff_latitude")))
    .withColumn("drop_long", F.radians(F.col("dropoff_longitude")))
    .withColumn("lat_diff", F.col("pick_lat") - F.col("drop_lat"))
    .withColumn("long_diff", F.col("pick_long") - F.col("drop_long"))
    # a = sin^2(dlat / 2) + cos(lat1) * cos(lat2) * sin^2(dlong / 2)
    .withColumn(
        "a",
        F.sin(F.col("lat_diff") / 2) ** 2
        + F.cos(F.col("pick_lat")) * F.cos(F.col("drop_lat")) * F.sin(F.col("long_diff") / 2) ** 2,
    )
    # c = 2 * atan2(sqrt(a), sqrt(1 - a)) is the central angle in radians.
    .withColumn("c", 2 * F.atan2(F.sqrt(F.col("a")), F.sqrt(1 - F.col("a"))))
    .withColumn("straight_line_distance", F.col("c") * R)
    .drop("pick_lat", "pick_long", "drop_lat", "drop_long", "lat_diff", "long_diff", "a", "c")
    # Discard trips whose straight-line distance is 125 miles or more.
    .filter(F.col("straight_line_distance") < 125)
    # Binary classification label: 1 when the fare is above 20, otherwise 0.
    .withColumn("high_fare", F.when(F.col("fare_amount") > 20, 1).otherwise(0))
)

numericFeatures = [
    "passenger_count",
    "trip_distance",
    "trip_duration_min",
    "pickup_hour",
    "pickup_dow",
    "straight_line_distance",
]

# Rows with a null in any of these columns (features + regression label) are removed.
colsToKeepNotNull = [*numericFeatures, "fare_amount"]
df_clean = df_clean.na.drop(subset=colsToKeepNotNull)

# 70/30 train/test split with a fixed seed.
train_df, test_df = df_clean.randomSplit([0.7, 0.3], seed=42)

# Preview the cleaned data and show how many rows fall in each high_fare class.
display(df_clean)
df_clean.groupBy("high_fare").count().show()


In [0]:
# Task 1, pipeline 1: decision-tree classifier for the high_fare label

from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import os

# NOTE(review): presumably the location Spark ML uses for temporary artifacts on
# this runtime - confirm against the platform documentation.
os.environ["SPARKML_TEMP_DFS_PATH"] = "/Volumes/ml_storage/ml_schema/pipeline1"

# Feature columns fed to the classifier.
cols = ["pickup_hour", "pickup_dow", "trip_distance", "straight_line_distance","trip_duration_min"]

# Pack the feature columns into a single vector column.
assembler = VectorAssembler(
    inputCols = cols,
    outputCol = "features"
)

# Scale the assembled vector (StandardScaler defaults).
scaler = StandardScaler(
    inputCol = "features",
    outputCol = "scaled_features"
)

decision_tree = DecisionTreeClassifier(
    labelCol = "high_fare",
    featuresCol = "scaled_features"
)

pipeline = Pipeline(stages=[assembler, scaler, decision_tree])

# 3 depths x 2 minimum leaf sizes = 6 candidate models.
param_grid = (ParamGridBuilder()
    .addGrid(decision_tree.maxDepth, [2, 5, 10])
    .addGrid(decision_tree.minInstancesPerNode, [1, 2])
    .build())

# Model selection metric for cross-validation: F1.
evaluator = MulticlassClassificationEvaluator(
    labelCol="high_fare",
    predictionCol="prediction",
    metricName="f1"
)

# seed fixes the fold assignment so repeated runs select from the same folds,
# matching the seed used for the train/test split.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    seed=42
)

dt_model = cv.fit(train_df)
dt_predictions = dt_model.transform(test_df)

# Each test metric gets its own evaluator. Calling setMetricName on `evaluator`
# would mutate the object that `cv` and `dt_model` still reference, leaving the
# cross-validator configured for "accuracy" instead of F1 after this cell runs.
dt_test_metrics = {
    metric_name: MulticlassClassificationEvaluator(
        labelCol="high_fare",
        predictionCol="prediction",
        metricName=metric_name
    ).evaluate(dt_predictions)
    for metric_name in ("f1", "weightedPrecision", "weightedRecall", "accuracy")
}

f1 = dt_test_metrics["f1"]
precision_val = dt_test_metrics["weightedPrecision"]
# Class-frequency-weighted recall is numerically equal to accuracy.
recall_val = dt_test_metrics["weightedRecall"]
accuracy_val = dt_test_metrics["accuracy"]

print("Decision Tree F1 Score:", f1)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("Accuracy:", accuracy_val)

display(dt_predictions.select("fare_amount", "high_fare", "prediction", "pickup_hour", "pickup_dow"))


# Save the best pipeline found by cross-validation; adjust the path to your own storage location.
dt_model.bestModel.write().overwrite().save("/Volumes/ml_storage/ml_schema/ml_volume/models/dt_classifier_pipeline/")

print("Model saved successfully!")


In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import os

# NOTE(review): presumably the location Spark ML uses for temporary artifacts on
# this runtime - confirm against the platform documentation.
os.environ["SPARKML_TEMP_DFS_PATH"] = "/Volumes/ml_storage/ml_schema/pipeline2"

# Feature columns fed to the classifier.
cols = ["pickup_hour", "pickup_dow", "trip_distance", "straight_line_distance","trip_duration_min"]

# Pack the feature columns into a single vector column.
assembler_lr = VectorAssembler(
    inputCols = cols,
    outputCol = "features"
)

# Scale the assembled vector (StandardScaler defaults).
scaler_lr = StandardScaler(
    inputCol = "features",
    outputCol = "scaled_features"
)

log_reg = LogisticRegression(
    labelCol = "high_fare",
    featuresCol = "scaled_features"
)

lr_pipeline = Pipeline(stages = [assembler_lr, scaler_lr, log_reg])

# 2 regularisation strengths x 2 iteration limits = 4 candidate models.
lr_param_grid = (ParamGridBuilder()
                 .addGrid(log_reg.regParam, [0.01, 0.1])
                 .addGrid(log_reg.maxIter, [20, 100])
                 .build())

# Model selection metric for cross-validation: F1.
lr_evaluator = MulticlassClassificationEvaluator(
    labelCol = "high_fare",
    predictionCol = "prediction",
    metricName = "f1"
)

# seed fixes the fold assignment so repeated runs select from the same folds,
# matching the seed used for the train/test split.
lr_cv = CrossValidator(
    estimator = lr_pipeline,
    estimatorParamMaps = lr_param_grid,
    evaluator = lr_evaluator,
    numFolds = 3,
    seed = 42
)

lr_model = lr_cv.fit(train_df)
lr_pred = lr_model.transform(test_df)

# Each test metric gets its own evaluator. Calling setMetricName on
# `lr_evaluator` would mutate the object that `lr_cv` and `lr_model` still
# reference, leaving the cross-validator configured for "accuracy" instead of F1.
lr_test_metrics = {
    metric_name: MulticlassClassificationEvaluator(
        labelCol = "high_fare",
        predictionCol = "prediction",
        metricName = metric_name
    ).evaluate(lr_pred)
    for metric_name in ("f1", "weightedPrecision", "weightedRecall", "accuracy")
}

f1_lr = lr_test_metrics["f1"]
precision_lr = lr_test_metrics["weightedPrecision"]
# Class-frequency-weighted recall is numerically equal to accuracy.
recall_lr = lr_test_metrics["weightedRecall"]
accuracy_lr = lr_test_metrics["accuracy"]

print("Logistic Regression F1 Score:", f1_lr)
print("Precision:", precision_lr)
print("Recall:", recall_lr)
print("Accuracy:", accuracy_lr)

display(lr_pred.select("fare_amount", "high_fare", "prediction", "pickup_hour", "pickup_dow"))

# Save the best pipeline found by cross-validation.
lr_model.bestModel.write().overwrite().save("/Volumes/ml_storage/ml_schema/ml_volume/models/lr_classifier_pipeline/")
print("Model saved successfully!")

In [0]:
import matplotlib.pyplot as plt
import numpy as np
# Metrics recorded from an earlier training run. They act only as a fallback so
# the chart can still be drawn when the training cells have not been executed in
# this session. Metrics that already exist in the session are left untouched, so
# the chart reflects the current run instead of being overwritten by stale values.
recorded_metrics = {
    "accuracy_lr": 0.8864278378681718,
    "precision_lr": 0.7857565145535814,
    "recall_lr": 0.8864278378681718,
    "f1_lr": 0.833061794027467,
    "accuracy_val": 0.9903350688617122,
    "precision_val": 0.9903077071022511,
    "recall_val": 0.9903350688617122,
    "f1": 0.9903197664579257,
}
for _metric_name, _recorded_value in recorded_metrics.items():
    # setdefault only fills in names that are not defined yet.
    globals().setdefault(_metric_name, _recorded_value)

metrics=["Accuracy", "Precision", "Recall", "F1 Score"]
models = ["Decision Tree", "Logistic Regression"]
lr_scores=[accuracy_lr, precision_lr, recall_lr, f1_lr]
dt_scores=[accuracy_val, precision_val, recall_val, f1]

# One group of two bars per metric: decision tree on the left, logistic regression on the right.
position=np.arange(len(metrics))
plt.figure(figsize=(10, 5))
plt.bar(position-.2, dt_scores, 0.35, label=models[0])
plt.bar(position +.2, lr_scores, 0.35, label=models[1])

plt.ylabel("Scores")
plt.title("Model Comparison: Decision Tree vs Logistic Regression")
plt.xticks(position, metrics)
plt.legend()
plt.show()

In [0]:

# Task 2, pipeline 1: linear regression on fare_amount
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
# Feature columns for the fare regression models.
# Note: this rebinds `numericFeatures`, `assembler` and `scaler`, which earlier
# cells bound to different values, so cell execution order matters.
numericFeatures = [
    "trip_distance",
    "straight_line_distance",
    "trip_duration_min",
    "pickup_hour",
    "pickup_dow",
]

# Pack the feature columns into one (still unscaled) vector column.
assembler = VectorAssembler(inputCols=numericFeatures, outputCol="features_unscaled")

# Scale to unit standard deviation without centering (withMean=False).
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
    withStd=True,
    withMean=False,
)

# Linear regression predicting fare_amount from the scaled features.
lr = LinearRegression(
    labelCol="fare_amount",
    featuresCol="features",
    predictionCol="prediction",
)

# Assemble -> scale -> linear regression.
lrPipeline = Pipeline(stages=[assembler, scaler, lr])

# Same 70/30 split and seed as the split taken right after cleaning.
train_df, test_df = df_clean.randomSplit([0.7, 0.3], seed=42)

# Cross-validation runs on at most 100k training rows because of memory limits.
# NOTE(review): limit() takes the first rows Spark returns rather than a random
# sample, so this subset may not be representative of train_df - confirm this is acceptable.
train_small = train_df.limit(100000)

In [0]:
# Sanity check: largest value of each continuous feature in the test split,
# to spot implausible outliers that survived cleaning.
test_df.select(
    *[
        F.max(feature_name)
        for feature_name in ("trip_duration_min", "straight_line_distance", "trip_distance")
    ]
).show()

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Hyperparameter grid: 4 regParam x 3 elasticNetParam x 2 maxIter = 24 candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.0, 0.0001, 0.001, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5])
    .addGrid(lr.maxIter, [50, 100])
    .build()
)

# RMSE on fare_amount is the model selection metric.
evaluatorRmse = RegressionEvaluator(
    labelCol="fare_amount",
    predictionCol="prediction",
    metricName="rmse"
)

# seed fixes the fold assignment so repeated runs pick the best model from the
# same folds, matching the seed used for the train/test split.
cvLr = CrossValidator(
    estimator=lrPipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluatorRmse,
    numFolds=2,     # kept small to limit the number of fits
    parallelism=1,  # candidates are fitted one at a time (1 is the minimum)
    collectSubModels=False,
    seed=42
)

In [0]:
import os

# NOTE(review): presumably the location Spark ML uses for temporary artifacts on
# this runtime - confirm against the platform documentation.
os.environ["SPARKML_TEMP_DFS_PATH"] = "/Volumes/ml_storage/ml_schema/ml_volume/sparkml_tmp"

# Run cross-validation for the linear regression pipeline on the reduced training set.
cvLrModel = cvLr.fit(train_small)

# Best fitted pipeline, and its final stage (the fitted linear regression model).
bestLrPipelineModel = cvLrModel.bestModel
bestLrModel = cvLrModel.bestModel.stages[-1]

In [0]:

# Persist the best linear regression pipeline, replacing any previous copy at this path.
bestLrPipelineModel.write().overwrite().save("/Volumes/ml_storage/ml_schema/ml_volume/models/linear_regression_pipeline/")

In [0]:
# R2 evaluator; RMSE reuses the evaluator that drove cross-validation.
evaluatorR2 = RegressionEvaluator(
    metricName="r2",
    labelCol="fare_amount",
    predictionCol="prediction",
)

# Score the held-out test split with the best linear regression pipeline.
testPredLr = bestLrPipelineModel.transform(test_df)
rmseTest = evaluatorRmse.evaluate(testPredLr)
r2Test = evaluatorR2.evaluate(testPredLr)

print("Linear Regression Evaluation Metrics:")
print("RMSE:", rmseTest)
print("R2:", r2Test)

# First ten actual vs. predicted fares.
testPredLr.select("fare_amount", "prediction").show(10)

# Hyperparameters of the model selected by cross-validation.
print("Best Linear Regression Hyperparameters")
print("regParam:", bestLrModel.getRegParam())
print("elasticNetParam:", bestLrModel.getElasticNetParam())
print("maxIter:", bestLrModel.getMaxIter())

In [0]:
# Actual vs. predicted fares for the linear regression pipeline.
display(testPredLr.select(F.col("fare_amount"), F.col("prediction")))

Databricks visualization. Run in Databricks to view.

In [0]:
# Task 2, pipeline 2: random forest regression on fare_amount

from pyspark.ml.regression import RandomForestRegressor

# Random forest regressor predicting fare_amount; seeded for reproducible trees.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="fare_amount",
    predictionCol="prediction",
    seed=42
)

# Reuses the `assembler` and `scaler` objects currently bound to those names.
# NOTE(review): the classifier cell binds the same names to stages with a
# different output column, so this cell expects the regression setup cell to
# have run most recently.
rfPipeline = Pipeline(
    stages=[assembler, scaler, rf]
)

# 2 forest sizes x 2 depths = 4 candidate models.
rfparamGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [20, 50])
    .addGrid(rf.maxDepth, [5, 8])
    .build()
)

# seed fixes the fold assignment; without it only the forest itself was seeded,
# so the selected hyperparameters could still change between runs.
rfCV = CrossValidator(
    estimator=rfPipeline,
    estimatorParamMaps=rfparamGrid,
    evaluator=evaluatorRmse, #same RMSE evaluator as the linear regression CV
    numFolds=3,     #lower if needed
    seed=42
)

In [0]:

# Run cross-validation for the random forest pipeline on the reduced training set.
rfCvModel = rfCV.fit(train_small)

# Best fitted pipeline, and its final stage (the fitted random forest model).
bestRfPipelineModel = rfCvModel.bestModel
bestRfModel = rfCvModel.bestModel.stages[-1]

In [0]:
# R2 evaluator; RMSE reuses the evaluator that drove cross-validation.
evaluatorR2 = RegressionEvaluator(
    metricName="r2",
    labelCol="fare_amount",
    predictionCol="prediction",
)

# Score the held-out test split with the best random forest pipeline.
# Note: rmseTest / r2Test are the same names the linear regression evaluation
# uses, so after this cell they hold the random forest's values.
testPredRf = bestRfPipelineModel.transform(test_df)
rmseTest = evaluatorRmse.evaluate(testPredRf)
r2Test = evaluatorR2.evaluate(testPredRf)

print("Random Forest Regressor Pipeline Evaluation Metrics:")
print("RMSE:", rmseTest)
print("R2:", r2Test)

# First ten actual vs. predicted fares.
testPredRf.select("fare_amount", "prediction").show(10)

In [0]:
# Actual vs. predicted fares for the random forest pipeline.
display(testPredRf.select(F.col("fare_amount"), F.col("prediction")))

Databricks visualization. Run in Databricks to view.

In [0]:
# Hyperparameters of the random forest selected by cross-validation.
print("Best Random Forest Hyperparameters")
# getNumTrees is a property on fitted PySpark tree-ensemble models, so it is
# read without parentheses (unlike getMaxDepth(), which is a method).
print("numTrees:", bestRfModel.getNumTrees)
print("maxDepth:", bestRfModel.getMaxDepth())

# Persist the best pipeline, replacing any previous copy; adjust the path to
# your own storage location.
(
    bestRfPipelineModel
    .write()
    .overwrite()
    .save("/Volumes/ml_storage/ml_schema/ml_volume/models/RfR_pipeline")
)

print("Model saved successfully!")

In [0]:
from pyspark.ml.pipeline import PipelineModel
# How to reload the four saved pipelines. Each path must match the location the
# corresponding pipeline was written to by the save calls earlier in the notebook.
logistic_reg = PipelineModel.load("/Volumes/ml_storage/ml_schema/ml_volume/models/lr_classifier_pipeline")
rfr_load = PipelineModel.load("/Volumes/ml_storage/ml_schema/ml_volume/models/RfR_pipeline")
dt_tree = PipelineModel.load("/Volumes/ml_storage/ml_schema/ml_volume/models/dt_classifier_pipeline")
linear_reg = PipelineModel.load("/Volumes/ml_storage/ml_schema/ml_volume/models/linear_regression_pipeline")

# Example: score the test split with the reloaded logistic regression pipeline
# and show the first ten labels next to their predictions.

logistic_reg.transform(test_df).select("high_fare","prediction").show(10)
