In [28]:
# TASK 2: fare prediction on the NYC taxi sample (Linear Regression vs. Random Forest)

import os
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# ======================
# CONFIG
# ======================
# Input CSV and output directory.
# The defaults are the original hard-coded locations; set the TASK2_INPUT_CSV /
# TASK2_OUTPUT_DIR environment variables to run the script on another machine
# without editing the source.
INPUT_CSV = os.environ.get(
    "TASK2_INPUT_CSV",
    r"C:\Users\basun\OneDrive\Desktop\Programming practice\bigdata_project\data\nyc_taxi\yellow_tripdata_sample.csv",
)
OUTPUT_DIR = os.environ.get(
    "TASK2_OUTPUT_DIR",
    r"C:\Users\basun\OneDrive\Desktop\Programming practice\bigdata_project\outputs",
)
# Create the output directory up front so the later CSV exports cannot fail on
# a missing folder; exist_ok makes re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ======================
# START SPARK
# ======================
# Reuse an already-running session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("Task2_FarePrediction")
    .getOrCreate()
)
print("Spark ready ✨", spark.version)

# ======================
# LOAD CSV
# ======================
# Read the CSV using its first line as column names and let Spark infer the
# column types from the data.
df = spark.read.csv(INPUT_CSV, header=True, inferSchema=True)

# Quick sanity check of what was loaded.
print("Raw row count:", df.count())
df.show(5)

# ======================
# ETL (cleaning + features)
# ======================
from pyspark.sql.functions import to_timestamp, col, unix_timestamp, hour, dayofmonth, dayofweek

# Make sure both trip endpoints are timestamp-typed before doing arithmetic.
df = (
    df
    .withColumn("tpep_pickup_datetime", to_timestamp(col("tpep_pickup_datetime")))
    .withColumn("tpep_dropoff_datetime", to_timestamp(col("tpep_dropoff_datetime")))
)

# Trip duration in minutes: difference of the two epoch-second values / 60.
df = df.withColumn(
    "duration_min",
    (unix_timestamp(col("tpep_dropoff_datetime")) - unix_timestamp(col("tpep_pickup_datetime"))) / 60.0
)

# Keep only plausible trips. The range comparisons also discard rows whose
# distance or duration is null (a comparison against NULL is never true).
# The label needs its own explicit check: no other condition touches
# fare_amount, and a null label makes the regressors' fit() fail.
df = df.filter(
    (col("trip_distance") > 0) &
    (col("trip_distance") < 100) &
    (col("duration_min") > 0) &
    (col("duration_min") < 500) &
    col("fare_amount").isNotNull()
)

# Calendar features derived from the pickup time.
# Note: Spark's dayofweek() is 1-based with 1 = Sunday ... 7 = Saturday.
df = (
    df
    .withColumn("hour", hour(col("tpep_pickup_datetime")))
    .withColumn("day", dayofmonth(col("tpep_pickup_datetime")))
    .withColumn("dow", dayofweek(col("tpep_pickup_datetime")))
)

print("Filtered row count:", df.count())
df.show(5)

# ======================
# FEATURES + LABEL
# ======================
# Columns packed into the single "features" vector; "fare_amount" is the label.
feature_cols = ["trip_distance", "duration_min", "hour", "day", "dow"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
assembled = assembler.transform(df)
df_ml = assembled.select("features", "fare_amount")

# ======================
# TRAIN/TEST SPLIT
# ======================
# 80/20 split with a fixed seed.
train, test = df_ml.randomSplit(weights=[0.8, 0.2], seed=42)

# ======================
# 1) Linear Regression
# ======================
# Linear regression with default hyperparameters: fit on the training split,
# then score the held-out split.
lr = LinearRegression(labelCol="fare_amount", featuresCol="features")
lr_model = lr.fit(train)
preds_lr = lr_model.transform(test)

# RMSE evaluator; the random forest section further down reuses it.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="fare_amount",
    predictionCol="prediction",
)
rmse_lr = evaluator.evaluate(preds_lr)

print("\n✅ Linear Regression Model")
print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)
print("RMSE on test:", rmse_lr)

# Save LR predictions: actual vs. predicted fare, collected to the driver as a
# pandas DataFrame and written as a single CSV file.
lr_csv_path = os.path.join(OUTPUT_DIR, "predictions_linear.csv")
lr_pdf = preds_lr.select("fare_amount", "prediction").toPandas()
lr_pdf.to_csv(lr_csv_path, index=False)

# ======================
# 2) Random Forest Regression
# ======================
# Random forest on the same features/label as the linear model.
# The seed is fixed so the forest's random sampling is repeatable from run to
# run, in line with the seeded train/test split; without it RMSE and feature
# importances can differ between runs.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="fare_amount",
    numTrees=50,
    maxDepth=10,
    seed=42,
)
rf_model = rf.fit(train)
preds_rf = rf_model.transform(test)

# Same RMSE evaluator as the linear model, so the two scores are comparable.
rmse_rf = evaluator.evaluate(preds_rf)

print("\n🌲 Random Forest Model")
# Importances are reported in the order of the assembled feature columns.
print("Feature Importances:", rf_model.featureImportances)
print("RMSE on test:", rmse_rf)

# Save RF predictions (actual vs. predicted fare). toPandas() collects the rows
# to the driver before writing a single CSV file.
preds_rf.select("fare_amount", "prediction").toPandas().to_csv(
    os.path.join(OUTPUT_DIR, "predictions_randomforest.csv"),
    index=False,
)

# ======================
# SUMMARY
# ======================
# Side-by-side test RMSE of both models, then where the CSV exports went.
summary_lines = [
    "\n📊 Final Results",
    f"Linear Regression RMSE: {rmse_lr:.2f}",
    f"Random Forest RMSE: {rmse_rf:.2f}",
]
for line in summary_lines:
    print(line)
print("✅ Predictions saved to:", OUTPUT_DIR)



Spark ready ✨ 4.0.0
Raw row count: 1000
+--------------------+---------------------+---------------+-------------+------------+------------+-----------+
|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|
+--------------------+---------------------+---------------+-------------+------------+------------+-----------+
| 2019-01-01 00:00:00|  2019-01-01 00:05:00|              1|          0.5|           0|           0|        5.0|
| 2019-01-01 00:01:00|  2019-01-01 00:06:00|              2|          1.5|           1|           1|        6.0|
| 2019-01-01 00:02:00|  2019-01-01 00:07:00|              3|          2.5|           2|           2|        7.0|
| 2019-01-01 00:03:00|  2019-01-01 00:08:00|              4|          3.5|           3|           3|        8.0|
| 2019-01-01 00:04:00|  2019-01-01 00:09:00|              1|          4.5|           4|           4|        9.0|
+--------------------+---------------------+------------