# Spark MLlib Assignment
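This notebook completes the assignment step by step with PySpark and MLlib: it loads the 2019 taxi dataset, trains a decision-tree regression pipeline that predicts `total_amount` from the passenger count and the pickup/drop-off location IDs, and reports the RMSE on a held-out test split.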

In [3]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

import findspark
findspark.init()

In [4]:

from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("TaxiFarePrediction")
    .getOrCreate()
)

# Load the CSV (first row is the header, column types are inferred), keep only
# the three input columns plus the `total_amount` label, and drop rows that
# contain nulls.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("2019 Taxi dataset.csv")
    .select("passenger_count", "PULocationID", "DOLocationID", "total_amount")
    .dropna()
)

# Preview the first ten rows.
df.show(10)

+---------------+------------+------------+------------+
|passenger_count|PULocationID|DOLocationID|total_amount|
+---------------+------------+------------+------------+
|              1|         239|         239|         8.8|
|              1|         230|         100|         8.3|
|              1|          68|         127|       47.75|
|              1|          68|          68|         7.3|
|              1|          50|          42|       23.15|
|              1|          95|         196|         9.8|
|              1|         211|         211|         6.8|
|              1|         237|         162|         7.8|
|              1|         148|          37|        20.3|
|              1|         265|         265|        0.31|
+---------------+------------+------------+------------+



In [5]:

# 80/20 train/test split; the fixed seed keeps the split repeatable across runs.
split_parts = df.randomSplit(weights=[0.8, 0.2], seed=42)
trainDF, testDF = split_parts

In [6]:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# Stage 1: pack the three input columns into a single vector column, "features".
assembler = VectorAssembler(
    inputCols=["passenger_count", "PULocationID", "DOLocationID"],
    outputCol="features",
)

# Stage 2: decision-tree regressor that reads "features" and learns to predict
# the `total_amount` label.
dt = DecisionTreeRegressor(
    labelCol="total_amount",
    featuresCol="features",
)

In [7]:
from pyspark.ml import Pipeline

# Chain feature assembly and the regressor so they are fitted and applied as one unit.
pipeline = Pipeline().setStages([assembler, dt])

In [8]:
# Fit every pipeline stage on the training split; the result is the fitted
# pipeline model used for scoring below.
model = pipeline.fit(dataset=trainDF)

In [9]:
# Score the held-out split; the fitted pipeline adds a "prediction" column.
predictions = model.transform(dataset=testDF)

# Show the inputs, the true label, and the predicted value side by side.
(
    predictions
    .select("passenger_count", "PULocationID", "DOLocationID", "total_amount", "prediction")
    .show(10)
)

+---------------+------------+------------+------------+-----------------+
|passenger_count|PULocationID|DOLocationID|total_amount|       prediction|
+---------------+------------+------------+------------+-----------------+
|              1|          68|         127|       47.75|              7.3|
|              1|         230|         100|         8.3|              7.8|
|              1|         239|         239|         8.8|6.800000000000001|
+---------------+------------+------------+------------+-----------------+



In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the "prediction" column against the `total_amount` label using
# root mean squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="total_amount",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(dataset=predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 23.384129233306936
