In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Obtain the notebook's Spark session (an existing one is reused if present),
# registered under the application name "Ride Trips".
spark = (
    SparkSession.builder
    .appName("Ride Trips")
    .getOrCreate()
)

#Creating DataFrame

In [3]:
# Raw ride-trip records, one tuple per trip. Field order matches `columns`
# below. trip_status is either "Completed" or "Cancelled"; later cells filter
# on the exact string "Completed", so the spelling must be exact.
data = [
    ("T001","Amit","Hyderabad","Ramesh","Sedan",12.5,320,28,"UPI","Completed"),
    ("T002","Neha","Bangalore","Suresh","Mini",8.2,210,22,"Card","Completed"),
    ("T003","Rahul","Delhi","Anil","Bike",5.1,120,15,"Cash","Completed"),
    ("T004","Pooja","Mumbai","Vikas","SUV",18.0,560,45,"UPI","Cancelled"),
    ("T005","Arjun","Chennai","Kumar","Mini",7.8,200,20,"UPI","Completed"),
    ("T006","Sneha","Hyderabad","Ramesh","Sedan",14.2,360,32,"Card","Completed"),
    ("T007","Karan","Delhi","Anil","Bike",6.3,140,18,"UPI","Completed"),
    ("T008","Riya","Bangalore","Suresh","Sedan",11.0,300,27,"Wallet","Completed"),
    ("T009","Vikas","Mumbai","Vijay","SUV",20.5,650,50,"Card","Completed"),
    ("T010","Anjali","Chennai","Kumar","Bike",4.9,110,14,"Cash","Completed"),
    ("T011","Farhan","Delhi","Anil","Mini",9.6,240,25,"UPI","Completed"),
    ("T012","Megha","Hyderabad","Ramesh","SUV",19.2,610,48,"Card","Cancelled"),
    ("T013","Suresh","Bangalore","Suresh","Sedan",13.0,340,30,"UPI","Completed"),
    ("T014","Divya","Mumbai","Vikas","Mini",10.2,260,26,"Wallet","Completed"),
    ("T015","Nikhil","Delhi","Anil","Sedan",15.5,390,34,"UPI","Completed"),
    ("T016","Kavya","Chennai","Kumar","Sedan",12.1,315,29,"UPI","Completed"),
    ("T017","Rohit","Hyderabad","Ramesh","SUV",22.0,700,55,"Card","Completed"),
    ("T018","Simran","Bangalore","Suresh","Bike",5.8,130,16,"Cash","Completed"),
    ("T019","Ayesha","Mumbai","Vijay","Mini",9.9,250,24,"UPI","Completed"),
    ("T020","Manish","Delhi","Anil","Bike",6.0,135,17,"Wallet","Completed"),
    ("T021","Priya","Hyderabad","Ramesh","Sedan",14.8,380,33,"Card","Completed"),
    ("T022","Yash","Chennai","Kumar","SUV",21.3,680,52,"UPI","Completed"),
    ("T023","Naina","Bangalore","Suresh","Mini",10.7,270,28,"UPI","Completed"),
    ("T024","Sameer","Mumbai","Vikas","Sedan",13.9,350,31,"Wallet","Completed"),
    ("T025","Ritika","Delhi","Anil","Bike",5.4,125,16,"Cash","Completed"),
    ("T026","Gopal","Hyderabad","Ramesh","Mini",8.9,225,23,"UPI","Completed"),
    ("T027","Tina","Bangalore","Suresh","Sedan",12.6,330,29,"Card","Completed"),
    ("T028","Irfan","Mumbai","Vijay","SUV",23.4,740,58,"Card","Completed"),
    ("T029","Sahil","Chennai","Kumar","Mini",9.4,235,24,"UPI","Completed"),
    ("T030","Lavanya","Delhi","Anil","Sedan",14.1,365,32,"Wallet","Completed"),
    # T031: status corrected from the misspelled "Complwted", which silently
    # excluded this trip from every `trip_status == "Completed"` filter.
    ("T031","Deepak","Hyderabad","Ramesh","Bike",6.7,150,18,"Cash","Completed"),
    ("T032","Shweta","Bangalore","Suresh","Mini",10.0,255,26,"UPI","Completed"),
    ("T033","Aman","Mumbai","Vikas","Sedan",15.8,395,35,"Card","Completed"),
    ("T034","Rekha","Chennai","Kumar","Sedan",13.5,345,30,"UPI","Completed"),
    ("T035","Zubin","Delhi","Anil","SUV",24.0,760,60,"Card","Completed"),
    ("T036","Pallavi","Hyderabad","Ramesh","Mini",9.1,230,23,"Wallet","Completed"),
    ("T037","Naveen","Bangalore","Suresh","Bike",5.9,135,17,"UPI","Completed"),
    ("T038","Sonia","Mumbai","Vijay","SUV",21.7,690,54,"Card","Completed"),
    ("T039","Harish","Chennai","Kumar","Mini",8.5,215,21,"Cash","Completed"),
    ("T040","Kriti","Delhi","Anil","Sedan",14.6,375,33,"UPI","Completed"),
    ("T041","Apoorva","Hyderabad","Ramesh","Sedan",13.2,335,30,"Card","Completed"),
    ("T042","Mohit","Bangalore","Suresh","SUV",19.9,620,49,"UPI","Completed"),
    ("T043","Tanvi","Mumbai","Vikas","Mini",10.4,265,27,"Wallet","Completed"),
    ("T044","Rakesh","Chennai","Kumar","Bike",6.2,140,18,"Cash","Completed"),
    ("T045","Isha","Delhi","Anil","Mini",9.7,245,25,"UPI","Completed"),
]

# Column names, in the same order as the fields of each tuple in `data`.
columns = [
    "trip_id",
    "rider_name",
    "city",
    "driver_name",
    "vehicle_type",
    "distance_km",
    "trip_fare",
    "trip_duration_minutes",
    "payment_mode",
    "trip_status",
]

# Build the trips DataFrame from the in-memory tuples; only column names are
# supplied, so Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)

# Preview the rows, then print the resulting schema.
df.show()
df.printSchema()

+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|trip_id|rider_name|     city|driver_name|vehicle_type|distance_km|trip_fare|trip_duration_minutes|payment_mode|trip_status|
+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|   T001|      Amit|Hyderabad|     Ramesh|       Sedan|       12.5|      320|                   28|         UPI|  Completed|
|   T002|      Neha|Bangalore|     Suresh|        Mini|        8.2|      210|                   22|        Card|  Completed|
|   T003|     Rahul|    Delhi|       Anil|        Bike|        5.1|      120|                   15|        Cash|  Completed|
|   T004|     Pooja|   Mumbai|      Vikas|         SUV|       18.0|      560|                   45|         UPI|  Cancelled|
|   T005|     Arjun|  Chennai|      Kumar|        Mini|        7.8|      200|                   20|         UPI|  Completed|


#Section A - CSV

In [5]:
# Save every trip as CSV under "trips_csv" with a header row, replacing any
# output already present at that path.
(
    df.write
    .mode("overwrite")
    .option("header", True)
    .csv("trips_csv")
)

In [6]:
"""Read the CSV and filter:
trip_fare > 400
trip_status = "Completed" """

# Load the CSV back, taking column names from the header row and letting Spark
# infer each column's type from the values.
df_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("trips_csv")
)

# Show only completed trips whose fare is above 400.
is_high_fare = col("trip_fare") > 400
is_completed = col("trip_status") == "Completed"
df_csv.filter(is_high_fare & is_completed).show()

+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|trip_id|rider_name|     city|driver_name|vehicle_type|distance_km|trip_fare|trip_duration_minutes|payment_mode|trip_status|
+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|   T028|     Irfan|   Mumbai|      Vijay|         SUV|       23.4|      740|                   58|        Card|  Completed|
|   T035|     Zubin|    Delhi|       Anil|         SUV|       24.0|      760|                   60|        Card|  Completed|
|   T038|     Sonia|   Mumbai|      Vijay|         SUV|       21.7|      690|                   54|        Card|  Completed|
|   T042|     Mohit|Bangalore|     Suresh|         SUV|       19.9|      620|                   49|         UPI|  Completed|
|   T009|     Vikas|   Mumbai|      Vijay|         SUV|       20.5|      650|                   50|        Card|  Completed|


In [7]:
"""From CSV, select:
trip_id
city
vehicle_type
trip_fare
Sort by trip_fare descending."""

# Project the four requested columns and list the most expensive trips first.
(
    df_csv
    .select("trip_id", "city", "vehicle_type", "trip_fare")
    .orderBy(col("trip_fare").desc())
    .show()
)

+-------+---------+------------+---------+
|trip_id|     city|vehicle_type|trip_fare|
+-------+---------+------------+---------+
|   T035|    Delhi|         SUV|      760|
|   T028|   Mumbai|         SUV|      740|
|   T017|Hyderabad|         SUV|      700|
|   T038|   Mumbai|         SUV|      690|
|   T022|  Chennai|         SUV|      680|
|   T009|   Mumbai|         SUV|      650|
|   T042|Bangalore|         SUV|      620|
|   T012|Hyderabad|         SUV|      610|
|   T004|   Mumbai|         SUV|      560|
|   T033|   Mumbai|       Sedan|      395|
|   T015|    Delhi|       Sedan|      390|
|   T021|Hyderabad|       Sedan|      380|
|   T040|    Delhi|       Sedan|      375|
|   T030|    Delhi|       Sedan|      365|
|   T006|Hyderabad|       Sedan|      360|
|   T024|   Mumbai|       Sedan|      350|
|   T034|  Chennai|       Sedan|      345|
|   T013|Bangalore|       Sedan|      340|
|   T041|Hyderabad|       Sedan|      335|
|   T027|Bangalore|       Sedan|      330|
+-------+--

In [8]:
# Export only the Bike trips as pipe-delimited CSV, header row included.
bike_trips = df_csv.filter(col("vehicle_type") == "Bike")
(
    bike_trips.write
    .mode("overwrite")
    .option("header", True)
    .option("delimiter", "|")
    .csv("bike_trips_csv")
)

#Section B - JSON

In [9]:
# Write only the trips from Mumbai to JSON.
(
    df.filter(col("city") == "Mumbai")
    .write
    .mode("overwrite")
    .json("mumbai_trips_json")
)

# The following JSON cells read "trips_json" and work on trips from every city,
# so the full dataset is still written to that path. The "header" option was
# dropped because it is a CSV setting and has no effect on JSON output.
df.write.mode("overwrite").json("trips_json")

In [12]:
"""Read JSON and add a column:
fare_per_km = trip_fare / distance_km
Write back to JSON."""

# Load the JSON trips and derive the fare charged per kilometre in one step.
json_df = (
    spark.read.json("trips_json")
    .withColumn("fare_per_km", col("trip_fare") / col("distance_km"))
)

# Persist the enriched rows as a separate JSON dataset.
(
    json_df.write
    .mode("overwrite")
    .json("updated_trips_json")
)

In [13]:
"""Filter JSON data:
payment_mode = "Card"
vehicle_type = "SUV" """

# Show the SUV trips that were paid for by card.
paid_by_card = col("payment_mode") == "Card"
is_suv = col("vehicle_type") == "SUV"
json_df.filter(paid_by_card & is_suv).show()

+---------+-----------+-----------+------------+----------+---------------------+---------+-------+-----------+------------+------------------+
|     city|distance_km|driver_name|payment_mode|rider_name|trip_duration_minutes|trip_fare|trip_id|trip_status|vehicle_type|       fare_per_km|
+---------+-----------+-----------+------------+----------+---------------------+---------+-------+-----------+------------+------------------+
|   Mumbai|       23.4|      Vijay|        Card|     Irfan|                   58|      740|   T028|  Completed|         SUV|31.623931623931625|
|    Delhi|       24.0|       Anil|        Card|     Zubin|                   60|      760|   T035|  Completed|         SUV|31.666666666666668|
|   Mumbai|       21.7|      Vijay|        Card|     Sonia|                   54|      690|   T038|  Completed|         SUV|31.797235023041477|
|   Mumbai|       20.5|      Vijay|        Card|     Vikas|                   50|      650|   T009|  Completed|         SUV| 31.70731707

In [14]:
# Collapse the data to one partition before writing so the JSON output can be
# inspected as a single-partition result.
single_partition_df = json_df.coalesce(1)
(
    single_partition_df.write
    .mode("overwrite")
    .json("single_partition_trip_json")
)

#Section C - Parquet

In [16]:
# Store the complete trips dataset in Parquet format.
(
    df.write
    .mode("overwrite")
    .parquet("trips_parquet")
)

In [17]:
# Load the Parquet copy and show the trips that lasted longer than 45 minutes.
parquet_df = spark.read.parquet("trips_parquet")
long_trips = parquet_df.filter(col("trip_duration_minutes") > 45)
long_trips.show()

+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|trip_id|rider_name|     city|driver_name|vehicle_type|distance_km|trip_fare|trip_duration_minutes|payment_mode|trip_status|
+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|   T028|     Irfan|   Mumbai|      Vijay|         SUV|       23.4|      740|                   58|        Card|  Completed|
|   T035|     Zubin|    Delhi|       Anil|         SUV|       24.0|      760|                   60|        Card|  Completed|
|   T038|     Sonia|   Mumbai|      Vijay|         SUV|       21.7|      690|                   54|        Card|  Completed|
|   T042|     Mohit|Bangalore|     Suresh|         SUV|       19.9|      620|                   49|         UPI|  Completed|
|   T009|     Vikas|   Mumbai|      Vijay|         SUV|       20.5|      650|                   50|        Card|  Completed|


In [18]:
# Rank trips from longest to shortest distance and save the first ten as Parquet.
longest_first = parquet_df.orderBy(col("distance_km").desc())
(
    longest_first.limit(10)
    .write
    .mode("overwrite")
    .parquet("top10_distance_trips")
)

#Format Conversion

In [19]:
# CSV -> Parquet: re-save the DataFrame that was loaded from CSV as Parquet.
(
    df_csv.write
    .mode("overwrite")
    .parquet("trips_csv_to_parquet")
)

In [20]:
# JSON -> Parquet: re-save the JSON-sourced DataFrame (with its fare_per_km
# column) as Parquet.
(
    json_df.write
    .mode("overwrite")
    .parquet("trips_json_to_parquet")
)

In [21]:
# Parquet -> CSV: write the DataFrame read from Parquet back out as
# comma-delimited CSV with a header row.
(
    parquet_df.write
    .mode("overwrite")
    .option("header", True)
    .option("delimiter", ",")
    .csv("trips_parquet_to_csv")
)


#Analytical Thinking Questions

In [22]:
# Which city generates the highest total trip_fare?
# The aggregate is referenced as F.sum instead of importing `sum` by name:
# `from pyspark.sql.functions import sum` would replace Python's built-in
# sum() for every cell executed afterwards in this kernel.
city_revenue = df.groupBy("city").agg(F.sum("trip_fare").alias("total_fare"))

# Highest-revenue city first; only the top row is displayed.
city_revenue.orderBy(col("total_fare").desc()).show(1)

+------+----------+
|  city|total_fare|
+------+----------+
|Mumbai|      4160|
+------+----------+
only showing top 1 row


In [23]:
# Which vehicle_type has the highest average fare?
from pyspark.sql.functions import avg

# Mean fare per vehicle type, ranked from highest to lowest; show the top row.
avg_fare_by_vehicle = df.groupBy("vehicle_type").agg(
    avg("trip_fare").alias("avg_fare")
)
avg_fare_by_vehicle.orderBy(col("avg_fare").desc()).show(1)

+------------+-----------------+
|vehicle_type|         avg_fare|
+------------+-----------------+
|         SUV|667.7777777777778|
+------------+-----------------+
only showing top 1 row


In [24]:
# Which driver has completed the most trips?
# Count completed trips per driver and show the driver with the largest count.
completed_trips = df.filter(col("trip_status") == "Completed")
trips_per_driver = completed_trips.groupBy("driver_name").count()
trips_per_driver.orderBy(col("count").desc()).show(1)

+-----------+-----+
|driver_name|count|
+-----------+-----+
|       Anil|   10|
+-----------+-----+
only showing top 1 row


#Optional Challenge

In [25]:
# Redistribute the rows across 4 partitions, then write them out as Parquet.
(
    df.repartition(4)
    .write
    .mode("overwrite")
    .parquet("trips_repartitioned4")
)

In [26]:
"""Create a summary dataset with:
city
total_trips
total_revenue

average_trip_duration
Write it to Parquet."""

# Per-city summary: number of trips, total revenue and mean trip duration.
# Aggregates are referenced through F rather than imported by name, because
# `from pyspark.sql.functions import sum` replaces Python's built-in sum()
# for the rest of the kernel session.
summary_df = df.groupBy("city").agg(
    F.count("trip_id").alias("total_trips"),
    F.sum("trip_fare").alias("total_revenue"),
    F.avg("trip_duration_minutes").alias("average_trip_duration"),
)

# Persist the summary as Parquet, then display it.
summary_df.write.mode("overwrite").parquet("summary_parquet")

summary_df.show()

+---------+-----------+-------------+---------------------+
|     city|total_trips|total_revenue|average_trip_duration|
+---------+-----------+-------------+---------------------+
|Bangalore|          9|         2590|    27.11111111111111|
|  Chennai|          8|         2240|                 26.0|
|   Mumbai|          9|         4160|   38.888888888888886|
|    Delhi|         10|         2895|                 27.5|
|Hyderabad|          9|         3310|    32.22222222222222|
+---------+-----------+-------------+---------------------+

