<a href="https://colab.research.google.com/github/Anish-jx/Mini-Project/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType
from pyspark.sql import functions as F

# Obtain the Spark entry point: reuse the already-active session when there is
# one, otherwise start a new session named "UberOlaCaseStudy".
spark = (
    SparkSession.builder
    .appName("UberOlaCaseStudy")
    .getOrCreate()
)
# Sample trips as positional tuples. Field order must match `schema` below:
# (Trip_ID, Pickup_DateTime, Drop_DateTime, Pickup_Location, Drop_Location,
#  City, Fare_Amount, Payment_Type)
data = [
    (1, "2023-09-01 08:15:00", "2023-09-01 08:45:00", "Koramangala", "Whitefield", "Bangalore", 350.0, "Online"),
    (2, "2023-09-01 09:10:00", "2023-09-01 09:40:00", "Andheri", "Bandra", "Mumbai", 220.0, "Cash"),
    (3, "2023-09-01 18:30:00", "2023-09-01 19:00:00", "Indiranagar", "Electronic City", "Bangalore", 480.0, "Online"),
    (4, "2023-09-02 19:15:00", "2023-09-02 19:45:00", "Powai", "Thane", "Mumbai", 560.0, "Online"),
    (5, "2023-09-03 08:45:00", "2023-09-03 09:05:00", "Salt Lake", "Howrah", "Kolkata", 150.0, "Cash"),
    (6, "2023-09-03 09:30:00", "2023-09-03 10:00:00", "Koramangala", "MG Road", "Bangalore", 200.0, "Online"),
    (7, "2023-09-04 18:45:00", "2023-09-04 19:30:00", "Andheri", "Airport", "Mumbai", 600.0, "Cash"),
    (8, "2023-09-04 20:15:00", "2023-09-04 20:45:00", "Salt Lake", "Airport", "Kolkata", 300.0, "Online"),
    (9, "2023-09-05 07:50:00", "2023-09-05 08:20:00", "Indiranagar", "Koramangala", "Bangalore", 180.0, "Cash"),
    (10, "2023-09-05 09:15:00", "2023-09-05 09:45:00", "Bandra", "Dadar", "Mumbai", 250.0, "Online"),
]

# Explicit, fully nullable schema. Both date-time fields are loaded as plain
# strings here. Trip_ID is declared as a string even though the sample tuples
# carry Python ints; the recorded output of this cell shows the column typed
# as string.
schema = (
    StructType()
    .add("Trip_ID", StringType(), True)
    .add("Pickup_DateTime", StringType(), True)
    .add("Drop_DateTime", StringType(), True)
    .add("Pickup_Location", StringType(), True)
    .add("Drop_Location", StringType(), True)
    .add("City", StringType(), True)
    .add("Fare_Amount", DoubleType(), True)
    .add("Payment_Type", StringType(), True)
)

df = spark.createDataFrame(data, schema=schema)

# Dump the schema and the full, untruncated rows as a sanity check.
print("Schema:")
df.printSchema()
df.show(truncate=False)

# Enrich the raw trips with parsed/derived time columns, then keep only trips
# with a strictly positive fare.
#   Pickup_TS / Drop_TS : the date-time strings parsed to timestamps
#   pickup_date         : calendar date of the pickup
#   pickup_hour         : hour of day of the pickup
# All original columns are kept; the four new ones are appended in this order.
df = (
    df.select(
        "*",
        F.to_timestamp(F.col("Pickup_DateTime")).alias("Pickup_TS"),
        F.to_timestamp(F.col("Drop_DateTime")).alias("Drop_TS"),
        F.to_date(F.col("Pickup_DateTime")).alias("pickup_date"),
        F.hour(F.col("Pickup_DateTime")).alias("pickup_hour"),
    )
    .where(F.col("Fare_Amount") > 0)
)


# Rank locations by number of trips and show the ten busiest.
# Sorting on trip_count alone leaves tied locations in arbitrary order, so the
# rows surviving limit(10) (and their order) could differ between runs; the
# location name is added as a secondary key to make the ranking deterministic.
print("Top 10 busiest pickup locations:")
(
    df.groupBy("Pickup_Location")
    .agg(F.count("*").alias("trip_count"))
    .orderBy(F.desc("trip_count"), F.asc("Pickup_Location"))
    .limit(10)
    .show()
)

print("Top 10 busiest drop locations:")
(
    df.groupBy("Drop_Location")
    .agg(F.count("*").alias("trip_count"))
    .orderBy(F.desc("trip_count"), F.asc("Drop_Location"))
    .limit(10)
    .show()
)

# Per-city metrics, each printed as its own table: number of trips, summed
# fare, and mean fare.
print("Total rides per city:")
(
    df.groupBy("City")
    .count()
    .withColumnRenamed("count", "total_rides")
    .show()
)

print("Total revenue per city:")
(
    df.groupBy("City")
    .agg(F.sum(F.col("Fare_Amount")).alias("total_revenue"))
    .show()
)

print("Average fare per trip per city:")
(
    df.groupBy("City")
    .agg(F.avg(F.col("Fare_Amount")).alias("avg_fare"))
    .show()
)

# Trip volume by pickup hour, listed in ascending hour order.
print("Trips per hour of day:")
(
    df.groupBy("pickup_hour")
    .count()
    .withColumnRenamed("count", "trips")
    .sort(F.col("pickup_hour").asc())
    .show()
)

# Peak-window counts. Both bounds are inclusive on the hour value, so the
# morning window covers pickups with hour 8, 9 or 10 and the evening window
# covers pickups with hour 18 through 21.
morning = df.where(F.col("pickup_hour").between(8, 10)).count()
evening = df.where(F.col("pickup_hour").between(18, 21)).count()
print(f"Morning (8–10 AM) trips: {morning}")
print(f"Evening (6–9 PM) trips: {evening}")

# Payment-method breakdown. The heading promises a share, so alongside the raw
# trip count per payment type we also report that count as a percentage of all
# trips (rounded to two decimals). The original "count" column is kept as-is.
print("Payment share (Cash vs Online):")
(
    df.groupBy("Payment_Type")
    .agg(F.count("*").alias("count"))
    .withColumn(
        "share_pct",
        # df.count() runs once here, while the expression is being built.
        F.round(F.col("count") * 100.0 / F.lit(df.count()), 2),
    )
    .show()
)


Schema:
root
 |-- Trip_ID: string (nullable = true)
 |-- Pickup_DateTime: string (nullable = true)
 |-- Drop_DateTime: string (nullable = true)
 |-- Pickup_Location: string (nullable = true)
 |-- Drop_Location: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Fare_Amount: double (nullable = true)
 |-- Payment_Type: string (nullable = true)

+-------+-------------------+-------------------+---------------+---------------+---------+-----------+------------+
|Trip_ID|Pickup_DateTime    |Drop_DateTime      |Pickup_Location|Drop_Location  |City     |Fare_Amount|Payment_Type|
+-------+-------------------+-------------------+---------------+---------------+---------+-----------+------------+
|1      |2023-09-01 08:15:00|2023-09-01 08:45:00|Koramangala    |Whitefield     |Bangalore|350.0      |Online      |
|2      |2023-09-01 09:10:00|2023-09-01 09:40:00|Andheri        |Bandra         |Mumbai   |220.0      |Cash        |
|3      |2023-09-01 18:30:00|2023-09-01 19:00:00|Indira