In [None]:
https://www.youtube.com/watch?v=EjzhMv0E_FE&list=PLBTZqjSKn0IeKBQDjLmzisazhqQy4iGkb&index=7

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType
from pyspark.sql.functions import *
from pyspark.sql.window import Window
spark = SparkSession.builder.appName("PySparkTables").getOrCreate()

# Define schema for Trips table
trips_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("client_id", IntegerType(), True),
    StructField("driver_id", IntegerType(), True),
    StructField("city_id", IntegerType(), True),
    StructField("status", StringType(), True),
    StructField("request_at", StringType(), True)
])

# Define schema for Users table
users_schema = StructType([
    StructField("users_id", IntegerType(), True),
    StructField("banned", StringType(), True),
    StructField("role", StringType(), True)
])

# Create Trips DataFrame
trips_data = [
    (1, 1, 10, 1, "completed", "2013-10-01"),
    (2, 2, 11, 1, "cancelled_by_driver", "2013-10-01"),
    (3, 3, 12, 6, "completed", "2013-10-01"),
    (4, 4, 13, 6, "cancelled_by_client", "2013-10-01"),
    (5, 1, 10, 1, "completed", "2013-10-02"),
    (6, 2, 11, 6, "completed", "2013-10-02"),
    (7, 3, 12, 6, "completed", "2013-10-02"),
    (8, 2, 12, 12, "completed", "2013-10-03"),
    (9, 3, 10, 12, "completed", "2013-10-03"),
    (10, 4, 13, 12, "cancelled_by_driver", "2013-10-03")
]

trips_df = spark.createDataFrame(trips_data, schema=trips_schema)

# Create Users DataFrame
users_data = [
    (1, "No", "client"),
    (2, "Yes", "client"),
    (3, "No", "client"),
    (4, "No", "client"),
    (10, "No", "driver"),
    (11, "No", "driver"),
    (12, "No", "driver"),
    (13, "No", "driver")
]

users_df = spark.createDataFrame(users_data, schema=users_schema)

# Create temporary views for SQL queries
trips_df.createOrReplaceTempView("Trips")
users_df.createOrReplaceTempView("Users")



print("Trips and Users tables and views created successfully.")


Trips and Users tables and views created successfully.


In [20]:
spark.sql("""
    select 
    request_at, 
    SUM(case when status in ("cancelled_by_client", "cancelled_by_driver") then 1 else 0 end) as cancelled_trip_count,
    count(1) as total_trips
    from Trips t 
    inner join Users c on t.client_id = c.users_id
    inner join Users d on t.driver_id = d.users_id
    where c.banned = "No" and d.banned = "No"
    group by request_at
""").show()

+----------+--------------------+-----------+
|request_at|cancelled_trip_count|total_trips|
+----------+--------------------+-----------+
|2013-10-03|                   0|          2|
|2013-10-01|                   1|          3|
|2013-10-02|                   0|          2|
+----------+--------------------+-----------+



In [25]:
cancelled_df = trips_df.alias("t").join(users_df.alias("c"), col("t.client_id") == col("c.users_id"), "inner") \
                  .join(users_df.alias("d"), col("t.driver_id") == col("d.users_id"), "inner") \
                 .filter((col("c.banned") == "No") & (col("d.banned") == "No"))

In [32]:
cancelled_df.groupBy(col("request_at")).agg(
        sum(when(col("status").isin(["cancelled_by_client", "cancelled_by_driver"]), 1).otherwise(0)).alias("total"),
        count("*").alias("total_trips")
) \
.orderBy(col("request_at")).show()

+----------+-----+-----------+
|request_at|total|total_trips|
+----------+-----+-----------+
|2013-10-01|    1|          3|
|2013-10-02|    0|          2|
|2013-10-03|    1|          2|
+----------+-----+-----------+

