In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date


In [2]:
# ---------- Spark session ----------
# Session settings are kept in one mapping and applied in order.
# "spark.jars.packages" requests the PostgreSQL and ClickHouse JDBC driver
# artifacts that the later read/write cells refer to by driver class name.
spark_settings = {
    "spark.jars.packages": "org.postgresql:postgresql:42.7.7,com.clickhouse:clickhouse-jdbc:0.4.6",
    "spark.executor.cores": "8",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "8g",
}

session_builder = SparkSession.builder.appName("Read from PostgreSQL")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

# getOrCreate() returns an already-running session if there is one.
spark = session_builder.getOrCreate()

In [3]:
# PostgreSQL connection config.
# Each value is read from an environment variable when one is set, so real
# credentials do not have to be stored in the notebook. The fallbacks are the
# original local-development values, so an unconfigured run behaves as before.
jdbc_url_pg = os.environ.get(
    "GTFS_PG_JDBC_URL", "jdbc:postgresql://postgres:5432/gtfs_batch"
)
pg_properties = {
    "user": os.environ.get("GTFS_PG_USER", "admin"),
    "password": os.environ.get("GTFS_PG_PASSWORD", "password"),
    "driver": "org.postgresql.Driver",
}

In [4]:
# Load each source table into a DataFrame, keeping only rows where
# is_current = true. The filter is written as an aliased subquery and passed as
# the JDBC "table", so it is part of the query sent to PostgreSQL.

def _read_current(table_name):
    """Read the rows of `table_name` whose `is_current` flag is true.

    Uses the notebook-level `spark`, `jdbc_url_pg` and `pg_properties`.
    """
    current_rows_query = (
        f"(SELECT * FROM {table_name} WHERE is_current = true) AS {table_name}"
    )
    return spark.read.jdbc(
        url=jdbc_url_pg,
        table=current_rows_query,
        properties=pg_properties,
    )


agency_df = _read_current("agency")
calendar_df = _read_current("calendar")
calendar_dates_df = _read_current("calendar_dates")
routes_df = _read_current("routes")
shapes_df = _read_current("shapes")
stops_df = _read_current("stops")
stop_times_df = _read_current("stop_times")
trips_df = _read_current("trips")


# agency_df         = spark.read.jdbc(url=jdbc_url_pg, table="agency", properties=pg_properties)
# calendar_df       = spark.read.jdbc(url=jdbc_url_pg, table="calendar", properties=pg_properties)
# calendar_dates_df = spark.read.jdbc(url=jdbc_url_pg, table="calendar_dates", properties=pg_properties)
# routes_df         = spark.read.jdbc(url=jdbc_url_pg, table="routes", properties=pg_properties)
# shapes_df         = spark.read.jdbc(url=jdbc_url_pg, table="shapes", properties=pg_properties)
# stops_df          = spark.read.jdbc(url=jdbc_url_pg, table="stops", properties=pg_properties)
# stop_times_df     = spark.read.jdbc(url=jdbc_url_pg, table="stop_times", properties=pg_properties)
# trips_df          = spark.read.jdbc(url=jdbc_url_pg, table="trips", properties=pg_properties)

In [None]:
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================

In [None]:
from pyspark.sql.functions import date_format, current_date, col, dayofweek, lit

# Build "today's" slice of the feed: the services active today, then the trips,
# stop times, routes, shapes, agencies and stops reachable from those services.

# 1. Today's date and weekday number, fetched in ONE Spark query so both values
#    always describe the same day (two separate jobs could straddle midnight).
#    dayofweek() numbering: Sunday=1 ... Saturday=7.
_today_row = spark.range(1).select(
    current_date().alias("today"),
    dayofweek(current_date()).alias("dow"),
).first()
today_date = _today_row["today"]  # a date value, not a yyyyMMdd string/integer
today_dow = _today_row["dow"]
today_str = today_date  # original variable name, kept for any code that still uses it

# NOTE(review): the comparisons below treat start_date / end_date / date as DATE
# columns. If the source stores them as yyyyMMdd integers instead, compare
# against int(date_format(current_date(), "yyyyMMdd")) -- confirm the schema.
today_lit = lit(today_date).cast("date")

# 2. Weekday number -> name of the calendar column that flags that weekday.
days_map = {1: "sunday", 2: "monday", 3: "tuesday", 4: "wednesday",
            5: "thursday", 6: "friday", 7: "saturday"}

# 3. Regular calendar entries valid today: date range covers today AND the
#    column for today's weekday equals 1.
calendar_today_df = calendar_df.filter(
    (col("start_date") <= today_lit) &
    (col("end_date") >= today_lit) &
    (col(days_map[today_dow]) == 1)
)

# 4. Calendar exceptions dated today (all exception types; exported as-is).
calendar_dates_today_df = calendar_dates_df.filter(col("date") == today_lit)

# 5. Services running today.
#    In GTFS, calendar_dates.exception_type = 1 ADDS a service on that date and
#    exception_type = 2 REMOVES it. Unioning every exception row would wrongly
#    count removed services as running, so: (regular + added) - removed.
if "exception_type" in calendar_dates_today_df.columns:
    _added_services = calendar_dates_today_df \
        .filter(col("exception_type") == 1) \
        .select("service_id")
    _removed_services = calendar_dates_today_df \
        .filter(col("exception_type") == 2) \
        .select("service_id") \
        .distinct()
    service_today_df = calendar_today_df.select("service_id") \
        .union(_added_services) \
        .distinct() \
        .join(_removed_services, "service_id", "left_anti")
else:
    # No exception_type column available: fall back to the plain union of both sources.
    service_today_df = calendar_today_df.select("service_id") \
        .union(calendar_dates_today_df.select("service_id")) \
        .distinct()

# 6. Trips belonging to today's services (original trips columns only).
trips_today_df = trips_df.join(service_today_df, "service_id", "inner").select(trips_df["*"])

# 7. Remaining DataFrames for the day (original columns only). distinct() removes
#    the row multiplication caused by joining against many trips / stop times.
stop_times_today_df = stop_times_df.join(trips_today_df, "trip_id", "inner").select(stop_times_df["*"]).distinct()
routes_today_df = routes_df.join(trips_today_df, "route_id", "inner").select(routes_df["*"]).distinct()
shapes_today_df = shapes_df.join(trips_today_df, "shape_id", "inner").select(shapes_df["*"]).distinct()
agency_today_df = agency_df.join(routes_today_df, "agency_id", "inner").select(agency_df["*"]).distinct()
stops_today_df = stops_df.join(stop_times_today_df, "stop_id", "inner").select(stops_df["*"]).distinct()

# 8. Preview the first 3 rows of every result, each preceded by a separator line.
_separator = "=========================================================================="
for _preview_df in (
    agency_today_df,
    calendar_today_df,
    calendar_dates_today_df,
    routes_today_df,
    shapes_today_df,
    stops_today_df,
    stop_times_today_df,
    trips_today_df,
):
    print(_separator)
    _preview_df.show(3, truncate=False)
print(_separator)


In [None]:
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================
#=================================================================================================================

In [None]:
# ClickHouse connection config.
# Each value is read from an environment variable when one is set, so real
# credentials do not have to be stored in the notebook. The fallbacks are the
# original local-development values, so an unconfigured run behaves as before.
clickhouse_url = os.environ.get(
    "GTFS_CLICKHOUSE_JDBC_URL", "jdbc:clickhouse://clickhouse:8123/gtfs_batch"
)
clickhouse_properties = {
    "user": os.environ.get("GTFS_CLICKHOUSE_USER", "default"),
    "password": os.environ.get("GTFS_CLICKHOUSE_PASSWORD", "123"),
    "driver": "com.clickhouse.jdbc.ClickHouseDriver",
}

In [None]:
# calendar_today_df = calendar_today_df.withColumn("start_date", to_date("start_date", "yyyyMMdd"))
# calendar_today_df = calendar_today_df.withColumn("end_date", to_date("end_date", "yyyyMMdd"))
# calendar_dates_today_df = calendar_dates_today_df.withColumn("date", to_date("date", "yyyyMMdd"))

In [None]:
# Append today's tables to ClickHouse, in this order. agency is written with its
# existing partitioning; every other frame is first repartitioned into 50
# partitions. repartition() is lazy, so nothing runs until the write below.
clickhouse_exports = [
    ("agency", agency_today_df),
    ("calendar", calendar_today_df.repartition(50)),
    ("calendar_dates", calendar_dates_today_df.repartition(50)),
    ("routes", routes_today_df.repartition(50)),
    ("shapes", shapes_today_df.repartition(50)),
    ("stops", stops_today_df.repartition(50)),
    ("trips", trips_today_df.repartition(50)),
]

for target_table, export_df in clickhouse_exports:
    # mode="append": rows are added to whatever the target table already holds.
    export_df.write.jdbc(
        clickhouse_url,
        target_table,
        mode="append",
        properties=clickhouse_properties,
    )

                                                                                

In [None]:
# stop_times is exported in its own cell: split into 50 partitions, then
# appended to the ClickHouse "stop_times" table.
stop_times_export_df = stop_times_today_df.repartition(50)
stop_times_export_df.write.jdbc(
    clickhouse_url,
    "stop_times",
    mode="append",
    properties=clickhouse_properties,
)

[Stage 7:>                                                          (0 + 1) / 1]

In [6]:
import gc

# Names of every DataFrame variable this notebook creates.
TRACKED_FRAME_NAMES = (
    "agency_df", "calendar_df", "calendar_dates_df", "routes_df",
    "shapes_df", "stops_df", "stop_times_df", "trips_df",
    "agency_today_df", "calendar_today_df", "calendar_dates_today_df", "routes_today_df",
    "shapes_today_df", "stops_today_df", "stop_times_today_df", "trips_today_df",
)


def release_spark_resources(namespace, frame_names=TRACKED_FRAME_NAMES):
    """Unpersist the notebook's DataFrames, stop Spark and drop the variables.

    Args:
        namespace: mapping of variable names to objects (pass ``globals()``).
        frame_names: names of the DataFrame variables to release.

    Names missing from ``namespace`` are skipped instead of raising NameError,
    so the cell also works when an earlier cell was not run or when this cell
    is run a second time. The session under ``"spark"`` is stopped in a
    ``finally`` block, so it is shut down even if unpersisting fails.
    """
    session = namespace.get("spark")
    try:
        for frame_name in frame_names:
            frame = namespace.get(frame_name)
            if frame is not None:
                frame.unpersist(blocking=True)
        if session is not None:
            # Clear cache from Spark
            session.catalog.clearCache()
    finally:
        if session is not None:
            # Stop Spark
            session.stop()
        # Deleting variables from memory
        for stale_name in (*frame_names, "spark"):
            namespace.pop(stale_name, None)


release_spark_resources(globals())

# Garbage collection imposed
gc.collect()

2390