In [2]:
import os

from pyspark.sql import SparkSession

In [3]:
# Connection settings for the MinIO (S3-compatible) object store.
# Each value is taken from the environment when the variable is set; the
# fallbacks are the original development defaults, so the notebook behaves
# exactly as before on the local stack. Set the variables to avoid keeping
# real credentials in the notebook.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minio")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minio123")

# Build the Spark session (or reuse one that already exists in this kernel),
# configured to reach the object store through the Hadoop S3A connector.
spark = (
    SparkSession.builder
    .appName("Read GTFS CSVs with Schema from MinIO")
    # S3A connector: endpoint and credentials.
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    # Address buckets by path (endpoint/bucket) instead of by host name.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Resource sizing for this job.
    .config("spark.executor.cores", "8")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/01 15:08:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Reader settings shared by every table: the first line of each CSV holds the
# column names, and Spark is asked to infer the column types from the data.
read_options = {"header": True, "inferSchema": True}


def _read_gtfs_table(table_name):
    """Read `<table_name>.csv` from every sub-directory of `rowdata/` into one DataFrame."""
    source_path = f"s3a://transitbatch/rowdata/*/{table_name}.csv"
    return spark.read.options(**read_options).csv(source_path)


# One DataFrame per GTFS file.
agency_df = _read_gtfs_table("agency")
calendar_df = _read_gtfs_table("calendar")
calendar_dates_df = _read_gtfs_table("calendar_dates")
routes_df = _read_gtfs_table("routes")
shapes_df = _read_gtfs_table("shapes")
stops_df = _read_gtfs_table("stops")
stop_times_df = _read_gtfs_table("stop_times")
trips_df = _read_gtfs_table("trips")

25/08/01 15:08:33 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [5]:
# Report how many rows each loaded table contains.
# Note: every .count() triggers a Spark job over the underlying files.
dataframes = dict(
    agency_df=agency_df,
    calendar_df=calendar_df,
    calendar_dates_df=calendar_dates_df,
    routes_df=routes_df,
    shapes_df=shapes_df,
    stops_df=stops_df,
    stop_times_df=stop_times_df,
    trips_df=trips_df,
)
for label in dataframes:
    n_rows = dataframes[label].count()
    print(f"{label}: {n_rows} rows")

agency_df: 5 rows
calendar_df: 80 rows
calendar_dates_df: 823 rows
routes_df: 1395 rows
shapes_df: 283649 rows
stops_df: 11679 rows
stop_times_df: 5095336 rows
trips_df: 147208 rows


In [6]:
# Replace every table with a copy in which rows that are identical in all
# columns appear only once. The right-hand side reads the current frames
# before any of the names is rebound.
(
    agency_df,
    calendar_df,
    calendar_dates_df,
    routes_df,
    shapes_df,
    stops_df,
    stop_times_df,
    trips_df,
) = (
    frame.dropDuplicates()
    for frame in (
        agency_df,
        calendar_df,
        calendar_dates_df,
        routes_df,
        shapes_df,
        stops_df,
        stop_times_df,
        trips_df,
    )
)


In [7]:
# Rebuild the name -> DataFrame mapping from the current variables and print
# the row count of each table again.
dataframes = {
    f"{stem}_df": frame
    for stem, frame in [
        ("agency", agency_df),
        ("calendar", calendar_df),
        ("calendar_dates", calendar_dates_df),
        ("routes", routes_df),
        ("shapes", shapes_df),
        ("stops", stops_df),
        ("stop_times", stop_times_df),
        ("trips", trips_df),
    ]
}
for name, frame in dataframes.items():
    print(f"{name}: {frame.count()} rows")

agency_df: 1 rows
calendar_df: 80 rows
calendar_dates_df: 823 rows
routes_df: 320 rows


                                                                                

shapes_df: 283649 rows
stops_df: 11507 rows


                                                                                

stop_times_df: 5095336 rows
trips_df: 147208 rows


In [8]:
# ---- End of extract/clean stage: GTFS tables loaded from MinIO and de-duplicated ----

In [None]:
# ---- Load stage: write the de-duplicated GTFS tables to PostgreSQL ----

In [9]:
# JDBC connection settings for the PostgreSQL target database.
# Each value is taken from the environment when the variable is set; the
# fallbacks are the original development defaults, so the notebook behaves
# exactly as before on the local stack. Set the variables to avoid keeping
# real credentials in the notebook.
jdbc_url = os.environ.get(
    "GTFS_JDBC_URL", "jdbc:postgresql://postgres:5432/gtfs_batch"
)
connection_properties = {
    "user": os.environ.get("GTFS_PG_USER", "admin"),
    "password": os.environ.get("GTFS_PG_PASSWORD", "password"),
    # JDBC driver class Spark loads to talk to PostgreSQL.
    "driver": "org.postgresql.Driver",
}

In [10]:
# Write each table to PostgreSQL over JDBC, one after another.
# mode="overwrite" replaces whatever the target table currently contains.
for table_name, frame in (
    ("agency", agency_df),
    ("calendar", calendar_df),
    ("calendar_dates", calendar_dates_df),
    ("routes", routes_df),
    ("shapes", shapes_df),
    ("stops", stops_df),
    ("trips", trips_df),
):
    frame.write.jdbc(
        url=jdbc_url,
        table=table_name,
        mode="overwrite",
        properties=connection_properties,
    )

                                                                                

In [11]:
# stop_times is written separately from the other tables: it is first reduced
# to a single partition and then written to PostgreSQL, replacing the table.
stop_times_single_partition = stop_times_df.coalesce(1)
stop_times_single_partition.write.jdbc(
    url=jdbc_url,
    table="stop_times",
    mode="overwrite",
    properties=connection_properties,
)

                                                                                

In [12]:
import gc

# Ask Spark to drop any cached data for each table, waiting until it is done.
for frame in (
    agency_df,
    calendar_df,
    calendar_dates_df,
    routes_df,
    shapes_df,
    stops_df,
    stop_times_df,
    trips_df,
):
    frame.unpersist(blocking=True)

# Clear Spark's cache
spark.catalog.clearCache()

# Stop Spark
spark.stop()

# Remove the variables from memory
del (
    agency_df,
    calendar_df,
    calendar_dates_df,
    routes_df,
    shapes_df,
    stops_df,
    stop_times_df,
    trips_df,
    spark,
)

# Force garbage collection
gc.collect()

141