In [1]:
from pyspark.sql import SparkSession
import sys
import os

In [2]:
def create_spark_session():
    """Build (or fetch) the SparkSession used by this notebook.

    The builder is given the application name, turns on adaptive query
    execution and adaptive partition coalescing, and requests the PostgreSQL
    JDBC driver package. Once the session has been obtained, the automatic
    broadcast-join threshold is set to "-1" on the session's runtime conf.

    Returns:
        The session object returned by ``getOrCreate()``.
    """
    builder_settings = (
        ("spark.sql.adaptive.enabled", "true"),
        ("spark.jars.packages", "org.postgresql:postgresql:42.6.0"),
        ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
    )

    builder = SparkSession.builder.appName("Spark version for actors and events")
    for key, value in builder_settings:
        builder = builder.config(key, value)
    session = builder.getOrCreate()

    # Disable automatic broadcast join threshold as required
    session.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

    return session

In [7]:
# 1. Initialize Spark
spark = create_spark_session()

# 2. Define Postgres connection info
# Connection settings may be overridden through environment variables so that
# real credentials never have to be typed into (or committed with) the
# notebook. The fallbacks are the local-development values this notebook
# originally hard-coded, so behaviour is unchanged when nothing is exported.
url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5434/postgres")
properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
}

In [11]:
# Reuse the notebook's single session factory instead of configuring a second
# builder here. With a session already running, a fresh
# SparkSession.builder...getOrCreate() call returns that existing session, so a
# different appName or a different PostgreSQL driver version passed through
# spark.jars.packages would not take effect and would only mislead readers.
spark = create_spark_session()

In [8]:
# Read the "game_details" table over JDBC using the connection settings
# (url / properties) defined earlier in the notebook.
df = spark.read.jdbc(
    url,
    "game_details",
    properties=properties,
)

# Preview the first 5 rows
df.show(n=5)

25/08/31 18:22:29 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------+----------+-----------------+---------+---------+----------------+--------+--------------+--------------------+-----+----+----+------+----+----+-------+----+----+------+----+----+----+----+----+----+----+----+----+----------+
| game_id|   team_id|team_abbreviation|team_city|player_id|     player_name|nickname|start_position|             comment|  min| fgm| fga|fg_pct|fg3m|fg3a|fg3_pct| ftm| fta|ft_pct|oreb|dreb| reb| ast| stl| blk|  TO|  pf| pts|plus_minus|
+--------+----------+-----------------+---------+---------+----------------+--------+--------------+--------------------+-----+----+----+------+----+----+-------+----+----+------+----+----+----+----+----+----+----+----+----+----------+
|22200162|1610612737|              ATL|  Atlanta|  1630249|      Vit Krejci|     Vit|          NULL|DNP - Coach's Dec...| NULL|NULL|NULL|  NULL|NULL|NULL|   NULL|NULL|NULL|  NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|      NULL|
|22200162|1610612737|              ATL|  Atlanta|  16312

In [11]:
# Read the "events" table over JDBC using the connection settings
# (url / properties) defined earlier in the notebook.
events_df = spark.read.jdbc(
    url,
    "events",
    properties=properties,
)

# Preview the first 5 rows
events_df.show(n=5)

[Stage 1:>                                                          (0 + 1) / 1]

+---+--------+--------------------+--------------------+--------------------+--------------------+
|url|referrer|             user_id|           device_id|                host|          event_time|
+---+--------+--------------------+--------------------+--------------------+--------------------+
|  /|    NULL|                NULL|99067568206611800...| www.zachwilson.tech|2023-01-09 11:55:...|
|  /|    NULL|16647130749494100...|99067568206611800...| www.zachwilson.tech|2023-01-09 20:10:...|
|  /|    NULL|                NULL|12048389584711700...|admin.zachwilson....|2023-01-10 04:43:...|
|  /|    NULL|                NULL|12048389584711700...| www.zachwilson.tech|2023-01-14 05:25:...|
|  /|    NULL|                NULL|12048389584711700...| www.zachwilson.tech|2023-01-14 05:25:...|
+---+--------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows


                                                                                

In [9]:
# Ask the underlying JVM SparkContext which JARs are attached to the session.
# NOTE: `_jsc` is a private PySpark attribute, so this is a debugging aid only.
jars = (
    spark.sparkContext
    ._jsc.sc()
    .listJars()
)
print(jars)

List(spark://192.168.1.139:38571/jars/org.checkerframework_checker-qual-3.31.0.jar, spark://192.168.1.139:38571/jars/org.postgresql_postgresql-42.6.0.jar)
