In [9]:
import configparser
from urllib.parse import quote_plus

import findspark

In [10]:
# Point findspark at the local Spark installation; this path is SPARK_HOME.
findspark.init(spark_home="/opt/manual/spark")

In [11]:
from pyspark.sql import SparkSession, functions as F

In [12]:
# List the jars under $SPARK_HOME/jars and keep only the lines mentioning "postgresql".
! ls -l $SPARK_HOME/jars | grep postgresql

-rw-rw-r--. 1 train train   932808 Aug 14  2021 postgresql-42.2.14.jar


In [13]:
# To connect to an RDBMS, the corresponding JDBC driver must be on the classpath.

In [14]:
# Create (or reuse) a SparkSession named "JDBC and SQL" running on the
# local[2] master.
spark = (
    SparkSession.builder
    .appName("JDBC and SQL")
    .master("local[2]")
    .getOrCreate()
)

# Read From Postgresql

In [15]:
# Check that PostgreSQL is reachable, e.g. from a shell:
# psql -d traindb -U train -W

### db_conn file

    [DB]
    user_name = train
    password = Ankara06
    db_ip = 127.0.0.1


In [17]:
# Load the database connection settings from the local `db_conn` INI file.
config = configparser.RawConfigParser()

# config.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Fail fast here instead of surfacing a confusing
# NoSectionError from the first config.get() call below.
if not config.read('./db_conn'):
    raise FileNotFoundError("Could not read database config file './db_conn'")

# Connection settings from the [DB] section, used by the JDBC cells below.
user_name = config.get('DB', 'user_name')
password = config.get('DB', 'password')
db_ip = config.get('DB', 'db_ip')

## Method-1

In [18]:
# Build the JDBC URL with the credentials embedded as query parameters.
# The values are percent-encoded so that a user name or password containing
# URL-reserved characters (&, =, %, +, space, ...) cannot corrupt the query
# string. Plain alphanumeric values are left unchanged by quote_plus.
jdbcUrl = (
    f"jdbc:postgresql://{db_ip}:5432/traindb"
    f"?user={quote_plus(user_name)}&password={quote_plus(password)}"
)

In [19]:
# Method 1: read through the JDBC source using the URL that already carries
# the credentials, and let the database run the given SQL query.
df = spark.read.format("jdbc").options(
    url=jdbcUrl,
    driver="org.postgresql.Driver",
    query="select * from books",
).load()

In [20]:
# Preview at most five rows of `df`, converted to a pandas DataFrame for display.
df.limit(5).toPandas()

                                                                                

Unnamed: 0,id,book_name,isbn,book_id,price,price_currency,rating_count,author_id,publisher_id
0,13,Madam Bovary (Ciltli),6050948752,489127179,25.115735,TRY,5,4098249,46868
1,22,Mai ve Siyah (Eleştirel Basım),9750523533,492625951,25.34,TRY,17,3066057,63217
2,27,Nutuk,9759914288,9927355,11.48147,TRY,23,9705003,46868
3,34,Devlet,9754734263,395307782,27.9994,TRY,0,8978000,20709


In [21]:
# Print the column names, types and nullability of `df`.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- book_name: string (nullable = true)
 |-- isbn: long (nullable = true)
 |-- book_id: long (nullable = true)
 |-- price: double (nullable = true)
 |-- price_currency: string (nullable = true)
 |-- rating_count: integer (nullable = true)
 |-- author_id: integer (nullable = true)
 |-- publisher_id: integer (nullable = true)



## Method-2

In [22]:
# Method 2: keep the URL free of credentials and pass user/password as
# separate JDBC options; `dbtable` selects the table to read.
df2 = (
    spark.read.format("jdbc")
    .options(
        driver="org.postgresql.Driver",
        url=f"jdbc:postgresql://{db_ip}:5432/traindb",
        dbtable="books",
        user=user_name,
        password=password,
    )
    .load()
)

In [23]:
# Preview at most five rows of `df2`, converted to a pandas DataFrame for display.
df2.limit(5).toPandas()

Unnamed: 0,id,book_name,isbn,book_id,price,price_currency,rating_count,author_id,publisher_id
0,13,Madam Bovary (Ciltli),6050948752,489127179,25.115735,TRY,5,4098249,46868
1,22,Mai ve Siyah (Eleştirel Basım),9750523533,492625951,25.34,TRY,17,3066057,63217
2,27,Nutuk,9759914288,9927355,11.48147,TRY,23,9705003,46868
3,34,Devlet,9754734263,395307782,27.9994,TRY,0,8978000,20709


# Write to Postgresql

## Method-1

In [24]:
# Method 1: write `df` to the `books_spark2` table via DataFrameWriter.jdbc(),
# using the credential-carrying URL. The save mode is "overwrite".
(
    df.write
    .mode("overwrite")
    .jdbc(
        url=jdbcUrl,
        table="books_spark2",
        properties={"driver": "org.postgresql.Driver"},
    )
)

## Method-2

In [25]:
# Method 2: write `df` to the `books_spark` table through the generic "jdbc"
# format, passing the credentials as separate options. The save mode is
# "overwrite".
(
    df.write.format("jdbc")
    .mode("overwrite")
    .options(
        driver="org.postgresql.Driver",
        url=f"jdbc:postgresql://{db_ip}:5432/traindb",
        dbtable="books_spark",
        user=user_name,
        password=password,
    )
    .save()
)

# Check write result

In [26]:
# Read the `books_spark` table back over JDBC and show at most five rows as a
# pandas DataFrame to confirm the write.
(
    spark.read.format("jdbc")
    .options(
        url=jdbcUrl,
        driver="org.postgresql.Driver",
        query="select * from books_spark",
    )
    .load()
    .limit(5)
    .toPandas()
)

Unnamed: 0,id,book_name,isbn,book_id,price,price_currency,rating_count,author_id,publisher_id
0,13,Madam Bovary (Ciltli),6050948752,489127179,25.115735,TRY,5,4098249,46868
1,22,Mai ve Siyah (Eleştirel Basım),9750523533,492625951,25.34,TRY,17,3066057,63217
2,27,Nutuk,9759914288,9927355,11.48147,TRY,23,9705003,46868
3,34,Devlet,9754734263,395307782,27.9994,TRY,0,8978000,20709


In [27]:
# Stop the SparkSession now that the notebook is finished with it.
spark.stop()