In [1]:
import pyspark
from pyspark.sql import SparkSession
import ConnectionConfig as cc

### Config stuff

### Connection properties
`ConnectionConfig.py` (imported as `cc`) is a helper module that simplifies setting up the database connection.
See that file for the details.

### Session setup
`spark.driver.extraClassPath` is set so that the required jars (here the SQL Server JDBC driver) are on the driver's classpath when the Spark jobs run.

In [2]:
from delta import configure_spark_with_delta_pip

# Build a local Spark session (4 worker threads) with Delta Lake support and the
# SQL Server JDBC driver available. A parenthesised chain is used so that each
# setting can carry its own comment.
builder = (
    SparkSession.builder
    .appName("DBConnectionTest")
    # Enable Delta Lake's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # This one must be added to be able to query a JDBC source: it puts the
    # SQL Server JDBC driver jar on the driver's classpath. Note the relative path.
    .config("spark.driver.extraClassPath", "./jars/mssql-jdbc-10.2.1.jre8.jar")
    .master("local[4]")
)
# Let the pip-installed delta package add its own settings to the builder
# before the session is created.
builder = configure_spark_with_delta_pip(builder)
spark = builder.getOrCreate()
# Display the session as the cell output; a second getOrCreate() call would only
# return this same session again.
spark

### Reading a JDBC table
Read a table over a SQL Server connection.

#### Using the ConnectionConfig (cc) to make things easy
`cc` can build a connection URL from a connection profile in `config.ini`. To do this, first set the name of the connection.

#### Partitioning
As Spark is built to work in parallel, reading from the database can also be done in parallel. In this case we define 4 partitions. Spark has to know how to split the data for every partition. Therefore you have to provide a partition column and a lower and upper bound. In this case, one of the 4 queries that Spark will fire looks like "select * from dbo.sales where Order_ID >= 250 and Order_ID < 500". Note that the bounds only determine how the range is split into partitions; they do not filter rows, so rows with values outside the bounds still end up in the first or last partition.

In [4]:
# Settings for the parallel JDBC read of dbo.sales. Spark splits the range
# [LOWER_BOUND, UPPER_BOUND) of PARTITION_COLUMN into NUM_PARTITIONS strides and
# issues one query per stride.
PARTITION_COLUMN = "Order_ID"
NUM_PARTITIONS = 4
LOWER_BOUND = 0
UPPER_BOUND = 1000
# Number of rows to preview; keeps the notebook output small.
PREVIEW_ROWS = 20

# Select the connection profile that cc uses to build the JDBC URL and to look
# up the credentials.
cc.set_connection("mydb")

sales_df = (
    spark.read
    .format("jdbc")
    .option("url", cc.create_jdbc())
    .option("dbtable", "dbo.sales")
    .option("user", cc.get_Property("username"))
    .option("password", cc.get_Property("password"))
    .option("partitionColumn", PARTITION_COLUMN)
    .option("numPartitions", NUM_PARTITIONS)
    .option("lowerBound", LOWER_BOUND)
    .option("upperBound", UPPER_BOUND)
    .load()
)
# Preview a handful of rows instead of printing up to 1000 rows of customer
# data into the saved notebook.
sales_df.show(PREVIEW_ROWS)

+-------------------+--------------------+----------+----------+--------+
|         Order_Date|       Customer_Name|SalesRepId|    Amount|Order_ID|
+-------------------+--------------------+----------+----------+--------+
|2011-05-17 00:00:00|          Beth Paige|         2| 778297706|      57|
|2009-11-17 00:00:00|     Sylvia Foulston|         2|2100696115|      58|
|2011-10-10 00:00:00|         Bryan Davis|         2| 642114638|      59|
|2012-10-07 00:00:00|            Joy Bell|         2|1440206513|      60|
|2011-08-13 00:00:00|         Alan Barnes|         2|1244596895|      61|
|2011-08-16 00:00:00|       Grant Carroll|         2|1662557955|      62|
|2009-03-04 00:00:00|   Delfina Latchford|         2| 844183988|      63|
|2009-08-18 00:00:00|           Don Jones|         2| 681975050|      64|
|2010-11-16 00:00:00|       Doug Bickford|         2|1504576144|      65|
|2011-05-27 00:00:00|       Doug Bickford|         2| 622992265|      66|
|2010-08-23 00:00:00|        Jamie Kun

In [5]:
spark.stop()