# Databricks test exam

In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.session import SparkSession
# Build (or reuse) a SparkSession running against the local master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("jdbc data sources")
    .getOrCreate()
)

## SparkContext

In [2]:
# Settings applied to the active session, keyed by Spark property name.
# NOTE(review): spark.executor.memory is normally fixed when the application
# launches -- confirm that setting it here has any effect beyond storing the value.
session_settings = {
    "spark.sql.shuffle.partitions": 6,
    "spark.executor.memory": "2g",
}
for setting_name, setting_value in session_settings.items():
    spark.conf.set(setting_name, setting_value)

In [3]:
# Read both settings back from the session and print them side by side.
shuffle_partitions = spark.conf.get("spark.sql.shuffle.partitions")
executor_memory = spark.conf.get("spark.executor.memory")
print(f"{shuffle_partitions} , {executor_memory}")

6 , 2g


## SparkSession

#### Create a DataFrame/Dataset from a collection (e.g. list or set)

In [6]:
from pyspark.sql.types import IntegerType

# Turn a plain Python list into a single-column DataFrame of integers.
sample_values = [1, 2, 3, 4]
list_df = spark.createDataFrame(sample_values, schema=IntegerType())
display(list_df)
list_df.show()

DataFrame[value: int]

+-----+
|value|
+-----+
|    1|
|    2|
|    3|
|    4|
+-----+



#### Create a DataFrame for a range of numbers

In [9]:
# Generate the numbers 0..999 and name the resulting column "number".
row_count = 1000
ints_df = spark.range(row_count).toDF("number")
display(ints_df)
ints_df.show(2)

DataFrame[number: bigint]

+------+
|number|
+------+
|     0|
|     1|
+------+
only showing top 2 rows



#### Access the DataFrameReaders

In [None]:
# spark.read exposes the DataFrameReader; csv() is one of its format-specific loaders.
df = spark.read.csv('tmp/dataframe_sample.csv', inferSchema=True)
# Other loaders available on the same reader:
# spark.read.parquet(path)
# spark.read.json(path)
# spark.read.format(source).load(path)   # generic form; the reader has no open() method

#### Register User Defined Functions (UDFs)

In [13]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def power3(value):
  """Return ``value`` raised to the third power."""
  return pow(value, 3)

# Register power3 under the name "power3py" with an integer return type.
spark.udf.register(name="power3py", f=power3, returnType=IntegerType())

<function __main__.power3(value)>

In [16]:
# Wrap the plain Python function so it can be applied to DataFrame columns.
power3udf = udf(power3, returnType=IntegerType())

# Keep the original number next to its cube for easy comparison.
number_col = F.col("number")
power3_ints_df = ints_df.select(number_col, power3udf(number_col).alias("power3"))
display(power3_ints_df)
power3_ints_df.show(3)

DataFrame[number: bigint, power3: int]

+------+------+
|number|power3|
+------+------+
|     0|     0|
|     1|     1|
|     2|     8|
+------+------+
only showing top 3 rows



In [21]:
# Register the numbers 1..19 as the temporary view "test".
# createOrReplaceTempView supersedes registerTempTable, which has been
# deprecated since Spark 2.0.
spark.range(1, 20).createOrReplaceTempView("test")

# DataFrameReader

#### Read data for the "core" data formats (CSV, JSON, JDBC, ORC, Parquet, text and tables)

In [22]:
# Sample file reused by the reader examples that follow.
data_file = "tmp/dataframe_sample.csv"

# Read with every CSV option left at its default: no header handling and no
# custom delimiter, so each line lands in a single string column (_c0).
csv_reader = spark.read.format("csv")
df = csv_reader.load(data_file)
display(df)
df.show()

DataFrame[_c0: string]

+--------------------+
|                 _c0|
+--------------------+
|id|end_date|start...|
|1|2015-10-14 00:0...|
|2|2015-10-15 01:0...|
|3|2015-10-16 02:3...|
|4|2015-10-17 03:0...|
|5|2015-10-18 04:3...|
+--------------------+



#### How to configure options for specific formats

In [27]:
# Enable header parsing and schema inference through reader options.
# No delimiter is configured here, so the pipe-separated header still ends up
# as one column name.
df = spark.read.options(header=True, inferSchema=True).csv(data_file)
display(df)
df.show()

DataFrame[id|end_date|start_date|location: string]

+-------------------------------+
|id|end_date|start_date|location|
+-------------------------------+
|           1|2015-10-14 00:0...|
|           2|2015-10-15 01:0...|
|           3|2015-10-16 02:3...|
|           4|2015-10-17 03:0...|
|           5|2015-10-18 04:3...|
+-------------------------------+



#### How to read data from non-core formats using format() and load()

In [30]:
# Generic format()/load() form: treat the first line as a header and split on "|".
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", "|")
    .load(data_file)
)
df.show()

+---+-------------------+-------------------+--------+
| id|           end_date|         start_date|location|
+---+-------------------+-------------------+--------+
|  1|2015-10-14 00:00:00|2015-09-14 00:00:00|   CA-SF|
|  2|2015-10-15 01:00:20|2015-08-14 00:00:00|   CA-SD|
|  3|2015-10-16 02:30:00|2015-01-14 00:00:00|   NY-NY|
|  4|2015-10-17 03:00:20|2015-02-14 00:00:00|   NY-NY|
|  5|2015-10-18 04:30:00|2014-04-14 00:00:00|   CA-SD|
+---+-------------------+-------------------+--------+



#### How to construct and specify a schema using the StructType classes

In [34]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

# Declare the schema by hand; only three string columns are listed.
myManualSchema = (
    StructType()
    .add("field1", StringType())
    .add("field2", StringType())
    .add("field3", StringType())
)

# Apply the explicit schema while reading the pipe-delimited file with a header row.
df3 = (
    spark.read.format("csv")
    .schema(myManualSchema)
    .option("header", "true")
    .option("delimiter", "|")
    .load(data_file)
)
df3.show()

+------+-------------------+-------------------+
|field1|             field2|             field3|
+------+-------------------+-------------------+
|     1|2015-10-14 00:00:00|2015-09-14 00:00:00|
|     2|2015-10-15 01:00:20|2015-08-14 00:00:00|
|     3|2015-10-16 02:30:00|2015-01-14 00:00:00|
|     4|2015-10-17 03:00:20|2015-02-14 00:00:00|
|     5|2015-10-18 04:30:00|2014-04-14 00:00:00|
+------+-------------------+-------------------+

