In [1]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, FloatType, DateType, StringType


In [2]:

# Build (or reuse) the session used by every cell below. The master URL,
# application name and resource settings are passed straight to the builder.
spark = (
    SparkSession.builder
    .master('spark://192.168.2.102:7077')
    .appName('SQL Examples')
    .config("spark.cores.max", "4")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)


21/12/22 17:08:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/22 17:08:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/12/22 17:08:45 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


# SQL
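To query a DataFrame with SQL, first register it as a temporary view with `createOrReplaceTempView`; the view name is then usable as a table name in `spark.sql(...)`.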

In [3]:
# Explicit schemas for the two input files, so column types are declared
# up front instead of being left to the reader.

# Beer table: one row per beer.
beer_schema = (
    StructType()
    .add('Id', IntegerType())
    .add('InitialDate', DateType())
    .add('Type', StringType())
)

# Temperature history: one row per (date, beer) reading.
temp_hist_schema = (
    StructType()
    .add('Date', DateType())
    .add('Id', IntegerType())
    .add('Temp', FloatType())
)

# Load the beer table (comma-separated, first line is a header) and expose
# it to SQL under the view name 'Beer'.
beer_df = (
    spark.read
    .schema(beer_schema)
    .option('header', True)
    .csv('data/beer.csv')
)
beer_df.createOrReplaceTempView('Beer')
beer_df.show(5)

# Load the temperature history (';'-separated, read without a header
# option) and expose it to SQL under the view name 'TempHist'.
temp_hist_df = (
    spark.read
    .schema(temp_hist_schema)
    .option('sep', ';')
    .csv('data/beer_temp_hist.txt')
)
temp_hist_df.createOrReplaceTempView('TempHist')
temp_hist_df.show(5)

                                                                                

+---+-----------+--------+
| Id|InitialDate|    Type|
+---+-----------+--------+
|  1| 2021-12-01|   Laget|
|  2| 2021-12-01|Pale Ale|
|  3| 2021-12-01|    null|
|  4| 2021-12-01|     Ipa|
+---+-----------+--------+

+----------+---+----+
|      Date| Id|Temp|
+----------+---+----+
|2021-12-01|  1|20.0|
|2021-12-02|  1|20.2|
|2021-12-03|  1|null|
|2021-12-04|  1|20.3|
|2021-12-05|  1|20.5|
+----------+---+----+
only showing top 5 rows



## Join tables

In [4]:
# The same inner join written twice: first with the DataFrame API, then in SQL.

# DataFrame API: join on the shared 'Id' column.
beer_hist_df = temp_hist_df.join(beer_df, on='Id')
# show() prints the table itself and returns None, so it must not be wrapped
# in print() -- doing so emits a stray "None" after the table.
beer_hist_df.show(5)

# SQL: the same join expressed against the 'TempHist' and 'Beer' temp views.
beer_hist_df = spark.sql("""
SELECT 
    TempHist.Id,
    TempHist.Date,
    TempHist.Temp,
    Beer.InitialDate,
    Beer.Type
FROM TempHist
INNER JOIN Beer ON 
    Beer.Id = TempHist.Id
""")

beer_hist_df.show(5)

# Register the joined result so later cells can query it as 'BeerHist'.
beer_hist_df.createOrReplaceTempView('BeerHist')


                                                                                

+---+----------+----+-----------+-----+
| Id|      Date|Temp|InitialDate| Type|
+---+----------+----+-----------+-----+
|  1|2021-12-01|20.0| 2021-12-01|Laget|
|  1|2021-12-02|20.2| 2021-12-01|Laget|
|  1|2021-12-03|null| 2021-12-01|Laget|
|  1|2021-12-04|20.3| 2021-12-01|Laget|
|  1|2021-12-05|20.5| 2021-12-01|Laget|
+---+----------+----+-----------+-----+
only showing top 5 rows

None
+---+----------+----+-----------+-----+
| Id|      Date|Temp|InitialDate| Type|
+---+----------+----+-----------+-----+
|  1|2021-12-01|20.0| 2021-12-01|Laget|
|  1|2021-12-02|20.2| 2021-12-01|Laget|
|  1|2021-12-03|null| 2021-12-01|Laget|
|  1|2021-12-04|20.3| 2021-12-01|Laget|
|  1|2021-12-05|20.5| 2021-12-01|Laget|
+---+----------+----+-----------+-----+
only showing top 5 rows



In [5]:
# Read every row back from the 'BeerHist' view. The result gets its own name
# so that `beer_df` (the Beer table loaded from CSV) is not overwritten;
# otherwise re-running the join cell would join against the wrong DataFrame.
beer_hist_all_df = spark.sql('SELECT * FROM BeerHist')
beer_hist_all_df.show()

+---+----------+----+-----------+--------+
| Id|      Date|Temp|InitialDate|    Type|
+---+----------+----+-----------+--------+
|  1|2021-12-01|20.0| 2021-12-01|   Laget|
|  1|2021-12-02|20.2| 2021-12-01|   Laget|
|  1|2021-12-03|null| 2021-12-01|   Laget|
|  1|2021-12-04|20.3| 2021-12-01|   Laget|
|  1|2021-12-05|20.5| 2021-12-01|   Laget|
|  2|2021-12-01|16.5| 2021-12-01|Pale Ale|
|  2|2021-12-02|16.4| 2021-12-01|Pale Ale|
|  2|2021-12-03|16.5| 2021-12-01|Pale Ale|
|  2|2021-12-04|null| 2021-12-01|Pale Ale|
|  2|2021-12-05|16.8| 2021-12-01|Pale Ale|
|  2|2021-12-05|16.7| 2021-12-01|Pale Ale|
|  3|2021-12-01|18.3| 2021-12-01|    null|
|  3|2021-12-02|18.4| 2021-12-01|    null|
|  3|2021-12-03|null| 2021-12-01|    null|
|  4|2021-12-01|18.2| 2021-12-01|     Ipa|
+---+----------+----+-----------+--------+



In [6]:
# Two equivalent ways to project a single column.

# DataFrame API
beer_id = beer_hist_df.select('Id')
beer_id.show(2)

# SQL against the 'BeerHist' view
beer_id = spark.sql('SELECT Id FROM BeerHist')
beer_id.show(2)

                                                                                

+---+
| Id|
+---+
|  1|
|  1|
+---+
only showing top 2 rows

+---+
| Id|
+---+
|  1|
|  1|
+---+
only showing top 2 rows



In [7]:
# Two equivalent ways to keep only beer 2 and sort by temperature,
# highest first.

# DataFrame API
beer_2 = (
    beer_hist_df
    .where(F.col('Id') == 2)
    .sort(F.col('Temp').desc())
)
beer_2.show()

# SQL against the 'BeerHist' view
beer_2 = spark.sql('SELECT * FROM BeerHist WHERE Id = 2 order by Temp desc')
beer_2.show()

                                                                                

+---+----------+----+-----------+--------+
| Id|      Date|Temp|InitialDate|    Type|
+---+----------+----+-----------+--------+
|  2|2021-12-05|16.8| 2021-12-01|Pale Ale|
|  2|2021-12-05|16.7| 2021-12-01|Pale Ale|
|  2|2021-12-01|16.5| 2021-12-01|Pale Ale|
|  2|2021-12-03|16.5| 2021-12-01|Pale Ale|
|  2|2021-12-02|16.4| 2021-12-01|Pale Ale|
|  2|2021-12-04|null| 2021-12-01|Pale Ale|
+---+----------+----+-----------+--------+

+---+----------+----+-----------+--------+
| Id|      Date|Temp|InitialDate|    Type|
+---+----------+----+-----------+--------+
|  2|2021-12-05|16.8| 2021-12-01|Pale Ale|
|  2|2021-12-05|16.7| 2021-12-01|Pale Ale|
|  2|2021-12-01|16.5| 2021-12-01|Pale Ale|
|  2|2021-12-03|16.5| 2021-12-01|Pale Ale|
|  2|2021-12-02|16.4| 2021-12-01|Pale Ale|
|  2|2021-12-04|null| 2021-12-01|Pale Ale|
+---+----------+----+-----------+--------+



In [8]:
# Two equivalent ways to compute the minimum and maximum temperature per beer.

# DataFrame API: group on 'Id' and apply both aggregates in one pass.
temp_aggregates = [F.min('Temp'), F.max('Temp')]
beer_hist_df.groupBy('Id').agg(*temp_aggregates).show()

# SQL against the 'BeerHist' view
temp_range_query = 'SELECT Id, MIN(Temp), max(Temp) FROM BeerHist GROUP BY id'
spark.sql(temp_range_query).show()


                                                                                

+---+---------+---------+
| Id|min(Temp)|max(Temp)|
+---+---------+---------+
|  1|     20.0|     20.5|
|  3|     18.3|     18.4|
|  4|     18.2|     18.2|
|  2|     16.4|     16.8|
+---+---------+---------+

+---+---------+---------+
| Id|min(Temp)|max(Temp)|
+---+---------+---------+
|  1|     20.0|     20.5|
|  3|     18.3|     18.4|
|  4|     18.2|     18.2|
|  2|     16.4|     16.8|
+---+---------+---------+

