# Spark DataFrame
* Tabular data
  * Rows 
  * Named Columns
* Immutable 
* Distributed collection of data
* Lazy
* Can process structured and semi-structured data
  * relational database
  * csv
  * json
  * txt
  * RDD
  * dict
  * list
  * etc
* Supports SQL queries or expression methods
  * SELECT * FROM RedWine
  * red_wine_df.select()
* Schema
  * Information about
    * column name
    * data type
    * empty values
    * etc
  * Helps to optimize queries

In [1]:
from pyspark import SparkContext, SparkConf

from pyspark.sql import SparkSession

master = 'spark://192.168.2.102:7077'  # Connect to remote server
appName = 'Create DataFrame'

# Parenthesised chain rather than backslash continuations, so a trailing "\"
# can never accidentally join the next statement onto this expression.
conf = (
    SparkConf()
    .setMaster(master)
    .setAppName(appName)
    .set("spark.executor.memory", "2g")
    .set("spark.cores.max", "4")
)

# RDD entry point
sc = SparkContext(conf=conf)

# DataFrame entry point; getOrCreate() reuses the SparkContext started above
spark = SparkSession.builder.getOrCreate()

21/12/22 15:00:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/22 15:00:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Each dict is one row; its keys become the column names.
rows = [
    dict(Id=1, Value=1),
    dict(Id=1, Value=2),
    dict(Id=2, Value=3),
    dict(Id=2, Value=4),
]
df = spark.createDataFrame(rows)
df.show()

                                                                                

+---+-----+
| Id|Value|
+---+-----+
|  1|    1|
|  1|    2|
|  2|    3|
|  2|    4|
+---+-----+



In [3]:
# Plain lists carry no column names, so they are supplied through `schema`.
column_names = ['Id', 'Value']
records = [[1, 1], [1, 2], [2, 3], [2, 4]]
df = spark.createDataFrame(records, schema=column_names)
df.show()

+---+-----+
| Id|Value|
+---+-----+
|  1|    1|
|  1|    2|
|  2|    3|
|  2|    4|
+---+-----+




### Create DataFrame from RDD

In [4]:
# Small in-memory sample: four numeric values per row.
wine_samples = [
    [7.4, 0.7, 0.0, 5],
    [7.8, 0.88, 0.0, 5],
    [7.8, 0.76, 0.04, 5],
    [11.2, 0.28, 0.56, 6],
]
red_wine_rdd = sc.parallelize(wine_samples)

# Only four rows exist, so asking for five returns all of them.
red_wine_rdd.take(5)

[[7.4, 0.7, 0.0, 5],
 [7.8, 0.88, 0.0, 5],
 [7.8, 0.76, 0.04, 5],
 [11.2, 0.28, 0.56, 6]]

In [5]:
# Names for the four positional values held in each RDD row.
columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'quality',
]

# Only names are given here; the column types are inferred from the data.
red_wine_df = spark.createDataFrame(data=red_wine_rdd, schema=columns)

red_wine_df.show()

+-------------+----------------+-----------+-------+
|fixed acidity|volatile acidity|citric acid|quality|
+-------------+----------------+-----------+-------+
|          7.4|             0.7|        0.0|      5|
|          7.8|            0.88|        0.0|      5|
|          7.8|            0.76|       0.04|      5|
|         11.2|            0.28|       0.56|      6|
+-------------+----------------+-----------+-------+



In [6]:
# List every column as a (name, Spark SQL type) pair.
red_wine_df.dtypes

[('fixed acidity', 'double'),
 ('volatile acidity', 'double'),
 ('citric acid', 'double'),
 ('quality', 'bigint')]

### Create DataFrame from csv

In [7]:
# HDFS location template; `{filename}` is filled in per dataset.
root_path = 'hdfs://192.168.2.102:9000/dataset/{filename}'

# inferSchema is spelled out for emphasis, although False is already the default.
red_wine_csv = root_path.format(filename='winequality-red.csv')
red_wine_df = spark.read.csv(
    red_wine_csv,
    header=True,
    inferSchema=False,
)

red_wine_df.show(5)

In [None]:
# Inspect the column types of the DataFrame read from CSV.
red_wine_df.dtypes

[('fixed acidity', 'string'),
 ('volatile acidity', 'string'),
 ('citric acid', 'string'),
 ('residual sugar', 'string'),
 ('chlorides', 'string'),
 ('free sulfur dioxide', 'string'),
 ('total sulfur dioxide', 'string'),
 ('density', 'string'),
 ('pH', 'string'),
 ('sulphates', 'string'),
 ('alcohol', 'string'),
 ('quality', 'string')]

### Create DataFrame from txt

In [None]:
# Reading as text gives one row per line, in a single column named "value".
temp_hist_path = 'data/beer_temp_hist.txt'
temp_hist = spark.read.text(temp_hist_path)
temp_hist.show(n=5)

+-----------------+
|            value|
+-----------------+
|2021-12-01;1;20.0|
|2021-12-02;1;20.2|
|    2021-12-03;1;|
|2021-12-04;1;20.3|
|2021-12-05;1;20.5|
+-----------------+
only showing top 5 rows



In [None]:
# Same file through the RDD API: every element is one raw line of text.
beer_temp_path = 'data/beer_temp_hist.txt'
rdd = sc.textFile(beer_temp_path)
rdd.take(5)

['2021-12-01;1;20.0',
 '2021-12-02;1;20.2',
 '2021-12-03;1;',
 '2021-12-04;1;20.3',
 '2021-12-05;1;20.5']

In [None]:
# Break each ';'-separated line into its three fields. str.split only
# produces strings, so every resulting column is typed as string.
splitted_rows = rdd.map(lambda line: line.split(';'))
temp_hist = spark.createDataFrame(
    splitted_rows,
    schema=['Date', 'BeerId', 'Temp'],
)
temp_hist.show()

+----------+------+----+
|      Date|BeerId|Temp|
+----------+------+----+
|2021-12-01|     1|20.0|
|2021-12-02|     1|20.2|
|2021-12-03|     1|    |
|2021-12-04|     1|20.3|
|2021-12-05|     1|20.5|
|2021-12-01|     2|16.5|
|2021-12-02|     2|16.4|
|2021-12-03|     2|16.5|
|2021-12-04|     2|    |
|2021-12-05|     2|16.8|
|2021-12-05|     2|16.7|
|2021-12-01|     3|18.3|
|2021-12-02|     3|18.4|
|2021-12-03|     3|    |
|2021-12-01|     4|18.2|
+----------+------+----+



In [None]:
# Confirm the column types: all fields came from str.split, so all are strings.
temp_hist.dtypes

[('Date', 'string'), ('BeerId', 'string'), ('Temp', 'string')]

### Infer data type

In [None]:
# Read the CSV with inferSchema=True so Spark derives column types from the data.
red_wine_path = root_path.format(filename='winequality-red.csv')
red_wine_df = spark.read.csv(
    red_wine_path,
    header=True,
    inferSchema=True,
)
red_wine_df.dtypes

[('fixed acidity', 'double'),
 ('volatile acidity', 'double'),
 ('citric acid', 'double'),
 ('residual sugar', 'double'),
 ('chlorides', 'double'),
 ('free sulfur dioxide', 'double'),
 ('total sulfur dioxide', 'double'),
 ('density', 'double'),
 ('pH', 'double'),
 ('sulphates', 'double'),
 ('alcohol', 'double'),
 ('quality', 'int')]

### Schema

In [None]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, FloatType, DateType, StringType

In [None]:
# Explicit schema for the beer temperature history. Fields are added one by
# one; StructType.add() leaves them nullable by default, like StructField.
temp_hist_schema = (
    StructType()
    .add('Date', DateType())
    .add('BeerId', IntegerType())
    .add('Temp', FloatType())
)

In [None]:

# Parse the ';'-separated file using the explicit schema instead of inferring types.
temp_hist_df = (
    spark.read
    .schema(temp_hist_schema)
    .csv('data/beer_temp_hist.txt', sep=';')
)
temp_hist_df.show()

+----------+------+----+
|      Date|BeerId|Temp|
+----------+------+----+
|2021-12-01|     1|20.0|
|2021-12-02|     1|20.2|
|2021-12-03|     1|null|
|2021-12-04|     1|20.3|
|2021-12-05|     1|20.5|
|2021-12-01|     2|16.5|
|2021-12-02|     2|16.4|
|2021-12-03|     2|16.5|
|2021-12-04|     2|null|
|2021-12-05|     2|16.8|
|2021-12-05|     2|16.7|
|2021-12-01|     3|18.3|
|2021-12-02|     3|18.4|
|2021-12-03|     3|null|
|2021-12-01|     4|18.2|
+----------+------+----+



In [None]:
# List the (name, type) pairs of the DataFrame read with the explicit schema.
temp_hist_df.dtypes

[('Date', 'date'), ('BeerId', 'int'), ('Temp', 'float')]

In [None]:
# Print the schema as a tree, including each field's nullability.
temp_hist_df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- BeerId: integer (nullable = true)
 |-- Temp: float (nullable = true)



<hr/>



#### Select and filter

In [None]:
# Project only the Id column and preview the first two rows.
# NOTE(review): "bear_id" looks like a typo for "beer_id"; the name is kept
# unchanged in case other cells refer to it.
id_column = beer_df['Id']
bear_id = beer_df.select(id_column)
bear_id.show(n=2)

+---+
| Id|
+---+
|  1|
|  1|
+---+
only showing top 2 rows



In [None]:
# Keep only the rows whose Id equals 2.
is_beer_2 = beer_df['Id'] == 2
beer_2 = beer_df.filter(is_beer_2)
beer_2.show()

+---+-----------+--------+----------+----+------------------+-----------+--------+
| Id|InitialDate|    Type|      Date|   C|                 F|ElapsedDays|FirstDay|
+---+-----------+--------+----------+----+------------------+-----------+--------+
|  2| 2021-12-01|Pale Ale|2021-12-05|16.7|62.060001373291016|          4|   false|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.8| 62.23999862670898|          4|   false|
|  2| 2021-12-01|Pale Ale|2021-12-04|null|              null|          3|   false|
|  2| 2021-12-01|Pale Ale|2021-12-03|16.5|              61.7|          2|   false|
|  2| 2021-12-01|Pale Ale|2021-12-02|16.4| 61.51999931335449|          1|   false|
|  2| 2021-12-01|Pale Ale|2021-12-01|16.5|              61.7|          0|    true|
+---+-----------+--------+----------+----+------------------+-----------+--------+



### Fill Missing values

In [None]:
# Display the data before filling, to see where the null values are.
beer_df.show()

+---+-----------+--------+----------+----+------------------+-----------+--------+
| Id|InitialDate|    Type|      Date|   C|                 F|ElapsedDays|FirstDay|
+---+-----------+--------+----------+----+------------------+-----------+--------+
|  1| 2021-12-01|   Laget|2021-12-05|20.5|              68.9|          4|   false|
|  1| 2021-12-01|   Laget|2021-12-04|20.3| 68.53999862670898|          3|   false|
|  1| 2021-12-01|   Laget|2021-12-03|null|              null|          2|   false|
|  1| 2021-12-01|   Laget|2021-12-02|20.2| 68.36000137329103|          1|   false|
|  1| 2021-12-01|   Laget|2021-12-01|20.0|              68.0|          0|    true|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.7|62.060001373291016|          4|   false|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.8| 62.23999862670898|          4|   false|
|  2| 2021-12-01|Pale Ale|2021-12-04|null|              null|          3|   false|
|  2| 2021-12-01|Pale Ale|2021-12-03|16.5|              61.7|          2|   false|
|  2

In [None]:
# Replace missing values in the Type column with a placeholder label;
# `subset` restricts the fill to that column only.
beer_df = beer_df.na.fill('Unknown', subset=['Type'])
beer_df.show()

+---+-----------+--------+----------+----+------------------+-----------+--------+
| Id|InitialDate|    Type|      Date|   C|                 F|ElapsedDays|FirstDay|
+---+-----------+--------+----------+----+------------------+-----------+--------+
|  1| 2021-12-01|   Laget|2021-12-05|20.5|              68.9|          4|   false|
|  1| 2021-12-01|   Laget|2021-12-04|20.3| 68.53999862670898|          3|   false|
|  1| 2021-12-01|   Laget|2021-12-03|null|              null|          2|   false|
|  1| 2021-12-01|   Laget|2021-12-02|20.2| 68.36000137329103|          1|   false|
|  1| 2021-12-01|   Laget|2021-12-01|20.0|              68.0|          0|    true|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.7|62.060001373291016|          4|   false|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.8| 62.23999862670898|          4|   false|
|  2| 2021-12-01|Pale Ale|2021-12-04|null|              null|          3|   false|
|  2| 2021-12-01|Pale Ale|2021-12-03|16.5|              61.7|          2|   false|
|  2

In [None]:
# Mean of the C column. The aggregate is aliased explicitly so the lookup
# does not depend on Spark's auto-generated column name ('avg(C)').
mean = beer_df.agg(F.mean('C').alias('mean_c')).first()['mean_c']
print(mean)

# Fill the nulls in C with that mean. The result is only displayed;
# beer_df itself is not reassigned.
beer_df.fillna(mean, subset=['C']).show()

                                                                                

18.233333269755047
+---+-----------+--------+----------+---------+------------------+-----------+--------+
| Id|InitialDate|    Type|      Date|        C|                 F|ElapsedDays|FirstDay|
+---+-----------+--------+----------+---------+------------------+-----------+--------+
|  1| 2021-12-01|   Laget|2021-12-05|     20.5|              68.9|          4|   false|
|  1| 2021-12-01|   Laget|2021-12-04|     20.3| 68.53999862670898|          3|   false|
|  1| 2021-12-01|   Laget|2021-12-03|18.233334|              null|          2|   false|
|  1| 2021-12-01|   Laget|2021-12-02|     20.2| 68.36000137329103|          1|   false|
|  1| 2021-12-01|   Laget|2021-12-01|     20.0|              68.0|          0|    true|
|  2| 2021-12-01|Pale Ale|2021-12-05|     16.7|62.060001373291016|          4|   false|
|  2| 2021-12-01|Pale Ale|2021-12-05|     16.8| 62.23999862670898|          4|   false|
|  2| 2021-12-01|Pale Ale|2021-12-04|18.233334|              null|          3|   false|
|  2| 2021-12

In [None]:
from pyspark.sql.window import Window

# Forward fill: within each Id, ordered by Date, look from the first row of
# the partition up to the current row and take the last non-null F value.
window = (
    Window
    .partitionBy('Id')
    .orderBy('Date')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

filled_cols = F.last(beer_df['F'], ignorenulls=True).over(window)
beer_df = beer_df.withColumn('F[Filled]', filled_cols)

filled_view = beer_df.select(['Id', 'Date', 'F', 'F[Filled]'])
filled_view.orderBy(['Id', 'Date']).show()



+---+----------+------------------+------------------+
| Id|      Date|                 F|         F[Filled]|
+---+----------+------------------+------------------+
|  1|2021-12-01|              68.0|              68.0|
|  1|2021-12-02| 68.36000137329103| 68.36000137329103|
|  1|2021-12-03|              null| 68.36000137329103|
|  1|2021-12-04| 68.53999862670898| 68.53999862670898|
|  1|2021-12-05|              68.9|              68.9|
|  2|2021-12-01|              61.7|              61.7|
|  2|2021-12-02| 61.51999931335449| 61.51999931335449|
|  2|2021-12-03|              61.7|              61.7|
|  2|2021-12-04|              null|              61.7|
|  2|2021-12-05|62.060001373291016|62.060001373291016|
|  2|2021-12-05| 62.23999862670898| 62.23999862670898|
|  3|2021-12-01| 64.93999862670898| 64.93999862670898|
|  3|2021-12-02|  65.1199993133545|  65.1199993133545|
|  3|2021-12-03|              null|  65.1199993133545|
|  4|2021-12-01| 64.76000137329102| 64.76000137329102|
+---+----------+------------------+------------------+

                                                                                

In [None]:
# Backward fill: within each Id, ordered by Date, look from the current row
# to the end of the partition and take the first non-null F value. A null
# with no later reading stays null. This overwrites the 'F[Filled]' column.
window = (
    Window
    .partitionBy('Id')
    .orderBy('Date')
    .rowsBetween(Window.currentRow, Window.unboundedFollowing)
)

filled_cols = F.first(beer_df['F'], ignorenulls=True).over(window)
beer_df = beer_df.withColumn('F[Filled]', filled_cols)

filled_view = beer_df.select(['Id', 'Date', 'F', 'F[Filled]'])
filled_view.orderBy(['Id', 'Date']).show()



+---+----------+------------------+------------------+
| Id|      Date|                 F|         F[Filled]|
+---+----------+------------------+------------------+
|  1|2021-12-01|              68.0|              68.0|
|  1|2021-12-02| 68.36000137329103| 68.36000137329103|
|  1|2021-12-03|              null| 68.53999862670898|
|  1|2021-12-04| 68.53999862670898| 68.53999862670898|
|  1|2021-12-05|              68.9|              68.9|
|  2|2021-12-01|              61.7|              61.7|
|  2|2021-12-02| 61.51999931335449| 61.51999931335449|
|  2|2021-12-03|              61.7|              61.7|
|  2|2021-12-04|              null|62.060001373291016|
|  2|2021-12-05|62.060001373291016|62.060001373291016|
|  2|2021-12-05| 62.23999862670898| 62.23999862670898|
|  3|2021-12-01| 64.93999862670898| 64.93999862670898|
|  3|2021-12-02|  65.1199993133545|  65.1199993133545|
|  3|2021-12-03|              null|              null|
|  4|2021-12-01| 64.76000137329102| 64.76000137329102|
+---+----------+------------------+------------------+

                                                                                

In [None]:
# Lag features: 'F_-1' and 'F_-2' hold the 'F[Filled]' value found one and
# two rows earlier within the same Id (ordered by Date). Rows without such a
# predecessor get null.
# The window does not depend on the loop variable, so it is built once.
lag_window = Window.partitionBy('Id').orderBy('Date')

cols = [f'F_-{i}' for i in range(1, 3)]

for i, new_col in enumerate(cols, start=1):
    # F.lag reads the row `i` positions before the current one, replacing the
    # hand-built one-row frame (rowsBetween(-i, -i)) combined with F.first.
    lag_col = F.lag(beer_df['F[Filled]'], i).over(lag_window)
    beer_df = beer_df.withColumn(new_col, lag_col)

beer_df.select(['Id', 'Date', 'F[Filled]', *cols]).orderBy(['Id', 'Date']).show()