# Operations and Actions with DataFrame

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, FloatType, DateType, StringType

# Cluster master URL. Set the SPARK_MASTER_URL environment variable
# (e.g. to 'local[*]') to run the notebook outside the original network;
# the default is the address this notebook was originally written against.
SPARK_MASTER_URL = os.environ.get('SPARK_MASTER_URL', 'spark://192.168.2.102:7077')

# Build (or reuse) the session, limited to one core in total and 1 GB per executor.
spark = (
    SparkSession.builder
    .master(SPARK_MASTER_URL)
    .appName('Operations and Actions with DataFrame')
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)


21/12/22 15:00:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/22 15:00:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/12/22 15:00:08 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Explicit schema for the temperature history file: reading date, beer id, temperature.
temp_hist_schema = StructType(fields=[
    StructField(name='Date', dataType=DateType()),
    StructField(name='BeerId', dataType=IntegerType()),
    StructField(name='Temp', dataType=FloatType()),
])

# Explicit schema for the beer description file: beer id, initial date, beer type.
beer_schema = StructType(fields=[
    StructField(name='Id', dataType=IntegerType()),
    StructField(name='InitialDate', dataType=DateType()),
    StructField(name='Type', dataType=StringType()),
])

# Temperature history: ';'-separated; no header option is set, so every line is data.
temp_hist_df = (
    spark.read
    .schema(temp_hist_schema)
    .option('sep', ';')
    .csv('data/beer_temp_hist.txt')
)

# Beer descriptions: default separator, first line is a header.
beer_description_df = (
    spark.read
    .schema(beer_schema)
    .option('header', True)
    .csv('data/beer.csv')
)

# show() is an action: it triggers the actual read and prints the first rows.
temp_hist_df.show()
beer_description_df.show()

                                                                                

+----------+------+----+
|      Date|BeerId|Temp|
+----------+------+----+
|2021-12-01|     1|20.0|
|2021-12-02|     1|20.2|
|2021-12-03|     1|null|
|2021-12-04|     1|20.3|
|2021-12-05|     1|20.5|
|2021-12-01|     2|16.5|
|2021-12-02|     2|16.4|
|2021-12-03|     2|16.5|
|2021-12-04|     2|null|
|2021-12-05|     2|16.8|
|2021-12-05|     2|16.7|
|2021-12-01|     3|18.3|
|2021-12-02|     3|18.4|
|2021-12-03|     3|null|
|2021-12-01|     4|18.2|
+----------+------+----+

+---+-----------+--------+
| Id|InitialDate|    Type|
+---+-----------+--------+
|  1| 2021-12-01|   Laget|
|  2| 2021-12-01|Pale Ale|
|  3| 2021-12-01|    null|
|  4| 2021-12-01|     Ipa|
+---+-----------+--------+



## Rename column

You can rename a column with the `withColumnRenamed('ColumnName', 'NewColumnName')` method. Renaming a column creates a new DataFrame; the original one is not modified. Remember, Spark is lazy and will only compute the new DataFrame when an action is called, in this case `show`.

You can also chain operations, as in the second example. There we assign the new DataFrame to the same variable, so the older temp_hist_df is no longer referenced and can be garbage-collected.

In [3]:
# Rename 'Temp' to 'C' in a new DataFrame; temp_hist_df keeps its 'Temp' column.
temp_hist_renamend_df = temp_hist_df.withColumnRenamed(existing='Temp', new='C')
temp_hist_renamend_df.show(n=5)

+----------+------+----+
|      Date|BeerId|   C|
+----------+------+----+
|2021-12-01|     1|20.0|
|2021-12-02|     1|20.2|
|2021-12-03|     1|null|
|2021-12-04|     1|20.3|
|2021-12-05|     1|20.5|
+----------+------+----+
only showing top 5 rows



In [4]:
# Chain both renames and rebind the result to the same variable name.
temp_hist_df = (
    temp_hist_df
    .withColumnRenamed('Temp', 'C')
    .withColumnRenamed('BeerId', 'Id')
)

temp_hist_df.show(5)

+----------+---+----+
|      Date| Id|   C|
+----------+---+----+
|2021-12-01|  1|20.0|
|2021-12-02|  1|20.2|
|2021-12-03|  1|null|
|2021-12-04|  1|20.3|
|2021-12-05|  1|20.5|
+----------+---+----+
only showing top 5 rows



## Create new columns

Creating columns is an important part of building new features. We can create a column from an operation on existing columns and add it to the DataFrame with the `withColumn` method. In the example below, we convert Celsius to Fahrenheit.

$F = C \times 1.8 + 32$

In [5]:
# Celsius -> Fahrenheit as a Column expression (nothing is computed yet).
# Named `fahrenheit` instead of `F` so it cannot clash with the conventional
# `import pyspark.sql.functions as F` alias used later in this notebook.
fahrenheit = temp_hist_df['C'] * 1.8 + 32

# Create a new DataFrame with the expression attached as column 'F'
temp_hist_df = temp_hist_df.withColumn('F', fahrenheit)

temp_hist_df.show(5)

+----------+---+----+-----------------+
|      Date| Id|   C|                F|
+----------+---+----+-----------------+
|2021-12-01|  1|20.0|             68.0|
|2021-12-02|  1|20.2|68.36000137329103|
|2021-12-03|  1|null|             null|
|2021-12-04|  1|20.3|68.53999862670898|
|2021-12-05|  1|20.5|             68.9|
+----------+---+----+-----------------+
only showing top 5 rows



## Drop existing columns

You can discard columns that you will not use by calling the `drop` method.

In [6]:
# drop() returns a new DataFrame without column 'F'.
temp_hist_without_f_df = temp_hist_df.drop('F')
temp_hist_without_f_df.show(n=5)

+----------+---+----+
|      Date| Id|   C|
+----------+---+----+
|2021-12-01|  1|20.0|
|2021-12-02|  1|20.2|
|2021-12-03|  1|null|
|2021-12-04|  1|20.3|
|2021-12-05|  1|20.5|
+----------+---+----+
only showing top 5 rows



In [7]:
# Chain drop() calls to remove several columns, leaving only 'Id'.
temp_hist_ids_df = (
    temp_hist_df
    .drop('F')
    .drop('C')
    .drop('Date')
)

temp_hist_ids_df.show(5)

+---+
| Id|
+---+
|  1|
|  1|
|  1|
|  1|
|  1|
+---+
only showing top 5 rows



## Merge DataFrames

An important step when creating features is combining the data from two DataFrames with `join`. The `how` argument accepts the following join types (aliases in parentheses):
* inner
* left (leftouter, left_outer)
* right (rightouter, right_outer)
* cross
* full (fullouter, full_outer)
* semi (leftsemi, left_semi)
* anti (leftanti, left_anti)

In [8]:
# Left-hand example frame: Id 7 has no counterpart in df2.
df1 = spark.createDataFrame(
    [
        (1, 'EX_1'),
        (2, 'EX_2'),
        (3, 'EX_3'),
        (7, 'EX_7'),
    ],
    schema=['Id', 'Description'],
)

# Right-hand example frame: Ids 2 and 3 are repeated, Ids 4 and 5 are not in df1.
df2 = spark.createDataFrame(
    [
        (1, 1),
        (2, 2),
        (2, 2),
        (3, 3),
        (3, 3),
        (4, 4),
        (5, 5),
    ],
    schema=['Id', 'Value'],
)

In [None]:
# Inner join: keep only the Ids present in both df1 and df2.
df1.join(other=df2, on='Id', how='inner').show()



+---+-----------+-----+
| Id|Description|Value|
+---+-----------+-----+
|  1|       EX_1|    1|
|  3|       EX_3|    3|
|  3|       EX_3|    3|
|  2|       EX_2|    2|
|  2|       EX_2|    2|
+---+-----------+-----+



                                                                                

In [None]:
# Left join: keep every row of df1; Value is null where df2 has no matching Id.
df1.join(other=df2, on='Id', how='left').show()



+---+-----------+-----+
| Id|Description|Value|
+---+-----------+-----+
|  7|       EX_7| null|
|  1|       EX_1|    1|
|  3|       EX_3|    3|
|  3|       EX_3|    3|
|  2|       EX_2|    2|
|  2|       EX_2|    2|
+---+-----------+-----+



                                                                                

In [None]:
# Right join: keep every row of df2; Description is null where df1 has no matching Id.
df1.join(other=df2, on='Id', how='right').show()

                                                                                

+---+-----------+-----+
| Id|Description|Value|
+---+-----------+-----+
|  5|       null|    5|
|  1|       EX_1|    1|
|  3|       EX_3|    3|
|  3|       EX_3|    3|
|  2|       EX_2|    2|
|  2|       EX_2|    2|
|  4|       null|    4|
+---+-----------+-----+



                                                                                

In [None]:
# Cross join: no join key is given, so every row of df1 is paired with every row of df2
df1.join(other=df2, how='cross').show()

+---+-----------+---+-----+
| Id|Description| Id|Value|
+---+-----------+---+-----+
|  1|       EX_1|  1|    1|
|  1|       EX_1|  2|    2|
|  1|       EX_1|  2|    2|
|  1|       EX_1|  3|    3|
|  1|       EX_1|  3|    3|
|  1|       EX_1|  4|    4|
|  1|       EX_1|  5|    5|
|  2|       EX_2|  1|    1|
|  2|       EX_2|  2|    2|
|  2|       EX_2|  2|    2|
|  2|       EX_2|  3|    3|
|  2|       EX_2|  3|    3|
|  2|       EX_2|  4|    4|
|  2|       EX_2|  5|    5|
|  3|       EX_3|  1|    1|
|  3|       EX_3|  2|    2|
|  3|       EX_3|  2|    2|
|  3|       EX_3|  3|    3|
|  3|       EX_3|  3|    3|
|  3|       EX_3|  4|    4|
+---+-----------+---+-----+
only showing top 20 rows



In [None]:
# Full outer join: return every row from both DataFrames, matched on Id;
# columns coming from the side without a match are filled with null
df1.join(df2, on='Id', how='full').show()

                                                                                

+---+-----------+-----+
| Id|Description|Value|
+---+-----------+-----+
|  7|       EX_7| null|
|  5|       null|    5|
|  1|       EX_1|    1|
|  3|       EX_3|    3|
|  3|       EX_3|    3|
|  2|       EX_2|    2|
|  2|       EX_2|    2|
|  4|       null|    4|
+---+-----------+-----+



                                                                                

In [None]:
# Semi join: return the rows of the left DataFrame whose Id has a match in the
# right DataFrame; only the left DataFrame's columns are kept
df1.join(df2, on='Id', how='semi').show()

                                                                                

+---+-----------+
| Id|Description|
+---+-----------+
|  1|       EX_1|
|  3|       EX_3|
|  2|       EX_2|
+---+-----------+



In [None]:
# Anti join: return the rows of the left DataFrame whose Id has NO match in the
# right DataFrame; only the left DataFrame's columns are kept
df1.join(df2, on='Id', how='anti').show()

+---+-----------+
| Id|Description|
+---+-----------+
|  7|       EX_7|
+---+-----------+



In [None]:
# Attach each beer's description to its temperature readings, matched on 'Id'.
beer_df = beer_description_df.join(other=temp_hist_df, on='Id', how='inner')
beer_df.show()

+---+-----------+--------+----------+----+------------------+
| Id|InitialDate|    Type|      Date|   C|                 F|
+---+-----------+--------+----------+----+------------------+
|  1| 2021-12-01|   Laget|2021-12-01|20.0|              68.0|
|  1| 2021-12-01|   Laget|2021-12-02|20.2| 68.36000137329103|
|  1| 2021-12-01|   Laget|2021-12-03|null|              null|
|  1| 2021-12-01|   Laget|2021-12-04|20.3| 68.53999862670898|
|  1| 2021-12-01|   Laget|2021-12-05|20.5|              68.9|
|  2| 2021-12-01|Pale Ale|2021-12-01|16.5|              61.7|
|  2| 2021-12-01|Pale Ale|2021-12-02|16.4| 61.51999931335449|
|  2| 2021-12-01|Pale Ale|2021-12-03|16.5|              61.7|
|  2| 2021-12-01|Pale Ale|2021-12-04|null|              null|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.8| 62.23999862670898|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.7|62.060001373291016|
|  3| 2021-12-01|    null|2021-12-01|18.3| 64.93999862670898|
|  3| 2021-12-01|    null|2021-12-02|18.4|  65.1199993133545|
|  3| 20

# PySpark functions

`pyspark.sql.functions` is a collection of built-in functions.


In [None]:
import pyspark.sql.functions as F

In [None]:
# Number of days from the beer's 'InitialDate' to each reading's 'Date'.
elapsed_days = F.datediff(end=beer_df['Date'], start=beer_df['InitialDate'])
beer_df = beer_df.withColumn(colName='ElapsedDays', col=elapsed_days)
beer_df.show(n=5)

+---+-----------+-----+----------+----+-----------------+-----------+
| Id|InitialDate| Type|      Date|   C|                F|ElapsedDays|
+---+-----------+-----+----------+----+-----------------+-----------+
|  1| 2021-12-01|Laget|2021-12-01|20.0|             68.0|          0|
|  1| 2021-12-01|Laget|2021-12-02|20.2|68.36000137329103|          1|
|  1| 2021-12-01|Laget|2021-12-03|null|             null|          2|
|  1| 2021-12-01|Laget|2021-12-04|20.3|68.53999862670898|          3|
|  1| 2021-12-01|Laget|2021-12-05|20.5|             68.9|          4|
+---+-----------+-----+----------+----+-----------------+-----------+
only showing top 5 rows



In [None]:
# Boolean flag: true where the reading was taken zero days after the initial date.
beer_df = beer_df.withColumn(
    colName='FirstDay',
    col=(beer_df['ElapsedDays'] == 0),
)
beer_df.show(n=5)

+---+-----------+-----+----------+----+-----------------+-----------+--------+
| Id|InitialDate| Type|      Date|   C|                F|ElapsedDays|FirstDay|
+---+-----------+-----+----------+----+-----------------+-----------+--------+
|  1| 2021-12-01|Laget|2021-12-01|20.0|             68.0|          0|    true|
|  1| 2021-12-01|Laget|2021-12-02|20.2|68.36000137329103|          1|   false|
|  1| 2021-12-01|Laget|2021-12-03|null|             null|          2|   false|
|  1| 2021-12-01|Laget|2021-12-04|20.3|68.53999862670898|          3|   false|
|  1| 2021-12-01|Laget|2021-12-05|20.5|             68.9|          4|   false|
+---+-----------+-----+----------+----+-----------------+-----------+--------+
only showing top 5 rows



## Sort DataFrame

In [None]:
# Sort by temperature 'C' in descending order.
beer_df.orderBy(beer_df['C'].desc()).show()

+---+-----------+--------+----------+----+------------------+-----------+--------+
| Id|InitialDate|    Type|      Date|   C|                 F|ElapsedDays|FirstDay|
+---+-----------+--------+----------+----+------------------+-----------+--------+
|  1| 2021-12-01|   Laget|2021-12-05|20.5|              68.9|          4|   false|
|  1| 2021-12-01|   Laget|2021-12-04|20.3| 68.53999862670898|          3|   false|
|  1| 2021-12-01|   Laget|2021-12-02|20.2| 68.36000137329103|          1|   false|
|  1| 2021-12-01|   Laget|2021-12-01|20.0|              68.0|          0|    true|
|  3| 2021-12-01|    null|2021-12-02|18.4|  65.1199993133545|          1|   false|
|  3| 2021-12-01|    null|2021-12-01|18.3| 64.93999862670898|          0|    true|
|  4| 2021-12-01|     Ipa|2021-12-01|18.2| 64.76000137329102|          0|    true|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.8| 62.23999862670898|          4|   false|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.7|62.060001373291016|          4|   false|
|  2

In [None]:
# Sort by 'Id' descending, then by 'C' within each Id (ascending is the default).
beer_df.orderBy(beer_df['Id'].desc(), beer_df['C']).show()

+---+-----------+--------+----------+----+------------------+-----------+--------+
| Id|InitialDate|    Type|      Date|   C|                 F|ElapsedDays|FirstDay|
+---+-----------+--------+----------+----+------------------+-----------+--------+
|  4| 2021-12-01|     Ipa|2021-12-01|18.2| 64.76000137329102|          0|    true|
|  3| 2021-12-01|    null|2021-12-03|null|              null|          2|   false|
|  3| 2021-12-01|    null|2021-12-01|18.3| 64.93999862670898|          0|    true|
|  3| 2021-12-01|    null|2021-12-02|18.4|  65.1199993133545|          1|   false|
|  2| 2021-12-01|Pale Ale|2021-12-04|null|              null|          3|   false|
|  2| 2021-12-01|Pale Ale|2021-12-02|16.4| 61.51999931335449|          1|   false|
|  2| 2021-12-01|Pale Ale|2021-12-03|16.5|              61.7|          2|   false|
|  2| 2021-12-01|Pale Ale|2021-12-01|16.5|              61.7|          0|    true|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.7|62.060001373291016|          4|   false|
|  2

## Drop Duplicated Values

In [None]:
# Keep a single row per 'Id'.
# NOTE(review): which row survives for each Id is presumably not guaranteed —
# confirm, and sort/aggregate explicitly if a specific row is needed.
beer_df.drop_duplicates(subset=['Id']).show()

                                                                                

+---+-----------+--------+----------+----+-----------------+-----------+--------+
| Id|InitialDate|    Type|      Date|   C|                F|ElapsedDays|FirstDay|
+---+-----------+--------+----------+----+-----------------+-----------+--------+
|  1| 2021-12-01|   Laget|2021-12-01|20.0|             68.0|          0|    true|
|  3| 2021-12-01|    null|2021-12-01|18.3|64.93999862670898|          0|    true|
|  4| 2021-12-01|     Ipa|2021-12-01|18.2|64.76000137329102|          0|    true|
|  2| 2021-12-01|Pale Ale|2021-12-01|16.5|             61.7|          0|    true|
+---+-----------+--------+----------+----+-----------------+-----------+--------+

