# Operations and Actions with DataFrame

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, FloatType, DateType, StringType

# Spark master URL of the remote server and the application name used
# when building the session.
master = 'spark://192.168.2.102:7077'
appName = 'Operations and Actions with DataFrame'

# Configure the builder one setting per line, then obtain the session.
spark = (
    SparkSession.builder
    .appName(appName)
    .master(master)
    .getOrCreate()
)

In [10]:
# Explicit schemas: each column is read with the type declared here.
# Both are built from (column name, Spark type) pairs.
temp_hist_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ('Date', DateType()),
        ('BeerId', IntegerType()),
        ('Temp', FloatType()),
    ]
])

beer_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ('Id', IntegerType()),
        ('InitialDate', DateType()),
        ('Type', StringType()),
    ]
])

# Temperature history: ';'-separated text file, read without a header option.
temp_hist_df = spark.read.csv(
    'data/beer_temp_hist.txt',
    schema=temp_hist_schema,
    sep=';',
)

# Beer descriptions: CSV file read with header=True.
beer_description_df = spark.read.csv(
    'data/beer.csv',
    header=True,
    schema=beer_schema,
)

temp_hist_df.show()
beer_description_df.show()

+----------+------+----+
|      Date|BeerId|Temp|
+----------+------+----+
|2021-12-01|     1|20.0|
|2021-12-02|     1|20.2|
|2021-12-03|     1|null|
|2021-12-04|     1|20.3|
|2021-12-05|     1|20.5|
|2021-12-01|     2|16.5|
|2021-12-02|     2|16.4|
|2021-12-03|     2|16.5|
|2021-12-04|     2|null|
|2021-12-05|     2|16.8|
|2021-12-05|     2|16.7|
|2021-12-01|     3|18.3|
|2021-12-02|     3|18.4|
|2021-12-03|     3|null|
|2021-12-01|     4|18.2|
+----------+------+----+

+---+-----------+--------+
| Id|InitialDate|    Type|
+---+-----------+--------+
|  1| 2021-12-01|   Laget|
|  2| 2021-12-01|Pale Ale|
|  3| 2021-12-01|    null|
|  4| 2021-12-01|     Ipa|
+---+-----------+--------+



## Rename column

You can rename a column with the `withColumnRenamed('ColumnName', 'NewColumnName')` method. Renaming does not modify the original DataFrame: a new DataFrame is created. Remember, Spark is lazy and will only compute the new DataFrame when an action is called, in this case `show`.

You can also chain operations, as in the second example. There we assign the new DataFrame to the same variable, so the old `temp_hist_df` object is no longer referenced and can be garbage-collected.

In [11]:
# The rename produces a new DataFrame; temp_hist_df keeps its 'Temp' column.
temp_hist_renamend_df = temp_hist_df.withColumnRenamed(existing='Temp', new='C')
temp_hist_renamend_df.show(n=5)

+----------+------+----+
|      Date|BeerId|   C|
+----------+------+----+
|2021-12-01|     1|20.0|
|2021-12-02|     1|20.2|
|2021-12-03|     1|null|
|2021-12-04|     1|20.3|
|2021-12-05|     1|20.5|
+----------+------+----+
only showing top 5 rows



In [12]:
# Chain both renames and rebind the result to the same variable name.
temp_hist_df = (
    temp_hist_df
    .withColumnRenamed('Temp', 'C')
    .withColumnRenamed('BeerId', 'Id')
)

temp_hist_df.show(n=5)

+----------+---+----+
|      Date| Id|   C|
+----------+---+----+
|2021-12-01|  1|20.0|
|2021-12-02|  1|20.2|
|2021-12-03|  1|null|
|2021-12-04|  1|20.3|
|2021-12-05|  1|20.5|
+----------+---+----+
only showing top 5 rows



## Create new columns

Creating columns is important for building new features. We can create a column from an operation on existing columns and add it to the DataFrame with the `withColumn` method. In the example below, we convert Celsius to Fahrenheit.

$F = C \times 1.8 + 32$

In [13]:
# Convert C to F
# Column expression converting the Celsius column 'C' to Fahrenheit
F = temp_hist_df['C'] * 1.8 + 32

# Add the expression as column 'F' and rebind temp_hist_df to the result
temp_hist_df = temp_hist_df.withColumn('F', F)

temp_hist_df.show(n=5)

+----------+---+----+-----------------+
|      Date| Id|   C|                F|
+----------+---+----+-----------------+
|2021-12-01|  1|20.0|             68.0|
|2021-12-02|  1|20.2|68.36000137329103|
|2021-12-03|  1|null|             null|
|2021-12-04|  1|20.3|68.53999862670898|
|2021-12-05|  1|20.5|             68.9|
+----------+---+----+-----------------+
only showing top 5 rows



## Drop an existing column

You can discard columns that you will not use by calling the `drop` method.

In [14]:
# Build a DataFrame without the 'F' column; temp_hist_df is not reassigned here.
temp_hist_without_f_df = temp_hist_df.drop('F')
temp_hist_without_f_df.show(n=5)

+----------+---+----+
|      Date| Id|   C|
+----------+---+----+
|2021-12-01|  1|20.0|
|2021-12-02|  1|20.2|
|2021-12-03|  1|null|
|2021-12-04|  1|20.3|
|2021-12-05|  1|20.5|
+----------+---+----+
only showing top 5 rows



In [15]:
# Remove 'F', 'C' and 'Date' in a single drop call, leaving only 'Id'.
temp_hist_ids_df = temp_hist_df.drop('F', 'C', 'Date')

temp_hist_ids_df.show(n=5)

+---+
| Id|
+---+
|  1|
|  1|
|  1|
|  1|
|  1|
+---+
only showing top 5 rows



## Merge DataFrames

An important step in creating features is to combine the data from two DataFrames.

In [17]:
# Inner join of the beer descriptions with the temperature history on
# the shared 'Id' column.
beer_df = beer_description_df.join(
    other=temp_hist_df,
    on='Id',
    how='inner',
)
beer_df.show()

+---+-----------+--------+----------+----+------------------+
| Id|InitialDate|    Type|      Date|   C|                 F|
+---+-----------+--------+----------+----+------------------+
|  1| 2021-12-01|   Laget|2021-12-01|20.0|              68.0|
|  1| 2021-12-01|   Laget|2021-12-02|20.2| 68.36000137329103|
|  1| 2021-12-01|   Laget|2021-12-03|null|              null|
|  1| 2021-12-01|   Laget|2021-12-04|20.3| 68.53999862670898|
|  1| 2021-12-01|   Laget|2021-12-05|20.5|              68.9|
|  2| 2021-12-01|Pale Ale|2021-12-01|16.5|              61.7|
|  2| 2021-12-01|Pale Ale|2021-12-02|16.4| 61.51999931335449|
|  2| 2021-12-01|Pale Ale|2021-12-03|16.5|              61.7|
|  2| 2021-12-01|Pale Ale|2021-12-04|null|              null|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.8| 62.23999862670898|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.7|62.060001373291016|
|  3| 2021-12-01|    null|2021-12-01|18.3| 64.93999862670898|
|  3| 2021-12-01|    null|2021-12-02|18.4|  65.1199993133545|
|  3| 20