In [87]:
import findspark
findspark.init()  # locate the Spark installation before pyspark is imported
from pyspark.sql import SparkSession

# Attach to the standalone cluster master, reusing an existing session if one is running.
spark = (
    SparkSession.builder
    .appName("BasicDFOperationsAppV2")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# Load the retail CSV: the first row holds the column names and the column
# types are inferred from the data.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/home/jovyan/data/2010-12-01.csv")
)

df.printSchema()
df.show(5)

                                                                                

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   8

We can use the `lit` function to convert values of the programming language's native types into the corresponding Spark data types (for example, a Python string becomes a `StringType` column). This lets us add a literal or constant column to a DataFrame.

In [88]:
from pyspark.sql.functions import lit, col
# Append three constant columns (an integer, a string and a double) to every row.
literal_columns = [lit(5), lit("five"), lit(5.0)]
df.select("*", *literal_columns).show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+---+----+---+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|  5|five|5.0|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+---+----+---+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|  5|five|5.0|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|  5|five|5.0|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|  5|five|5.0|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|  5|five|5.0|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|  5|five|5.0|
+---------+---------+-----------

We can use logical operations to build the Boolean expression for filtering data with the where function.

In [89]:
from pyspark.sql.functions import col

# InvoiceNo is a string column (see the printed schema), so compare it with a
# string literal. Comparing against the bare number 536365 makes Spark cast
# InvoiceNo to a numeric type; any non-numeric invoice number then becomes
# null and its row is silently dropped by the filter.
(
    df.where(col("InvoiceNo") != "536365")
    .select("InvoiceNo", "Description")
    .show(5, False)
)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



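We can build more complex Boolean expressions by combining Spark functions and by chaining several `where` calls.
The function **instr** locates the position of the first occurrence of a substring in the given string column. The position is 1-based, and 0 is returned when the substring is not found, which is why the filter below tests for `>= 1`.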

In [90]:
from pyspark.sql.functions import instr

# A row passes when it is expensive OR its description mentions postage.
# instr() yields a 1-based position, so ">= 1" means "substring is present".
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1

# Restrict to the "DOT" stock code first, then apply the combined condition.
(
    df.where(col("StockCode").isin("DOT"))
    .where(priceFilter | descripFilter)
    .show(5)
)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



We can use the expressions as part of the select function. The **alias** function - returns this column aliased with a new name or names

In [91]:
from pyspark.sql.functions import expr, pow
# Derived column: (Quantity * UnitPrice)^2 + 5.
customer_id = expr("CustomerId")
line_total = col("Quantity") * col("UnitPrice")
newQuantity = pow(line_total, 2) + 5

# Without an alias the column header is the generated expression text ...
df.select(customer_id, newQuantity).show(2)
# ... with alias() the header becomes the readable name "QuantityX".
df.select(customer_id, newQuantity.alias("QuantityX")).show(2)

+----------+----------------------------------------+
|CustomerId|(POWER((Quantity * UnitPrice), 2.0) + 5)|
+----------+----------------------------------------+
|   17850.0|                      239.08999999999997|
|   17850.0|                                418.7156|
+----------+----------------------------------------+
only showing top 2 rows

+----------+------------------+
|CustomerId|         QuantityX|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [92]:
# The same derived column, written as SQL expression strings.
sql_columns = [
    "CustomerId",
    "(POWER((Quantity * UnitPrice), 2.0) + 5) as QuantityX",
]
df.selectExpr(*sql_columns).show(2)

+----------+------------------+
|CustomerId|         QuantityX|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



We can round off the numerical data in rows.

In [93]:
from pyspark.sql.functions import lit, round, bround
# round() rounds half up (2.5 -> 3.0); bround() rounds half to even (2.5 -> 2.0).
# Pass a numeric literal rather than the string "2.5" so the rounding does not
# depend on an implicit string-to-number cast.
# NOTE: `round` here is pyspark.sql.functions.round, imported in this cell; it
# shadows Python's built-in round() from this point on.
df.select("*", round(lit(2.5)), bround(lit(2.5))).show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|round(2.5, 0)|bround(2.5, 0)|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|          3.0|           2.0|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|          3.0|           2.0|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------------+--------------+
only showing top 2 rows



We can find the correlation between the data in two columns. 

In [94]:
from pyspark.sql.functions import corr

# Correlation coefficient between Quantity and UnitPrice as a one-row result.
quantity_price_corr = corr("Quantity", "UnitPrice").alias("Quan_Price_Corr")
df.select(quantity_price_corr).show()


+--------------------+
|     Quan_Price_Corr|
+--------------------+
|-0.04112314436835551|
+--------------------+



Spark provides more useful statistics functions; see the `DataFrameStatFunctions` documentation:

https://spark.apache.org/docs/2.4.4/api/python/pyspark.sql.html#pyspark.sql.DataFrameStatFunctions

- `crosstab(col1, col2)`: computes a pair-wise frequency table of the given columns.
- `freqItems(cols, support=None)`: finds frequent items for columns, possibly with false positives.

In [95]:
stat_functions = df.stat

# Frequently occurring values per column (may include false positives).
stat_functions.freqItems(["StockCode", "Quantity"]).show(5)

# Pair-wise frequency table: one row per StockCode, one column per Quantity value.
stat_functions.crosstab("StockCode", "Quantity").show(5)

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[90214E, 20728, 2...|[200, 128, 23, 32...|
+--------------------+--------------------+



                                                                                

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|StockCode_Quantity| -1|-10|-12| -2|-24| -3| -4| -5| -6| -7|  1| 10|100| 11| 12|120|128| 13| 14|144| 15| 16| 17| 18| 19|192|  2| 20|200| 21|216| 22| 23| 24| 25|252| 27| 28|288|  3| 30| 32| 33| 34| 36|384|  4| 40|432| 47| 48|480|  5| 50| 56|  6| 60|600| 64|  7| 70| 72|  8| 80|  9| 96|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|             22578|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

Change from lowercase to upper case and vice versa.

In [96]:
from pyspark.sql.functions import lower, upper

# Show the original text, its lower-cased form, and the lower-cased form
# converted back to upper case.
description = col("Description")
lowered = lower(description)
df.select(description, lowered, upper(lowered)).show(2)

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
+--------------------+--------------------+-------------------------+
only showing top 2 rows



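We can remove leading and/or trailing whitespace from text, and pad text to a fixed length. Note that padding to a length shorter than the text truncates it.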

In [97]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim

# Trimming operates on a value with surrounding spaces; padding on a bare word.
spaced = lit(" HELLO ")
bare = lit("HELLO")

trim_pad_columns = [
    ltrim(spaced).alias("ltrim"),       # strip leading spaces only
    rtrim(spaced).alias("rtrim"),       # strip trailing spaces only
    trim(spaced).alias("trim"),         # strip both sides
    lpad(bare, 3, " ").alias("lp"),     # target length 3 < 5, so the text is cut to "HEL"
    rpad(bare, 10, " ").alias("rp"),    # right-pad with spaces up to length 10
]
df.select(*trim_pad_columns).show(2)

+------+------+-----+---+----------+
| ltrim| rtrim| trim| lp|        rp|
+------+------+-----+---+----------+
|HELLO | HELLO|HELLO|HEL|HELLO     |
|HELLO | HELLO|HELLO|HEL|HELLO     |
+------+------+-----+---+----------+
only showing top 2 rows



We can use regular expression (Java regular expression syntax) for extracting and replacing text fragments.

In [98]:
from pyspark.sql.functions import regexp_replace

# Alternation pattern matching any of the listed colour names.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"

# Replace every matched colour name with the placeholder "COLOR" and show the
# result next to the original description.
color_clean = regexp_replace(col("Description"), regex_string, "COLOR")
df.select(color_clean.alias("color_clean"), col("Description")).show(2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



We can add/use the current date and time in the data set. **range()** creates a DataFrame with a single LongType column named `id`, containing the elements in a range. **toDF()** returns a new DataFrame with the specified new column names.

In [99]:
from pyspark.sql.functions import current_date, current_timestamp

# Start from a ten-row DataFrame with a single "id" column.
dateDF = spark.range(10)
dateDF.show(3, truncate=False)

# Rename "id" to "RID" and attach the current date and timestamp to every row.
dateDF = (
    dateDF.toDF("RID")
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)
dateDF.printSchema()
dateDF.show(3, truncate=False)

+---+
|id |
+---+
|0  |
|1  |
|2  |
+---+
only showing top 3 rows

root
 |-- RID: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)

+---+----------+--------------------------+
|RID|today     |now                       |
+---+----------+--------------------------+
|0  |2021-10-20|2021-10-20 20:39:04.328471|
|1  |2021-10-20|2021-10-20 20:39:04.328471|
|2  |2021-10-20|2021-10-20 20:39:04.328471|
+---+----------+--------------------------+
only showing top 3 rows



We can add/subtract dates. 

In [100]:
from pyspark.sql.functions import date_add, date_sub
# Five days before and five days after the "today" column.
today = col("today")
dateDF.select(date_sub(today, 5), date_add(today, 5)).show(1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2021-10-15|        2021-10-25|
+------------------+------------------+
only showing top 1 row



We can take the difference between dates. 

In [101]:
from pyspark.sql.functions import datediff
# Add a column seven days in the past, then count the days from "today" to it.
# date_sub comes from the import in the previous cell.
with_week_ago = dateDF.withColumn("week_ago", date_sub(col("today"), 7))
with_week_ago.select(datediff(col("week_ago"), col("today"))).show(1)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row



We can convert date or timestamp strings to the corresponding Spark data types.

In [102]:
from pyspark.sql.functions import to_date, lit

# Build five rows carrying the same date string, then parse it into a DateType column.
dfdate = (
    spark.range(5)
    .withColumn("date", lit("2017-01-01"))
    .select(to_date(col("date")))
)
dfdate.printSchema()
dfdate.show(3, truncate=False)

root
 |-- to_date(date): date (nullable = true)

+-------------+
|to_date(date)|
+-------------+
|2017-01-01   |
|2017-01-01   |
|2017-01-01   |
+-------------+
only showing top 3 rows



**coalesce** returns the first non-null value among a set of columns, evaluated row by row. See https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.coalesce.html

In [103]:
from pyspark.sql.functions import coalesce

# Per row: use Description when it is not null, otherwise fall back to CustomerId.
first_non_null = coalesce(col("Description"), col("CustomerId"))
df.select(first_non_null).show(10, truncate=False)

+-----------------------------------+
|coalesce(Description, CustomerId)  |
+-----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |
|WHITE METAL LANTERN                |
|CREAM CUPID HEARTS COAT HANGER     |
|KNITTED UNION FLAG HOT WATER BOTTLE|
|RED WOOLLY HOTTIE WHITE HEART.     |
|SET 7 BABUSHKA NESTING BOXES       |
|GLASS STAR FROSTED T-LIGHT HOLDER  |
|HAND WARMER UNION JACK             |
|HAND WARMER RED POLKA DOT          |
|ASSORTED COLOUR BIRD ORNAMENT      |
+-----------------------------------+
only showing top 10 rows



We can remove null values with drop function. We can consider a subset of columns. If ‘any’, drop a row if it contains any nulls. If ‘all’, drop a row only if all its values are null.

In [104]:
# Each call returns a new DataFrame and leaves df untouched; the results are
# not assigned, so only the last expression is displayed by the notebook.
df.na.drop()                    # default: drop rows containing any null
df.na.drop(how="any")           # same behaviour, spelled out
df.na.drop(how="all")           # drop rows only when every value is null
df.na.drop(how="all", subset=["StockCode", "InvoiceNo"])  # consider only these columns

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

We can also fill null values. 

In [105]:
# Each call returns a new DataFrame and leaves df untouched; the results are
# not assigned, so only the last expression is displayed by the notebook.
df.na.fill(value="All Null values become this string")
df.na.fill(value=5)

# A dict maps column name -> replacement value, giving per-column control.
fill_cols_vals = dict(StockCode=5, Description="No Value")
df.na.fill(fill_cols_vals)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

We can also replace some text with another text. 

In [106]:
# Replace empty-string descriptions with the marker "UNKNOWN".
df.na.replace(to_replace=[""], value=["UNKNOWN"], subset="Description").show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows



We can create nested dataframes/tables with struct function. 

In [107]:
from pyspark.sql.functions import struct
# Pack two columns into a single struct-typed column named "complex".
nested_column = struct("Description", "InvoiceNo").alias("complex")
complexDF = df.select(nested_column)
complexDF.printSchema()
complexDF.show(2, truncate=False)

root
 |-- complex: struct (nullable = false)
 |    |-- Description: string (nullable = true)
 |    |-- InvoiceNo: string (nullable = true)

+--------------------------------------------+
|complex                                     |
+--------------------------------------------+
|{WHITE HANGING HEART T-LIGHT HOLDER, 536365}|
|{WHITE METAL LANTERN, 536365}               |
+--------------------------------------------+
only showing top 2 rows



We can also flatten a nested (struct) column back into top-level columns by selecting all of its fields with the `.*` syntax.

In [108]:
# "complex.*" expands every field of the struct into its own top-level column.
struct_fields = "complex.*"
df3 = complexDF.select(struct_fields)
df3.printSchema()
df3.show(2, truncate=False)

root
 |-- Description: string (nullable = true)
 |-- InvoiceNo: string (nullable = true)

+----------------------------------+---------+
|Description                       |InvoiceNo|
+----------------------------------+---------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |
|WHITE METAL LANTERN               |536365   |
+----------------------------------+---------+
only showing top 2 rows



We can create arrays by splitting a text, check the size of an array, and check if an item is in an array. 

In [109]:
from pyspark.sql.functions import split, size, array_contains
# Split each description on single spaces to obtain an array of words.
words = split(col("Description"), " ")
df.select(words.alias("array_col")).show(2)

# Index into the array with SQL syntax to get the first word.
df.select(words.alias("array_col")).selectExpr("array_col[0]").show(2)

# Number of words per description.
df.select(size(words)).show(2)

# Whether the word list contains "WHITE".
df.select(array_contains(words, "WHITE")).show(2)

+--------------------+
|           array_col|
+--------------------+
|[WHITE, HANGING, ...|
|[WHITE, METAL, LA...|
+--------------------+
only showing top 2 rows

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows

+-------------------------------+
|size(split(Description,  , -1))|
+-------------------------------+
|                              5|
|                              3|
+-------------------------------+
only showing top 2 rows

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
+------------------------------------------------+
only showing top 2 rows



We can also explode/flatten an array.

In [110]:
from pyspark.sql.functions import split, explode

# Add the word array as a new column.
with_words = df.withColumn("splitted", split(col("Description"), " "))
with_words.show(2)

# explode() emits one output row per array element, repeating the other columns.
(
    with_words
    .withColumn("exploded", explode(col("splitted")))
    .select("Description", "InvoiceNo", "exploded")
    .show(2)
)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+--------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|            splitted|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+--------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|[WHITE, HANGING, ...|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|[WHITE, METAL, LA...|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+--------------------+
only showing top 2 rows

+--------------------+---------+--------+
|         Description|InvoiceNo|exploded|
+--------------------+---------+--------+
|WHITE HANGING HEA...|   536365|   WHITE|
|WHITE HANGING HEA...|   536365| 

We can also create maps. We can also access the data in a map via a key. 

In [111]:
from pyspark.sql.functions import create_map
# One single-entry map per row: Description -> InvoiceNo.
description_to_invoice = create_map(col("Description"), col("InvoiceNo"))
df4 = df.select(description_to_invoice.alias("complex_map"))
df4.show(2, truncate=False)

# Look up a key; rows whose map lacks that key yield null.
df4.selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

+----------------------------------------------+
|complex_map                                   |
+----------------------------------------------+
|{WHITE HANGING HEART T-LIGHT HOLDER -> 536365}|
|{WHITE METAL LANTERN -> 536365}               |
+----------------------------------------------+
only showing top 2 rows

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
+--------------------------------+
only showing top 2 rows



We can explode maps as well.

In [112]:
# Exploding a map produces one row per entry, with "key" and "value" columns.
exploded_entries = df4.selectExpr("explode(complex_map)")
exploded_entries.show(2, truncate=False)

+----------------------------------+------+
|key                               |value |
+----------------------------------+------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365|
|WHITE METAL LANTERN               |536365|
+----------------------------------+------+
only showing top 2 rows



Spark provides specific support for JSON data. We can convert a struct into a JSON string and vice versa.

In [113]:
from pyspark.sql.functions import from_json, to_json
from pyspark.sql.types import *

# Schema used to parse the JSON strings back into a struct. The fields are
# passed as a list (the documented argument type) rather than a tuple, so the
# resulting StructType keeps a mutable list of fields.
parseSchema = StructType([
    StructField("InvoiceNo", StringType(), True),
    StructField("Description", StringType(), True),
])

# struct -> JSON string -> struct round trip:
#   1. build a struct from two columns,
#   2. serialise it with to_json,
#   3. parse it back with from_json, keeping the JSON text alongside for comparison.
df2 = (
    df.selectExpr("(InvoiceNo, Description) as myStruct")
    .select(to_json(col("myStruct")).alias("newJSON"))
    .select(from_json(col("newJSON"), parseSchema).alias("NewStruct"), col("newJSON"))
)
df2.printSchema()
df2.show(2, False)

root
 |-- NewStruct: struct (nullable = true)
 |    |-- InvoiceNo: string (nullable = true)
 |    |-- Description: string (nullable = true)
 |-- newJSON: string (nullable = true)

+--------------------------------------------+-------------------------------------------------------------------------+
|NewStruct                                   |newJSON                                                                  |
+--------------------------------------------+-------------------------------------------------------------------------+
|{536365, WHITE HANGING HEART T-LIGHT HOLDER}|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"}|
|{536365, WHITE METAL LANTERN}               |{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}               |
+--------------------------------------------+-------------------------------------------------------------------------+
only showing top 2 rows



We can also define and use our own custom functions as part of transformations (within expressions).

In [114]:
# Stop the SparkSession created in the first cell now that the notebook is finished.
spark.stop()