In [1]:
import os

import pyspark
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the SparkSession for this notebook. Despite the
# conventional-looking name `sc`, this is a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .appName("data-types")
    .getOrCreate()
)

In [3]:
# Location of the retail CSV. Set the RETAIL_CSV_PATH environment variable to
# point at your own copy of the data; when it is unset, the original author's
# local path is used so existing behaviour is unchanged.
file_path = os.environ.get(
    "RETAIL_CSV_PATH",
    "/home/viethoang/petproject/20202/BigData101/data/retail.csv",
)

In [5]:
# Load the retail CSV: the first line is used as the header and Spark is asked
# to infer each column's type from the data.
df = (
    sc.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(file_path)
)

In [6]:
# Peek at the first two rows to confirm the columns loaded as expected.
df.show(n=2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows



In [10]:
# Convert native Python values into Spark literal columns.
from pyspark.sql.functions import lit

df.select(
    lit(5),
    lit("five"),
    lit(5.0),
)

DataFrame[5: int, five: string, 5.0: double]

In [16]:
from pyspark.sql.functions import col

# InvoiceNo is loaded as a string column, so compare it with a string literal.
# Comparing against the integer 536365 makes Spark implicitly cast the column
# to a number; any invoice number that is not purely numeric then becomes null
# and is silently dropped by the filter.
(
    df.where(col("InvoiceNo") != "536365")
    .select("InvoiceNo", "Description")
    .show(5, False)
)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



### Working with strings

In [17]:
from pyspark.sql.functions import initcap

# initcap capitalizes the first letter of every word in the string.
df.select(
    initcap(col("Description"))
).show()

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
|Set 7 Babushka Ne...|
|Glass Star Froste...|
|Hand Warmer Union...|
|Hand Warmer Red P...|
|Assorted Colour B...|
|Poppy's Playhouse...|
|Poppy's Playhouse...|
|Feltcraft Princes...|
|Ivory Knitted Mug...|
|Box Of 6 Assorted...|
|Box Of Vintage Ji...|
|Box Of Vintage Al...|
|Home Building Blo...|
|Love Building Blo...|
|Recipe Box With M...|
+--------------------+
only showing top 20 rows



In [21]:
from pyspark.sql.functions import lit, lpad, ltrim, rpad, rtrim, trim

# Whitespace trimming and padding on literal strings. The "lp" target length
# (3) is shorter than "HELLO", so lpad truncates the value instead of padding.
df.select(
    ltrim(lit("    HELLO   ")).alias("ltrim"),
    rtrim(lit("    HELLO   ")).alias("rtrim"),
    trim(lit("     HELLO   ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp"),
).show(n=2)

+--------+---------+-----+---+----------+
|   ltrim|    rtrim| trim| lp|        rp|
+--------+---------+-----+---+----------+
|HELLO   |    HELLO|HELLO|HEL|HELLO     |
|HELLO   |    HELLO|HELLO|HEL|HELLO     |
+--------+---------+-----+---+----------+
only showing top 2 rows



In [24]:
# Regex: replace any of the listed colour names with the token "COLOR".
from pyspark.sql.functions import regexp_replace

regex_string = "RED|WHITE|BLACK|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description"),
).show()

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
|CREAM CUPID HEART...|CREAM CUPID HEART...|
|KNITTED UNION FLA...|KNITTED UNION FLA...|
|COLOR WOOLLY HOTT...|RED WOOLLY HOTTIE...|
|SET 7 BABUSHKA NE...|SET 7 BABUSHKA NE...|
|GLASS STAR FROSTE...|GLASS STAR FROSTE...|
|HAND WARMER UNION...|HAND WARMER UNION...|
|HAND WARMER COLOR...|HAND WARMER RED P...|
|ASSORTED COLOUR B...|ASSORTED COLOUR B...|
|POPPY'S PLAYHOUSE...|POPPY'S PLAYHOUSE...|
|POPPY'S PLAYHOUSE...|POPPY'S PLAYHOUSE...|
|FELTCRAFT PRINCES...|FELTCRAFT PRINCES...|
|IVORY KNITTED MUG...|IVORY KNITTED MUG...|
|BOX OF 6 ASSORTED...|BOX OF 6 ASSORTED...|
|BOX OF VINTAGE JI...|BOX OF VINTAGE JI...|
|BOX OF VINTAGE AL...|BOX OF VINTAGE AL...|
|HOME BUILDING BLO...|HOME BUILDING BLO...|
|LOVE BUILDING BLO...|LOVE BUILDING BLO...|
|RECIPE BOX WITH M...|RECIPE BOX

In [25]:
from pyspark.sql.functions import translate

# translate substitutes character by character: L -> 1, E -> 3, T -> 7.
df.select(
    translate(col("Description"), "LEET", "1337"),
    col("Description"),
).show(n=2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



In [30]:
# Working with dates and timestamps
from pyspark.sql.functions import (
    current_date,
    current_timestamp,
    date_add,
    date_sub,
)

# Ten rows, each stamped with the current date and the current timestamp.
dateDF = (
    sc.range(10)
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)
dateDF.createOrReplaceTempView("dateTable")

In [31]:
# Five days before and five days after today's date.
dateDF.select(
    date_sub(col("today"), 5),
    date_add(col("today"), 5),
).show(n=1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2021-05-26|        2021-06-05|
+------------------+------------------+
only showing top 1 row



In [32]:
from pyspark.sql.functions import datediff, months_between, to_date

# Number of days from today to a date one week earlier.
(
    dateDF.withColumn("week_ago", date_sub(col("today"), 7))
    .select(datediff(col("week_ago"), col("today")))
    .show(1)
)

# Number of months between two fixed dates parsed from string literals.
(
    dateDF.select(
        to_date(lit("2016-01-01")).alias("start"),
        to_date(lit("2017-05-22")).alias("end"),
    )
    .select(months_between(col("start"), col("end")))
    .show(1)
)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                    -16.67741935|
+--------------------------------+
only showing top 1 row



In [36]:
from pyspark.sql.functions import to_date

# Parse date strings with an explicit year-day-month pattern.
dateFormat = "yyyy-dd-MM"
cleanDateDF = (
    sc.range(1)
    .select(
        to_date(lit("2017-12-11"), dateFormat).alias("date"),
        to_date(lit("2017-20-12"), dateFormat).alias("date2"),
    )
)
cleanDateDF.show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



In [None]:
from pyspark.sql.functions import coalesce

# coalesce picks the first non-null value across the given columns.
# Null/NA handling here is much the same as in pandas.
df.select(
    coalesce(col("Description"), col("CustomerId"))
).show()

### Working with complex types
* Struct
* Array
* Map

In [39]:
# Struct: bundle two columns into a single struct column named "complex".
from pyspark.sql.functions import struct

complexDF = df.select(
    struct("Description", "InvoiceNo").alias("complex")
)
complexDF.createOrReplaceTempView("complexDF")

In [45]:
# Show the struct column itself, then pull a single field out of it.
complexDF.show(5, truncate=False)
complexDF.select(
    col("complex").getField("Description")
).show(5, truncate=False)

+---------------------------------------------+
|complex                                      |
+---------------------------------------------+
|{WHITE HANGING HEART T-LIGHT HOLDER, 536365} |
|{WHITE METAL LANTERN, 536365}                |
|{CREAM CUPID HEARTS COAT HANGER, 536365}     |
|{KNITTED UNION FLAG HOT WATER BOTTLE, 536365}|
|{RED WOOLLY HOTTIE WHITE HEART., 536365}     |
+---------------------------------------------+
only showing top 5 rows

+-----------------------------------+
|complex.Description                |
+-----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |
|WHITE METAL LANTERN                |
|CREAM CUPID HEARTS COAT HANGER     |
|KNITTED UNION FLAG HOT WATER BOTTLE|
|RED WOOLLY HOTTIE WHITE HEART.     |
+-----------------------------------+
only showing top 5 rows



In [49]:
# Arrays and their functions: split
from pyspark.sql.functions import split

# Split each description on single spaces into an array of words.
df.select(split(col("Description"), " ")).show(2, truncate=False)

# Alias the array column so its first element can be addressed in a SQL expression.
(
    df.select(split(col("Description"), " ").alias("array_col"))
    .selectExpr("array_col[0]")
    .show(2)
)

+----------------------------------------+
|split(Description,  , -1)               |
+----------------------------------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|
|[WHITE, METAL, LANTERN]                 |
+----------------------------------------+
only showing top 2 rows

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



In [52]:
# array_contains: does the array of words in each description include "WHITE"?
from pyspark.sql.functions import array_contains

print("******Array contains test*******")
df.select(
    array_contains(split(col("Description"), " "), "WHITE")
).show(n=2)

******Array contains test*******
+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
+------------------------------------------------+
only showing top 2 rows



In [57]:
from pyspark.sql.functions import explode, split

# explode takes an array column and emits one output row per array element,
# duplicating the remaining column values of the source row.
(
    df.withColumn("splitted", split(col("Description"), " "))
    .withColumn("exploded", explode(col("splitted")))
    .select("Description", "InvoiceNo", "exploded")
    .show(10, truncate=False)
)

+----------------------------------+---------+--------+
|Description                       |InvoiceNo|exploded|
+----------------------------------+---------+--------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |WHITE   |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HANGING |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HEART   |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |T-LIGHT |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HOLDER  |
|WHITE METAL LANTERN               |536365   |WHITE   |
|WHITE METAL LANTERN               |536365   |METAL   |
|WHITE METAL LANTERN               |536365   |LANTERN |
|CREAM CUPID HEARTS COAT HANGER    |536365   |CREAM   |
|CREAM CUPID HEARTS COAT HANGER    |536365   |CUPID   |
+----------------------------------+---------+--------+
only showing top 10 rows



In [65]:
# Map: build a one-entry map per row, keyed by Description with InvoiceNo as the value.
from pyspark.sql.functions import col, create_map

(
    df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))
    .show(2)
)

# Look a key up in the map; rows whose map lacks that key yield null.
(
    df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))
    .selectExpr("complex_map['WHITE METAL LANTERN']")
    .show(2)
)

+--------------------+
|         complex_map|
+--------------------+
|{WHITE HANGING HE...|
|{WHITE METAL LANT...|
+--------------------+
only showing top 2 rows

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
+--------------------------------+
only showing top 2 rows



In [67]:
# JSON: serialise a struct column into a JSON string.
from pyspark.sql.functions import to_json

(
    df.selectExpr("(InvoiceNo, Description) as myStruct")
    .select(to_json(col("myStruct")))
    .show()
)

+--------------------+
|   to_json(myStruct)|
+--------------------+
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
+--------------------+
only showing top 20 rows

