# Data Types

In [2]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml.stat as ml_stat
import pyspark.sql.functions as func
import pyspark.sql.types as types
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [3]:
spark = SparkSession.builder.appName('Spark Test App').getOrCreate() 
sc = spark.sparkContext

In [4]:
# Create Data Frame from a csv file with inferred schema
df_auto = spark.read.csv('Auto.csv', header=True, inferSchema=True)
df_auto.show(5)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|18.0|        8|       307.0|       130|  3504|        12.0|  70|     1|chevrolet chevell...|
|15.0|        8|       350.0|       165|  3693|        11.5|  70|     1|   buick skylark 320|
|18.0|        8|       318.0|       150|  3436|        11.0|  70|     1|  plymouth satellite|
|16.0|        8|       304.0|       150|  3433|        12.0|  70|     1|       amc rebel sst|
|17.0|        8|       302.0|       140|  3449|        10.5|  70|     1|         ford torino|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 5 rows



### Date and Time

In [5]:
# Current date and time
df = spark.range(10) \
    .withColumn('date', func.current_date()) \
    .withColumn('timestamp', func.current_timestamp())
df.show(5)

+---+----------+--------------------+
| id|      date|           timestamp|
+---+----------+--------------------+
|  0|2020-05-07|2020-05-07 19:42:...|
|  1|2020-05-07|2020-05-07 19:42:...|
|  2|2020-05-07|2020-05-07 19:42:...|
|  3|2020-05-07|2020-05-07 19:42:...|
|  4|2020-05-07|2020-05-07 19:42:...|
+---+----------+--------------------+
only showing top 5 rows



In [6]:
# Adding and subtracting dates
df.withColumn('tomorrow', func.date_add('date', 1)) \
    .withColumn('yesterday', func.date_add('date', -1)) \
.show(5)

+---+----------+--------------------+----------+----------+
| id|      date|           timestamp|  tomorrow| yesterday|
+---+----------+--------------------+----------+----------+
|  0|2020-05-07|2020-05-07 19:42:...|2020-05-08|2020-05-06|
|  1|2020-05-07|2020-05-07 19:42:...|2020-05-08|2020-05-06|
|  2|2020-05-07|2020-05-07 19:42:...|2020-05-08|2020-05-06|
|  3|2020-05-07|2020-05-07 19:42:...|2020-05-08|2020-05-06|
|  4|2020-05-07|2020-05-07 19:42:...|2020-05-08|2020-05-06|
+---+----------+--------------------+----------+----------+
only showing top 5 rows



In [7]:
# Date from string of the form yyyy-mm-dd (in sql)
df.withColumn('date_from_string', func.expr("cast('2020-06-01' as date)")).show(5)

+---+----------+--------------------+----------------+
| id|      date|           timestamp|date_from_string|
+---+----------+--------------------+----------------+
|  0|2020-05-07|2020-05-07 19:42:...|      2020-06-01|
|  1|2020-05-07|2020-05-07 19:42:...|      2020-06-01|
|  2|2020-05-07|2020-05-07 19:42:...|      2020-06-01|
|  3|2020-05-07|2020-05-07 19:42:...|      2020-06-01|
|  4|2020-05-07|2020-05-07 19:42:...|      2020-06-01|
+---+----------+--------------------+----------------+
only showing top 5 rows



### Working with nulls

In [8]:
df = df_auto.withColumn('horsepower', func.expr('cast(horsepower as float)'))

In [9]:
df.where('horsepower is null').show()

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|25.0|        4|        98.0|      null|  2046|        19.0|  71|     1|          ford pinto|
|21.0|        6|       200.0|      null|  2875|        17.0|  74|     1|       ford maverick|
|40.9|        4|        85.0|      null|  1835|        17.3|  80|     2|renault lecar deluxe|
|23.6|        4|       140.0|      null|  2905|        14.3|  80|     1|  ford mustang cobra|
|34.5|        4|       100.0|      null|  2320|        15.8|  81|     2|         renault 18i|
+----+---------+------------+----------+------+------------+----+------+--------------------+



In [10]:
df.count()

397

In [11]:
# Remove rows where any columns has a null value
df.dropna('any').count()

392

In [12]:
# Remove rows where all columns are null
df.dropna('all').count()

397

In [13]:
# Remove rows where certain columns are null
df.dropna('all', subset=['horsepower', 'name']).count()

397

In [14]:
# Fill with column defaults
col_defaults = {
    'cylinders': 4,
    'horsepower': 0
}
df.fillna(col_defaults).where('horsepower == 0').show()

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|25.0|        4|        98.0|       0.0|  2046|        19.0|  71|     1|          ford pinto|
|21.0|        6|       200.0|       0.0|  2875|        17.0|  74|     1|       ford maverick|
|40.9|        4|        85.0|       0.0|  1835|        17.3|  80|     2|renault lecar deluxe|
|23.6|        4|       140.0|       0.0|  2905|        14.3|  80|     1|  ford mustang cobra|
|34.5|        4|       100.0|       0.0|  2320|        15.8|  81|     2|         renault 18i|
+----+---------+------------+----------+------+------------+----+------+--------------------+



### Complex Type: Struct

Structs are like DataFrames within DataFrames

In [20]:
# Create struct of mpg and horsepower
df_auto.select('name', func.struct('mpg', 'horsepower').alias('data')).show(5)

+--------------------+-----------+
|                name|       data|
+--------------------+-----------+
|chevrolet chevell...|[18.0, 130]|
|   buick skylark 320|[15.0, 165]|
|  plymouth satellite|[18.0, 150]|
|       amc rebel sst|[16.0, 150]|
|         ford torino|[17.0, 140]|
+--------------------+-----------+
only showing top 5 rows



In [25]:
# Create struct in sql with ()
df = df_auto.selectExpr('name', '(mpg, horsepower) as data')
df.show(5)

+--------------------+-----------+
|                name|       data|
+--------------------+-----------+
|chevrolet chevell...|[18.0, 130]|
|   buick skylark 320|[15.0, 165]|
|  plymouth satellite|[18.0, 150]|
|       amc rebel sst|[16.0, 150]|
|         ford torino|[17.0, 140]|
+--------------------+-----------+
only showing top 5 rows



In [27]:
# Select from struct
df.select('name', 'data.mpg').show(5)

+--------------------+----+
|                name| mpg|
+--------------------+----+
|chevrolet chevell...|18.0|
|   buick skylark 320|15.0|
|  plymouth satellite|18.0|
|       amc rebel sst|16.0|
|         ford torino|17.0|
+--------------------+----+
only showing top 5 rows



### Complex Type: Array

In [36]:
# Example
df = spark.createDataFrame([
    ('Joe', 'Python, Spark, Kafka'),
    ('Henry', 'React, Redux, JavaScript')],
    ['Name', 'Competence']
)
df.show()

+-----+--------------------+
| Name|          Competence|
+-----+--------------------+
|  Joe|Python, Spark, Kafka|
|Henry|React, Redux, Jav...|
+-----+--------------------+



In [41]:
# Split field and gather in array
df = df.selectExpr("Name", "split(Competence, ',') as Competence")
df.show(5)

+-----+--------------------+
| Name|          Competence|
+-----+--------------------+
|  Joe|[Python,  Spark, ...|
|Henry|[React,  Redux,  ...|
+-----+--------------------+



In [44]:
# Select array item
df.selectExpr("Name", "Competence[0]").show(5)

+-----+-------------+
| Name|Competence[0]|
+-----+-------------+
|  Joe|       Python|
|Henry|        React|
+-----+-------------+



In [46]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Competence: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [57]:
# Check if array contains item
df.selectExpr("Name", "array_contains(Competence, 'Python') as Python_Dev").show(5)

+-----+----------+
| Name|Python_Dev|
+-----+----------+
|  Joe|      true|
|Henry|     false|
+-----+----------+



In [59]:
# Same
df.select("Name", func.array_contains("Competence", "Python").alias("Python_Dev")).show(5)

+-----+----------+
| Name|Python_Dev|
+-----+----------+
|  Joe|      true|
|Henry|     false|
+-----+----------+



In [60]:
# Explode Array into multiple rows
df.withColumn('Competence', func.explode('Competence')).show(5)

+-----+----------+
| Name|Competence|
+-----+----------+
|  Joe|    Python|
|  Joe|     Spark|
|  Joe|     Kafka|
|Henry|     React|
|Henry|     Redux|
+-----+----------+
only showing top 5 rows



### Complex Types: Maps

In [62]:
# Create map in sql
df = df_auto.selectExpr('name', 'map(year, mpg) as data')
df.show(5)

+--------------------+------------+
|                name|        data|
+--------------------+------------+
|chevrolet chevell...|[70 -> 18.0]|
|   buick skylark 320|[70 -> 15.0]|
|  plymouth satellite|[70 -> 18.0]|
|       amc rebel sst|[70 -> 16.0]|
|         ford torino|[70 -> 17.0]|
+--------------------+------------+
only showing top 5 rows



### JSON Datatype

In [81]:
# Create column with JSON datatype
df = spark.range(1).selectExpr("""
    '{"prop1": "value1", "prop2": [1, 2, 3]}' as json_string
""");
df.show()

+--------------------+
|         json_string|
+--------------------+
|{"prop1": "value1...|
+--------------------+



In [88]:
# Select value from json
df.select(func.get_json_object('json_string', '$.prop2[0]').alias('value')).show()

+-----+
|value|
+-----+
|    1|
+-----+



In [90]:
# Select value from json (SQL like syntax)
df.selectExpr("get_json_object(json_string, '$.prop2[0]') as value").show()

+-----+
|value|
+-----+
|    1|
+-----+

