## Overview of basic datatypes in Spark

In [27]:
# Import libraries
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
import datetime

In [2]:
# Initiate spark session
spark = SparkSession \
        .builder \
        .appName('SparkDataTypes') \
        .getOrCreate()

In [16]:
users = [
            {
                "id": 1,
                "first_name": "Pheobe",
                "last_name": "Buffay",
                "email": "pheobebuffay@abc.com",
                "is_customer": True,
                "amount_paid": 1000.55,
                "customer_from": datetime.date(2021, 1, 13),
                "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
            },
            {
                "id": 2,
                "first_name": "Joey",
                "last_name": "Tribbiani",
                "email": "joey@abc.com",
                "is_customer": True,
                "amount_paid": 900.0,
                "customer_from": datetime.date(2021, 2, 14),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 3,
                "first_name": "Monica",
                "last_name": "Geller",
                "email": "monica@abc.com",
                "is_customer": True,
                "amount_paid": 1000.90,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 28, 7, 33, 0)
            },
            {
                "id": 4,
                "first_name": "Ross",
                "last_name": "Geller",
                "email": "ross@abc.com",
                "is_customer": True,
                "amount_paid": 1200.55,
                "customer_from": datetime.date(2021, 1, 19),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0)
            },
            {
                "id": 5,
                "first_name": "Rachel",
                "last_name": "Green",
                "email": "rachel@abc.com",
                "is_customer": True,
                "amount_paid": None,
                "customer_from": datetime.date(2021, 2, 24),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 6,
                "first_name": "Chandler",
                "last_name": "Bing",
                "email": "bing@abc.com",
                "is_customer": True,
                "amount_paid": 1000.80,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 25, 7, 33, 0)
            }
        ]

In [17]:
# Data type automatically get derived even if we do not specify 
user_df = spark.createDataFrame([Row(**user) for user in users])

In [18]:
user_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



In [19]:
user_df.show()

+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|        ross@abc.com|       true|    1200.55|   2021-01-19|2021-02-18 01:10:00|
|  5|    Rachel|    Green|      rachel@abc.com|       true|       null|   2021-02-24|2021-02-18 03:33:00|
|  6|  Chandler|     Bing|        bing@abc.com|       true|     1000.8|   2021-02-22|2021-02-25 07:33:00|
+---+----------+---------+--------------------

In [15]:
# Create dataframe from list of dict using custom name and datatype
spark.createDataFrame([Row(*user.values()) for user in users], 'id int, f_name string, l_name string, email string, is_cust boolean, amt_paid float, cust_frm date, last_datetime timestamp')

DataFrame[id: int, f_name: string, l_name: string, email: string, is_cust: boolean, amt_paid: float, cust_frm: date, last_datetime: timestamp]

### Specify schema for spark dataframe using string

In [21]:
users = [
            (
                 1,
                 "Pheobe",
                 "Buffay",
                 "pheobebuffay@abc.com",
                 True,
                 1000.55,
                 datetime.date(2021, 1, 13),
                 datetime.datetime(2021, 2, 10, 1, 15, 0)
            ),
            (
                 2,
                 "Joey",
                 "Tribbiani",
                 "joey@abc.com",
                 True,
                 900.0,
                 datetime.date(2021, 2, 14),
                 datetime.datetime(2021, 2, 18, 3, 33, 0)
            ),
            (
                 3,
                 "Monica",
                 "Geller",
                 "monica@abc.com",
                 True,
                 1000.90,
                 datetime.date(2021, 2, 22),
                 datetime.datetime(2021, 2, 28, 7, 33, 0)
            ),
            (
                 4,
                 "Ross",
                 "Geller",
                 "ross@abc.com",
                 True,
                 1200.55,
                 datetime.date(2021, 1, 19),
                 datetime.datetime(2021, 2, 18, 1, 10, 0)
            ),
            (
                 5,
                 "Rachel",
                 "Green",
                 "rachel@abc.com",
                 True,
                 None,
                 datetime.date(2021, 2, 24),
                 datetime.datetime(2021, 2, 18, 3, 33, 0)
            ),
            (
                 6,
                 "Chandler",
                 "Bing",
                 "bing@abc.com",
                 True,
                 1000.80,
                 datetime.date(2021, 2, 22),
                 datetime.datetime(2021, 2, 25, 7, 33, 0)
            )
        ]

In [22]:
user_schema = """
              id INT,
              first_name STRING,
              last_name STRING,
              email STRING,
              is_customer BOOLEAN,
              amount_paid FLOAT,
              customer_from DATE,
              last_updated_ts TIMESTAMP
              """

In [24]:
spark.createDataFrame(users, schema=user_schema).show()

+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|        ross@abc.com|       true|    1200.55|   2021-01-19|2021-02-18 01:10:00|
|  5|    Rachel|    Green|      rachel@abc.com|       true|       null|   2021-02-24|2021-02-18 03:33:00|
|  6|  Chandler|     Bing|        bing@abc.com|       true|     1000.8|   2021-02-22|2021-02-25 07:33:00|
+---+----------+---------+--------------------

### Specify schema for spark dataframe using list

In [25]:
user_schema = [
              'id INT',
              'first_name STRING',
              'last_name STRING',
              'email STRING',
              'is_customer BOOLEAN',
              'amount_paid FLOAT',
              'customer_from DATE',
              'last_updated_ts TIMESTAMP'
            ]

In [26]:
spark.createDataFrame(users, schema=user_schema).show()

+------+-----------------+----------------+--------------------+-------------------+-----------------+------------------+-------------------------+
|id INT|first_name STRING|last_name STRING|        email STRING|is_customer BOOLEAN|amount_paid FLOAT|customer_from DATE|last_updated_ts TIMESTAMP|
+------+-----------------+----------------+--------------------+-------------------+-----------------+------------------+-------------------------+
|     1|           Pheobe|          Buffay|pheobebuffay@abc.com|               true|          1000.55|        2021-01-13|      2021-02-10 01:15:00|
|     2|             Joey|       Tribbiani|        joey@abc.com|               true|            900.0|        2021-02-14|      2021-02-18 03:33:00|
|     3|           Monica|          Geller|      monica@abc.com|               true|           1000.9|        2021-02-22|      2021-02-28 07:33:00|
|     4|             Ross|          Geller|        ross@abc.com|               true|          1200.55|        20

### Specify schema for spark dataframe using spark types

In [29]:
user_schema = StructType([
                  StructField('id', IntegerType()),
                  StructField('first_name', StringType()),
                  StructField('last_name', StringType()),
                  StructField('email', StringType()),
                  StructField('is_customer', BooleanType()),
                  StructField('amount_paid', FloatType()),
                  StructField('customer_from', DateType()),
                  StructField('last_updated_ts', TimestampType()),
              ])

In [30]:
spark.createDataFrame(users, schema=user_schema)

DataFrame[id: int, first_name: string, last_name: string, email: string, is_customer: boolean, amount_paid: float, customer_from: date, last_updated_ts: timestamp]

In [32]:
type(user_schema)

pyspark.sql.types.StructType

In [31]:
help(spark.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data, schema=None, samplingRatio=None, verifySchema=True) method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.
    
    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.
    
    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of either :class:`Row`,
    :class:`namedtuple`, or :class:`dict`.
    
    When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
    the real data, or an exception will be thrown at runtime. If the given schema is not
    :class:`pyspark.sql.types.StructType`, it will be wrapped into a
    :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value".
    Each record will also be wrapped into a tu