In [19]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [20]:
# Programmatic schema: each add() call appends one field as
# (name, data type, nullable); every field here is declared non-nullable.
schema = (
    StructType()
    .add("Id", IntegerType(), False)
    .add("First", StringType(), False)
    .add("Last", StringType(), False)
    .add("Url", StringType(), False)
    .add("Published", StringType(), False)
    .add("Hits", IntegerType(), False)
    .add("Campaigns", ArrayType(StringType()), False)
)


We can also define a schema using a **Data Definition Language** (DDL) string, as shown below.

In [21]:
# Column names and types written as a single DDL string. Adjacent string
# literals are concatenated by Python, so no line-continuation backslash is needed.
schema_DDL = (
    "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, "
    "`Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"
)

Create sample data.

In [22]:
# Sample rows; values are ordered as
# Id, First, Last, Url, Published, Hits, Campaigns.
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]

Create a Spark Session

In [23]:
# Obtain a SparkSession named "DF Schema example" via the builder's getOrCreate().
spark = SparkSession.builder.appName("DF Schema example").getOrCreate()

23/11/14 12:14:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/11/14 12:14:47 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/11/14 12:14:47 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


Create a DataFrame from the sample data using the DDL schema string (`schema_DDL`) defined above


In [24]:
# Build the DataFrame from the sample rows, typed by the DDL schema string.
blogs_df = spark.createDataFrame(
    data=data,
    schema=schema_DDL,
)

Print the names of all columns

In [25]:
# DataFrame.columns holds the column names as a plain Python list of strings.
print(blogs_df.columns)

['Id', 'First', 'Last', 'Url', 'Published', 'Hits', 'Campaigns']


In [28]:
# Look up one field of the schema by name and print its Spark data type.
print(blogs_df.schema["Campaigns"].dataType)

ArrayType(StringType(), True)


In [31]:
# Compute Hits * 2 from a SQL expression string and preview two rows.
(
    blogs_df
    .select(expr("Hits * 2"))
    .show(2)
)

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
+----------+
only showing top 2 rows



In [34]:
# Same computation using a Column object and Python arithmetic; preview two rows.
(
    blogs_df
    .select(col("Hits") * 2)
    .show(2)
)

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
+----------+
only showing top 2 rows



## Use an expression to compute big hitters for blogs

This adds a new boolean column based on the conditional expression `Hits > 5000`.

In [36]:
# Append a "Big Hitters" column holding the result of Hits > 5000, then display.
(
    blogs_df
    .withColumn("Big Hitters", expr("Hits > 5000"))
    .show()
)

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|       true|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|       true|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



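Concatenate three columns into a new column and show only that newly created column. Because `show()` is called without the optional row count, all six rows are displayed (by default `show()` prints up to 20 rows); pass a number, e.g. `show(4)`, to limit the output.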

In [38]:
# Concatenate First, Last and Id into "AuthorsId", keep only that column, display it.
(
    blogs_df
    .withColumn("AuthorsId", concat(expr("First"), expr("Last"), expr("Id")))
    .select(col("AuthorsId"))
    .show()
)

+-------------+
|    AuthorsId|
+-------------+
|  JulesDamji1|
| BrookeWenig2|
|    DennyLee3|
|TathagataDas4|
|MateiZaharia5|
|  ReynoldXin6|
+-------------+



The next three statements return the same result, showing that `expr("Hits")`, `col("Hits")`, and the plain column name `"Hits"` are interchangeable when selecting an existing column.

In [39]:
# Select the Hits column through expr(); preview two rows.
(
    blogs_df
    .select(expr("Hits"))
    .show(2)
)

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows



In [40]:
# Select the Hits column through col(); preview two rows.
(
    blogs_df
    .select(col("Hits"))
    .show(2)
)

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows



In [41]:
# Select the Hits column by its name string; preview two rows.
(
    blogs_df
    .select("Hits")
    .show(2)
)

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows



In [42]:
# Display every row ordered by Id, highest first.
(
    blogs_df
    .sort(col("Id").desc())
    .show()
)

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [43]:
# Stop the SparkSession now that the examples are finished.
spark.stop()