In [65]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, expr, concat


In [26]:
# Programmatic schema for the blog records. Every field is declared
# non-nullable; the (name, type) pairs are listed in column order.
schema = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in (
        ('id', IntegerType()),
        ('First', StringType()),
        ('Last', StringType()),
        ('Url', StringType()),
        ('Published', StringType()),
        ('Hits', IntegerType()),
        ('Campaigns', ArrayType(StringType())),
    )
])

In [27]:
# DDL-string form of the blog schema, one column per literal fragment.
# Note: unlike `schema`, the names `LAST` and `URL` are spelled in upper
# case here, and no nullability is declared for any column.
schema_2 = (
    "`id` INT, "
    "`First` STRING, "
    "`LAST` STRING, "
    "`URL` STRING, "
    "`Published` STRING, "
    "`Hits` INT, "
    "`Campaigns` ARRAY<STRING>"
)

In [28]:
# Six sample blog records. Each row is positional and lines up with the
# schema columns: id, First, Last, Url, Published, Hits, Campaigns.
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]

In [29]:
# Obtain a SparkSession named 'Schema_1' via the builder's getOrCreate().
spark = SparkSession.builder.appName('Schema_1').getOrCreate()

In [30]:
# Build the blogs DataFrame from the in-memory rows using the
# programmatic StructType schema.
blogs_df = spark.createDataFrame(data=data, schema=schema)

In [47]:
# Print the schema tree of blogs_df. Every field reports nullable = false,
# matching the False flag passed to each StructField in `schema`.
blogs_df.printSchema()

root
 |-- id: integer (nullable = false)
 |-- First: string (nullable = false)
 |-- Last: string (nullable = false)
 |-- Url: string (nullable = false)
 |-- Published: string (nullable = false)
 |-- Hits: integer (nullable = false)
 |-- Campaigns: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [31]:
# Return all rows of blogs_df as a Python list of Row objects
# (the sample data holds only six rows).
blogs_df.collect()

[Row(id=1, First='Jules', Last='Damji', Url='https://tinyurl.1', Published='1/4/2016', Hits=4535, Campaigns=['twitter', 'LinkedIn']),
 Row(id=2, First='Brooke', Last='Wenig', Url='https://tinyurl.2', Published='5/5/2018', Hits=8908, Campaigns=['twitter', 'LinkedIn']),
 Row(id=3, First='Denny', Last='Lee', Url='https://tinyurl.3', Published='6/7/2019', Hits=7659, Campaigns=['web', 'twitter', 'FB', 'LinkedIn']),
 Row(id=4, First='Tathagata', Last='Das', Url='https://tinyurl.4', Published='5/12/2018', Hits=10568, Campaigns=['twitter', 'FB']),
 Row(id=5, First='Matei', Last='Zaharia', Url='https://tinyurl.5', Published='5/14/2014', Hits=40578, Campaigns=['web', 'twitter', 'FB', 'LinkedIn']),
 Row(id=6, First='Reynold', Last='Xin', Url='https://tinyurl.6', Published='3/2/2015', Hits=25568, Campaigns=['twitter', 'LinkedIn'])]

In [32]:
# Same rows as blogs_df, but with the schema supplied as the DDL string.
blogs_2_df = spark.createDataFrame(data=data, schema=schema_2)

In [48]:
# Print the schema derived from the DDL string: the types match `schema`,
# but every field reports nullable = true and the LAST / URL column names
# keep the upper-case spelling used in `schema_2`.
blogs_2_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- LAST: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [33]:
# Return all rows of blogs_2_df as a list of Row objects; the Row field
# names follow the DDL string (LAST, URL) rather than `schema`.
blogs_2_df.collect()

[Row(id=1, First='Jules', LAST='Damji', URL='https://tinyurl.1', Published='1/4/2016', Hits=4535, Campaigns=['twitter', 'LinkedIn']),
 Row(id=2, First='Brooke', LAST='Wenig', URL='https://tinyurl.2', Published='5/5/2018', Hits=8908, Campaigns=['twitter', 'LinkedIn']),
 Row(id=3, First='Denny', LAST='Lee', URL='https://tinyurl.3', Published='6/7/2019', Hits=7659, Campaigns=['web', 'twitter', 'FB', 'LinkedIn']),
 Row(id=4, First='Tathagata', LAST='Das', URL='https://tinyurl.4', Published='5/12/2018', Hits=10568, Campaigns=['twitter', 'FB']),
 Row(id=5, First='Matei', LAST='Zaharia', URL='https://tinyurl.5', Published='5/14/2014', Hits=40578, Campaigns=['web', 'twitter', 'FB', 'LinkedIn']),
 Row(id=6, First='Reynold', LAST='Xin', URL='https://tinyurl.6', Published='3/2/2015', Hits=25568, Campaigns=['twitter', 'LinkedIn'])]

In [34]:
# Column names of blogs_df as a plain Python list, in schema order.
blogs_df.columns

['id', 'First', 'Last', 'Url', 'Published', 'Hits', 'Campaigns']

In [44]:
# Indexing a DataFrame by column name yields a Column object, not the
# column's values.
blogs_df['id']

Column<'id'>

In [54]:
# Double the Hits column: col('Hits') gives a Column, which is then
# multiplied on the Python side.
blogs_df.select(col('Hits') * 2).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [52]:
# Same computation using expr('Hits') to obtain the Column; the
# multiplication is still done in Python and the output is identical
# to the col('Hits') * 2 form.
blogs_df.select(expr('Hits') * 2).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [55]:
# Here the whole arithmetic expression is given to expr() as a string;
# the resulting column name and values match col('Hits') * 2.
blogs_df.select(expr('Hits * 2')).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [58]:
# Append a boolean 'Big Hitters' column that is true where Hits exceeds
# 10000, then display the widened DataFrame.
(
    blogs_df
    .withColumn('Big Hitters', expr('Hits > 10000'))
    .show()
)

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| id|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [60]:
# Derive an 'AuthorsId' key by concatenating First, Last and the id
# column, then display only that derived column.
(
    blogs_df
    .withColumn('AuthorsId', concat(expr('First'), expr('Last'), expr('Id')))
    .select(col('AuthorsId'))
    .show()
)

+-------------+
|    AuthorsId|
+-------------+
|  JulesDamji1|
| BrookeWenig2|
|    DennyLee3|
|TathagataDas4|
|MateiZaharia5|
|  ReynoldXin6|
+-------------+



In [62]:
# Sort by id in descending order using a col() reference. 'Id' resolves
# to the `id` column here despite the different capitalisation, as the
# output (ids 6 down to 1) shows.
blogs_df.sort(col('Id').desc()).show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [64]:
# Same descending sort on id, with the column obtained through expr()
# instead of col(); the output is identical.
blogs_df.sort(expr('Id').desc()).show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [66]:
# Small author/state DataFrame built from positional Row objects; the
# column names come from the list passed as the schema.
rows = [
    Row("Matei Zaharia", "CA"),
    Row("Reynold Xin", "CA"),
]
authors_df = spark.createDataFrame(data=rows, schema=['Author', 'State'])

In [67]:
# Display the two-row authors DataFrame with its Author / State columns.
authors_df.show()

+-------------+-----+
|       Author|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+

