In [None]:
!pip install pyspark --quiet

[K     |████████████████████████████████| 281.4 MB 47 kB/s 
[K     |████████████████████████████████| 198 kB 71.3 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Obtain the SparkSession for this notebook (named "Example-3_6"); every
# DataFrame below is created through it.
spark = SparkSession.builder.appName("Example-3_6").getOrCreate()

# Two ways to define a schema: programmatically with `StructType`, or with a DDL string

In [None]:
from pyspark.sql.types import * 
# Sample book records. Each row is [author, title, pages, link],
# where `link` is itself a list of integers.
data = [
    ["author1", "title1", 500, [1, 2, 3]],
    ["author2", "title2", 400, [3, 2, 1]],
    ["author3", "title3", 300, [200, 100]],
]

In [None]:
# Way 1: build the schema programmatically.
# The third argument of each add() is `nullable`; every column here is
# declared non-nullable.
schema1 = (
    StructType()
    .add("author", StringType(), False)
    .add("title", StringType(), False)
    .add("pages", IntegerType(), False)
    .add("link", ArrayType(IntegerType()), False)
)

In [None]:
# Create a DataFrame from the in-memory rows using the programmatic schema,
# then display its contents.
books_df = spark.createDataFrame(data, schema=schema1)
books_df.show()

+-------+------+-----+----------+
| author| title|pages|      link|
+-------+------+-----+----------+
|author1|title1|  500| [1, 2, 3]|
|author2|title2|  400| [3, 2, 1]|
|author3|title3|  300|[200, 100]|
+-------+------+-----+----------+



In [None]:
# Way 2: describe the same four columns with a DDL-formatted string.
schema2 = "author STRING, title STRING, pages INT, link ARRAY<INT>"

# Build a second DataFrame from the same rows, this time with the DDL schema.
books_df1 = spark.createDataFrame(data, schema=schema2)
books_df1.show()

+-------+------+-----+----------+
| author| title|pages|      link|
+-------+------+-----+----------+
|author1|title1|  500| [1, 2, 3]|
|author2|title2|  400| [3, 2, 1]|
|author3|title3|  300|[200, 100]|
+-------+------+-----+----------+



In [None]:
 books_df1.printSchema()

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- pages: integer (nullable = true)
 |-- link: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [None]:
# Three equivalent ways to express "pages * 2" as a column:
#   1. expr() on the column name, multiplied in Python
#   2. col() on the column name, multiplied in Python
#   3. expr() parsing the whole SQL expression
# Each one is selected in turn and the first two rows are shown.
for doubled_pages in (expr("pages") * 2, col("pages") * 2, expr("pages * 2")):
    books_df1.select(doubled_pages).show(2)

+-----------+
|(pages * 2)|
+-----------+
|       1000|
|        800|
+-----------+
only showing top 2 rows

+-----------+
|(pages * 2)|
+-----------+
|       1000|
|        800|
+-----------+
only showing top 2 rows

+-----------+
|(pages * 2)|
+-----------+
|       1000|
|        800|
+-----------+
only showing top 2 rows



In [None]:
# Show big books: add a boolean "Big Books" column that is true when pages > 400.
(books_df1
    .withColumn("Big Books", expr("pages > 400"))
    .show())

# The result of withColumn is not assigned, so books_df1 itself is unchanged
# and the schema printed here does not contain a "Big Books" column.
print(books_df1.schema)

+-------+------+-----+----------+---------+
| author| title|pages|      link|Big Books|
+-------+------+-----+----------+---------+
|author1|title1|  500| [1, 2, 3]|     true|
|author2|title2|  400| [3, 2, 1]|    false|
|author3|title3|  300|[200, 100]|    false|
+-------+------+-----+----------+---------+

StructType(List(StructField(author,StringType,true),StructField(title,StringType,true),StructField(pages,IntegerType,true),StructField(link,ArrayType(IntegerType,true),true)))


In [None]:
from pyspark.sql import Row
# Build positional Row objects (no field names); the column names and types
# are supplied by the DDL schema when the DataFrame is created.
rows = [
    Row(*record)
    for record in (
        ("author4", "title4", 500, [1, 2, 3]),
        ("author5", "title5", 300, [1, 2, 3]),
        ("author6", "title6", 200, [1, 2]),
    )
]

# NOTE: this rebinds books_df, replacing the DataFrame built earlier from `data`.
books_df = spark.createDataFrame(rows, schema=schema2)
books_df.show()