In [None]:
!pip install pyspark --quiet

[K     |████████████████████████████████| 281.4 MB 44 kB/s 
[K     |████████████████████████████████| 198 kB 70.3 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Pyspark-2")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import *

In [None]:
# Programmatic schema for the books data: every column is declared
# non-nullable, and `links` is an array of integers.
schema = StructType(
    [
        StructField("author", StringType(), nullable=False),
        StructField("title", StringType(), nullable=False),
        StructField("pages", IntegerType(), nullable=False),
        StructField("links", ArrayType(IntegerType()), nullable=False),
    ]
)

In [None]:
# Sample rows in schema order: author, title, pages, links.
data = [
    ["author1", "title1", 450, [10, 2, 5]],
    ["author2", "title2", 550, [1, 2, 5]],
    ["author3", "title3", 350, [8, 2, 7]],
    ["author1", "title4", 650, [5, 2, 50]],
    ["author4", "title4", 750, [2, 7, 9]],
]

In [None]:
# Build the DataFrame from the in-memory rows using the explicit schema,
# then print up to five rows.
books_df = spark.createDataFrame(data, schema=schema)
books_df.show(n=5)

+-------+------+-----+----------+
| author| title|pages|     links|
+-------+------+-----+----------+
|author1|title1|  450|[10, 2, 5]|
|author2|title2|  550| [1, 2, 5]|
|author3|title3|  350| [8, 2, 7]|
|author1|title4|  650|[5, 2, 50]|
|author4|title4|  750| [2, 7, 9]|
+-------+------+-----+----------+



In [None]:
# The same four columns, this time declared with a DDL-formatted string
# instead of a StructType object.
schema = "author STRING, title STRING, pages INT, links ARRAY<INT>"

books_df = spark.createDataFrame(data, schema=schema)
books_df.show(n=5)

+-------+------+-----+----------+
| author| title|pages|     links|
+-------+------+-----+----------+
|author1|title1|  450|[10, 2, 5]|
|author2|title2|  550| [1, 2, 5]|
|author3|title3|  350| [8, 2, 7]|
|author1|title4|  650|[5, 2, 50]|
|author4|title4|  750| [2, 7, 9]|
+-------+------+-----+----------+



In [None]:
# Register the DataFrame as a temporary view named "books_df" so that
# spark.sql() queries can refer to it by that name.
books_df.createOrReplaceTempView("books_df")

In [None]:
# Keep three columns and add a derived column, "calculated", evaluated from
# the SQL expression `pages * 2`; print the first three rows.
(
    books_df
    .select(
        "author",
        "title",
        "pages",
        expr("pages * 2").alias("calculated"),
    )
    .show(3)
)

+-------+------+-----+----------+
| author| title|pages|calculated|
+-------+------+-----+----------+
|author1|title1|  450|       900|
|author2|title2|  550|      1100|
|author3|title3|  350|       700|
+-------+------+-----+----------+
only showing top 3 rows



In [None]:
# Append a boolean column, "Big Books", evaluated from the SQL expression
# `pages > 500`, and print the result.
(
    books_df
    .withColumn("Big Books", expr("pages > 500"))
    .show()
)

+-------+------+-----+----------+---------+
| author| title|pages|     links|Big Books|
+-------+------+-----+----------+---------+
|author1|title1|  450|[10, 2, 5]|    false|
|author2|title2|  550| [1, 2, 5]|     true|
|author3|title3|  350| [8, 2, 7]|    false|
|author1|title4|  650|[5, 2, 50]|     true|
|author4|title4|  750| [2, 7, 9]|     true|
+-------+------+-----+----------+---------+



In [None]:
from pyspark.sql import Row

# Build a DataFrame from Row objects instead of plain lists.
rows = [
    Row("author-x", "title-x", 500, [1, 2, 3]),
    Row("author-y", "title-y", 520, [1, 5, 3]),
]

# Bind the result to its own name instead of reassigning `books_df`: the later
# `books_df.select(..., am_sum_udf(...))` cell records output for the five-row
# frame (author1..author4), which a top-to-bottom run would not reproduce if
# `books_df` were replaced by these two rows here.
books_from_rows_df = spark.createDataFrame(rows, schema)
books_from_rows_df.show()

# Creating a UDF

In [None]:
def am_sum(s):
    """Return ``int(s) + 2``, passing ``None`` through unchanged.

    Args:
        s: A value accepted by ``int()`` (e.g. an int or a numeric string),
            or ``None``.

    Returns:
        ``int(s) + 2`` as an ``int``, or ``None`` when ``s`` is ``None``.

    Raises:
        ValueError / TypeError: If ``s`` is not ``None`` and cannot be
            converted by ``int()``.
    """
    # A Python UDF receives SQL NULL as None; int(None) would raise TypeError,
    # so propagate the null instead.
    if s is None:
        return None
    return int(s) + 2

# Register the Python function under the SQL name "am_sum", declaring a
# LongType return type, so it can be called from spark.sql() queries.
spark.udf.register("am_sum",am_sum,LongType())

<function __main__.am_sum>

In [None]:
# Call the registered "am_sum" UDF from SQL against the "books_df" temp view,
# exposing the result as the column "calculated".
spark.sql("""
select author,title,pages,am_sum(pages) as calculated
from books_df
""").show()

+-------+------+-----+----------+
| author| title|pages|calculated|
+-------+------+-----+----------+
|author1|title1|  450|       452|
|author2|title2|  550|       552|
|author3|title3|  350|       352|
|author1|title4|  650|       652|
|author4|title4|  750|       752|
+-------+------+-----+----------+



# Pandas UDF

In [None]:
import pandas as pd
from pyspark.sql.functions import pandas_udf

In [None]:
def am_sum(s: pd.Series) -> pd.Series:
    """Add 2 to every element of a Series, returning 64-bit integers.

    Vectorised replacement for a per-element ``int(x) + 2``: the whole Series
    is cast and incremented in one operation instead of calling a Python
    lambda once per value.

    Args:
        s: Series of values convertible to integers.

    Returns:
        A Series with the same index and name as ``s`` and dtype ``int64``,
        where each element is the integer value of the input plus 2.

    Raises:
        ValueError / TypeError: If the Series contains values that cannot be
            cast to ``int64`` (for example NaN).
    """
    # Cast to int64 before adding so a narrower input dtype (e.g. int32)
    # cannot overflow and the result is always 64-bit.
    return s.astype("int64") + 2

# Wrap am_sum as a pandas UDF with a LongType return type so it can be
# applied to DataFrame columns.
am_sum_udf = pandas_udf(am_sum, returnType=LongType())

In [None]:
# Apply the pandas UDF to the `pages` column alongside three plain columns.
(
    books_df
    .select(
        "author",
        "title",
        "pages",
        am_sum_udf(col("pages")),
    )
    .show()
)

+-------+------+-----+-------------+
| author| title|pages|am_sum(pages)|
+-------+------+-----+-------------+
|author1|title1|  450|          452|
|author2|title2|  550|          552|
|author3|title3|  350|          352|
|author1|title4|  650|          652|
|author4|title4|  750|          752|
+-------+------+-----+-------------+



# Higher Order Functions - transform, filter, exists, reduce

In [None]:
# Sample rows in schema order: author, title, pages, links.
data = [
    ["author1", "title1", 450, [10, 2, 5]],
    ["author2", "title2", 550, [1, 2, 5]],
    ["author3", "title3", 350, [8, 2, 7]],
    ["author1", "title4", 650, [5, 2, 50]],
    ["author4", "title4", 750, [2, 7, 9]],
]

# DDL-formatted schema string; `links` is an array of integers.
schema = "author STRING, title STRING, pages INT, links ARRAY<INT>"

# Rebuild the five-row books DataFrame for the array examples and print it.
books_df = spark.createDataFrame(data, schema=schema)
books_df.show(n=5)

+-------+------+-----+----------+
| author| title|pages|     links|
+-------+------+-----+----------+
|author1|title1|  450|[10, 2, 5]|
|author2|title2|  550| [1, 2, 5]|
|author3|title3|  350| [8, 2, 7]|
|author1|title4|  650|[5, 2, 50]|
|author4|title4|  750| [2, 7, 9]|
+-------+------+-----+----------+



In [None]:
# (Re)register the DataFrame as the temporary view "books_df" so the SQL
# queries below run against it.
books_df.createOrReplaceTempView("books_df")

In [None]:
# Higher-order function `transform`: apply the lambda t -> t*2 to each
# element of the `links` array, producing a new array column.
spark.sql("""
select author,links, transform(links,t->(t*2)) as transform
from books_df
""").show()

+-------+----------+------------+
| author|     links|   transform|
+-------+----------+------------+
|author1|[10, 2, 5]| [20, 4, 10]|
|author2| [1, 2, 5]|  [2, 4, 10]|
|author3| [8, 2, 7]| [16, 4, 14]|
|author1|[5, 2, 50]|[10, 4, 100]|
|author4| [2, 7, 9]| [4, 14, 18]|
+-------+----------+------------+



In [None]:
# `explode` emits one output row per element of the `links` array; the
# element is exposed as the column `link`.
spark.sql("""
select author,links, explode(links) as link
from books_df
""").show()

+-------+----------+----+
| author|     links|link|
+-------+----------+----+
|author1|[10, 2, 5]|  10|
|author1|[10, 2, 5]|   2|
|author1|[10, 2, 5]|   5|
|author2| [1, 2, 5]|   1|
|author2| [1, 2, 5]|   2|
|author2| [1, 2, 5]|   5|
|author3| [8, 2, 7]|   8|
|author3| [8, 2, 7]|   2|
|author3| [8, 2, 7]|   7|
|author1|[5, 2, 50]|   5|
|author1|[5, 2, 50]|   2|
|author1|[5, 2, 50]|  50|
|author4| [2, 7, 9]|   2|
|author4| [2, 7, 9]|   7|
|author4| [2, 7, 9]|   9|
+-------+----------+----+

