# Chapter 5: Spark SQL and DataFrames: Interacting with External Data Sources

In [39]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col, pandas_udf

In [2]:
# Get (or create) the SparkSession that the rest of the notebook uses
spark = SparkSession.builder.appName("SparkSQLExampleApp").getOrCreate()

22/04/21 16:02:31 WARN Utils: Your hostname, Zipcoders-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.77 instead (on interface en0)
22/04/21 16:02:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/21 16:02:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark SQL UDFs

In [7]:
 # Create cubed function
def cubed(s):
    """Return the cube of ``s``, passing ``None`` through unchanged.

    When this function is used as a Spark SQL UDF, a NULL column value is
    handed to Python as ``None``; multiplying ``None`` would raise
    ``TypeError`` inside the UDF, so the null check is done here.

    Args:
        s: A number (or any value supporting ``*``), or ``None``.

    Returns:
        ``s * s * s``, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        return None
    return s * s * s

In [8]:
# Expose the Python function to Spark SQL under the name "cubed";
# its return values are declared as LongType
spark.udf.register(name="cubed", f=cubed, returnType=LongType())

22/04/21 16:03:04 WARN SimpleFunctionRegistry: The function cubed replaced a previously registered function.


<function __main__.cubed(s)>

In [9]:
# Build the temporary view "udf_test" with a single id column (1 through 8)
(spark
    .range(start=1, end=9)
    .createOrReplaceTempView("udf_test"))

In [10]:
# Apply the registered cubed UDF to every id through Spark SQL
(spark
    .sql("SELECT id, cubed(id) AS id_cubed FROM udf_test")
    .show())

+---+--------+
| id|id_cubed|
+---+--------+
|  1|       1|
|  2|       8|
|  3|      27|
|  4|      64|
|  5|     125|
|  6|     216|
|  7|     343|
|  8|     512|
+---+--------+



                                                                                

In [27]:
# scalar Pandas UDF for Spark 3.0
def cubed(a: pd.Series) -> pd.Series:
    """Cube every element of ``a`` by multiplying the Series with itself.

    Note: this reuses the name ``cubed``, so once this cell runs it replaces
    the earlier scalar ``cubed(s)`` in the notebook namespace.
    """
    a_squared = a * a
    return a_squared * a

In [34]:
# Create the pandas UDF for the cubed function
# cubed_udf = pandas_udf(cubed, returnType=LongType())

In [29]:
# Small local pandas Series to try the function on
x = pd.Series(data=[1, 2, 3])

In [30]:
# Call the pandas_udf candidate function directly on local pandas data
# (plain Python call, no Spark involved) and print the result
print(cubed(x))

0     1
1     8
2    27
dtype: int64


In [35]:
# Create a Spark DataFrame, 'spark' is an existing SparkSession
#df = spark.range(1, 4)

In [36]:
# Execute function as a Spark vectorized UDF
#df.select("id", cubed_udf(col("id"))).show()

In [None]:
# Higher-Order Functions

In [40]:
# Schema: one column "celsius" whose values are arrays of integers
schema = StructType([
    StructField(name="celsius", dataType=ArrayType(elementType=IntegerType())),
])

In [41]:
# Two rows, each a one-element list whose single value is the "celsius" array
t_list = (
    [[35, 36, 32, 30, 40, 42, 38]],
    [[31, 32, 34, 55, 56]],
)

In [42]:
# Build the DataFrame from the local rows using the explicit schema
t_c = spark.createDataFrame(data=t_list, schema=schema)

In [43]:
# Register the DataFrame as temp view "tC" so the SQL queries below can read it
t_c.createOrReplaceTempView("tC")

In [44]:
# Show the DataFrame (long array values are cut off with "..." in the output)
t_c.show()

[Stage 2:>                                                          (0 + 1) / 1]                                                                                

+--------------------+
|             celsius|
+--------------------+
|[35, 36, 32, 30, ...|
|[31, 32, 34, 55, 56]|
+--------------------+



transform()
The transform() function produces an array by applying a function to each element
of the input array (similar to a map() function):

In [45]:
# Convert each Celsius reading to Fahrenheit with a SQL lambda applied per element.
# `div` is integer division, so results are truncated (e.g. 36 -> 96, not 96.8).
spark.sql("""
SELECT celsius,
transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit
      FROM tC
""").show()

+--------------------+--------------------+
|             celsius|          fahrenheit|
+--------------------+--------------------+
|[35, 36, 32, 30, ...|[95, 96, 89, 86, ...|
|[31, 32, 34, 55, 56]|[87, 89, 93, 131,...|
+--------------------+--------------------+



filter()
The filter() function produces an array consisting of only the elements of the input
array for which the Boolean function is true: