<div style="font-size:18pt; padding-top:20px; text-align:center"><b>PySpark and </b> <span style="font-weight:bold; color:green">UDF</span></div><hr>
<div style="text-align:right;">Sergei Yu. Papulin <span style="font-style: italic;font-weight: bold;">(papulin_bmstu@mail.ru)</span></div>

<a name="0"></a>
<div><span style="font-size:14pt; font-weight:bold">Contents</span>
    <ol>
        <li><a href="#1">Initial dataset</a></li>
        <li><a href="#2">Python UDF</a></li>
        <li><a href="#3">Java UDF in Python</a></li>
        <li><a href="#4">Pure Dataframe</a></li>
        <li><a href="#5">Pandas UDF</a></li>
        <li><a href="#6">Sources</a></li>
    </ol>
</div>

<p>[OPTIONAL] <b>Environment Setup</b></p>

In [None]:
import os
import sys

In [None]:
# Root directory of the Spark 2 installation used by this notebook.
SPARK_HOME_DIR = "/opt/cloudera/parcels/SPARK2/lib/spark2"
os.environ["SPARK_HOME"] = SPARK_HOME_DIR

In [None]:
# Use one and the same interpreter for PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON.
python_bin = "/opt/rh/rh-python36/root/usr/bin/python"
for env_var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    os.environ[env_var] = python_bin

In [None]:
# Make the PySpark sources and the bundled py4j archive importable.
# Each entry is pushed to the front of sys.path, so the py4j archive ends up first.
spark_home = os.environ.get("SPARK_HOME")
for relative_path in ("python", "python/lib/py4j-0.10.7-src.zip"):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

<p>Configurations</p>

In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# The "yarn-client" master URL is deprecated since Spark 2.0: the master is "yarn"
# and the deploy mode is configured separately through "spark.submit.deployMode".
conf = (
    pyspark.SparkConf()
    .setAppName("udfApp")
    .setMaster("yarn")
    .set("spark.submit.deployMode", "client")
    # Jars listed here are added to the driver and executor classpaths.
    .set("spark.jars", "/home/cloudera/workspace/CLASS_UDF/lib/*")
    # Alternative: put the jars on the classpaths explicitly.
    #.set("spark.driver.extraClassPath", "/home/cloudera/workspace/CLASS_UDF/lib/*")
    #.set("spark.executor.extraClassPath", "/home/cloudera/workspace/CLASS_UDF/lib/*")
)

# Shell alternative (local mode):
#pyspark2 --master local[2] --driver-class-path /home/cloudera/workspace/CLASS_UDF/lib/*

<p>New versions (Spark 2+)</p>

In [None]:
# Create the session (or reuse a running one) with the settings held in `conf`.
session_builder = SparkSession.builder.appName("udfApp").config(conf=conf)
spark = session_builder.getOrCreate()

<p>Older versions</p>

In [None]:
from pyspark.sql import SQLContext

# SparkContext.getOrCreate() reuses a context that is already running in this
# process (for example the one behind a SparkSession) instead of failing with
# "Cannot run multiple SparkContexts at once"; `conf` is only used when a new
# context has to be created.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

<a name="1"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">1. Initial dataset</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

<p>Create initial dataset</p>

In [None]:
# One record per person; field order: Id, Name, City, Year, Grade, Gender.
persons = [
    [0, "Dima", "Moscow", 1988, 4, "m"],
    [1, "Sveta", "Kiev", 1999, 4, "f"],
    [2, "Alex", "Minsk", 1954, 8, "m"],
    [3, "Ivan", "St.Petersburg", 2005, 6, "m"],
    [4, "Kate", "London", 2001, 3, "f"],
    [5, "Maria", "New York", 1997, 7, "f"],
]
persons

<p>RDD API</p>

In [None]:
# Take the SparkContext from the session; the RDD examples below go through it.
sc = spark.sparkContext

In [None]:
# Turn the local list into an RDD, then collect it back to the driver to display it.
person_rdd = sc.parallelize(persons)
person_rdd.collect()

<p>Dataframe API</p>

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType
import pyspark.sql.functions as F
from pyspark.sql import Row

In [None]:
# Dataframe schema given as (column name, data type, nullable) triples;
# only Id is declared as non-nullable.
schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("Id", IntegerType(), False),
        ("Name", StringType(), True),
        ("City", StringType(), True),
        ("Year", IntegerType(), True),
        ("Grade", IntegerType(), True),
        ("Gender", StringType(), True),
    )
])

In [None]:
# Build the dataframe from the RDD using the explicit schema and print its rows.
person_df = spark.createDataFrame(person_rdd, schema)
person_df.show()

<a name="2"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">2. Python UDF</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

<p>Function for UDF to convert a numerical grade to a letter</p>

In [None]:
def conver2letter_grade(x):
    """Convert a numerical grade to a letter grade.

    Args:
        x: Numerical grade, or None when the grade is missing.

    Returns:
        "F" if x is less than 5, "A" otherwise; None if x is None.
    """
    # The Grade column is nullable and `None < 5` raises TypeError in Python 3,
    # so a missing grade is passed through as a missing letter.
    if x is None:
        return None
    return "F" if x < 5 else "A"

<p><b>RDD API</b></p>

In [None]:
# row[4] is the Grade field of each record; map it to its letter grade.
person_with_letter_rdd = person_rdd.map(lambda row: conver2letter_grade(row[4]))
person_with_letter_rdd.collect()

In [None]:
# PySpark's toDebugString() returns UTF-8 encoded bytes; decode and print them so
# the lineage is rendered as an indented tree instead of an escaped b'...' literal.
print(person_with_letter_rdd.toDebugString().decode("utf-8"))

<p><b>Dataframe API</b></p>

<p>Create a UDF for a dataframe</p>

In [None]:
# Pass the function itself: the extra lambda added nothing and made the UDF
# appear as "<lambda>" in query plans instead of under the function's name.
convert2letter_udf = F.udf(conver2letter_grade, StringType())

<p>Apply the UDF to our dataframe</p>

In [None]:
# Column produced by running the Python UDF over Grade.
letter_grade_col = convert2letter_udf(person_df["Grade"]).alias("LetterGrade")
person_with_letter_df = person_df.select(letter_grade_col)
# Print the physical plan first, then the resulting rows.
person_with_letter_df.explain()
person_with_letter_df.show()

<a name="3"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">3. Java UDF in Python</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

<p>Create a new Java project</p>

<p>Add the following Maven dependencies to your Java project</p>

In [None]:
org.apache.spark:spark-core_2.10
org.apache.spark:spark-sql_2.10

<p>Copy-paste the code of the custom udf</p>

In [None]:
package edu.spark.customsparkudf;

import org.apache.spark.sql.api.java.UDF1;

/**
 * Spark SQL UDF that maps a numerical grade to a letter grade:
 * values below 5 become "F", all other values become "A".
 */
public class CategorizeValue implements UDF1<Integer, String> {

    private static final long serialVersionUID = 1L;

    /**
     * Converts a numerical grade to a letter grade.
     *
     * @param value numerical grade; may be null when the grade is missing
     * @return "F" if value is less than 5, "A" otherwise, or null if value is null
     */
    @Override
    public String call(Integer value) throws Exception {
        // Comparing a null Integer would unbox it and throw NullPointerException,
        // so a missing grade is passed through as a missing letter.
        if (value == null) {
            return null;
        }
        return value < 5 ? "F" : "A";
    }

}

<p>Create a jar file</p>

<p>This jar file must be made available to all Spark nodes, either through the <code>spark.jars</code> setting as shown below or through the <code>--jars</code> argument of the <code>spark-submit</code> command</p>

In [None]:
# conf = pyspark.SparkConf() \
#         .setAppName("udfApp") \
#         .setMaster("yarn-client") \
#         .set("spark.jars", "/home/cloudera/workspace/CLASS_UDF/lib/*")

<p>Apply Java UDF inside Python</p>

In [None]:
# The jar with the UDF class has to be on the classpath, which is what "spark.jars"
# is for; "spark.sql.warehouse.dir" is the storage location of managed tables and
# must not point at the jar directory.
# NOTE: the setting only takes effect if no session is running yet; otherwise
# getOrCreate() returns the existing session with the jars it was started with.
spark = SparkSession.builder.config("spark.jars", "/home/cloudera/workspace/CLASS_UDF/lib/*").getOrCreate()

In [None]:
# Register the Java class as the SQL function "categorize" with a string return type.
spark.udf.registerJavaFunction("categorize", "edu.spark.customsparkudf.CategorizeValue", StringType())

In [None]:
# Call the registered Java UDF by its SQL name inside a SQL expression.
letter_grade_expr = F.expr("categorize(Grade) as LetterGrade")
person_with_letter_java_df = person_df.select(letter_grade_expr)
# Print the physical plan first, then the resulting rows.
person_with_letter_java_df.explain()
person_with_letter_java_df.show()

<a name="4"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">4. Pure Dataframe</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

<p>Create a SQL-like expression</p>

In [None]:
def conver2letter_grade_sql(col):
    """Build a column expression that maps a numerical grade to a letter.

    Args:
        col: Column holding the numerical grade.

    Returns:
        Column expression yielding "F" where `col < 5` holds and "A" otherwise.
    """
    is_failing = col < 5
    failing_branch = F.when(is_failing, "F")
    return failing_branch.otherwise("A")

<p>Apply the function</p>

In [None]:
# Column built only from native Spark SQL functions (no UDF involved).
letter_grade_col = conver2letter_grade_sql(F.col("Grade")).alias("LetterGrade")
person_with_letter_df = person_df.select(letter_grade_col)
# Print the physical plan first, then the resulting rows.
person_with_letter_df.explain()
person_with_letter_df.show()

<a name="5"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">5. Pandas UDF</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

In [None]:
# TODO

<p>Stop the Spark session (this also stops the underlying Spark context)</p>

In [None]:
# Shut down the Spark session used throughout the notebook.
spark.stop()

<a name="6"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">6. Sources</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>