In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/wsgen to provide /usr/bin/wsgen (wsgen) in auto mode
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/wsimport to provide /usr/bin/wsimport (wsimport) in auto mode
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/xjc to provide /usr/bin/xjc (xjc) in auto mode
Processing triggers for libc-bin (2.35-0ubuntu3.4) ...
/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link



In [2]:
import pyspark
import pyspark.sql  as pyspark_sql
import pyspark.sql.types as pyspark_types
import pyspark.sql.functions  as F
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import row_number, desc

# Spark configuration: serve the Spark web UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create the SparkContext, or reuse the one that is already running.
# Calling the SparkContext constructor a second time (e.g. when this cell is
# re-run) fails because only one context may be active per JVM; getOrCreate
# makes the cell safe to execute repeatedly.
sc = SparkContext.getOrCreate(conf=conf)
# Create (or reuse) the SparkSession on top of that context.
spark = pyspark_sql.SparkSession.builder.getOrCreate()

# User Defined Function (UDF) in PySpark



In [7]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

# Build a small in-memory employee table used by the UDF examples below.
# Each row is (name, age, city, salary), matching the order of `columns`.
columns = ['name', 'age', 'city', 'salary']

data = [
    ('John', 32, 'New York', 70000.0),
    ('Jane', 28, 'Los Angeles', 65000.0),
    ('Mike', 45, 'Chicago', 80000.0),
    ('Emily', 35, 'New York', 75000.0),
    ('David', 29, 'San Francisco', 90000.0),
    ('Sarah', 41, 'Chicago', 85000.0),
]

# Column types are inferred by Spark from the Python values.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+---+-------------+-------+
| name|age|         city| salary|
+-----+---+-------------+-------+
| John| 32|     New York|70000.0|
| Jane| 28|  Los Angeles|65000.0|
| Mike| 45|      Chicago|80000.0|
|Emily| 35|     New York|75000.0|
|David| 29|San Francisco|90000.0|
|Sarah| 41|      Chicago|85000.0|
+-----+---+-------------+-------+



In [8]:
# Create a UDF to calculate the tax based on the salary
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def calculate_tax(salary):
    """Return the tax owed on ``salary`` using a three-bracket flat rate.

    The whole salary is taxed at a single rate chosen by bracket:
    10% up to and including 50000, 15% up to and including 75000,
    and 20% above that.

    Args:
        salary: Numeric salary, or ``None`` when the source value is null.

    Returns:
        The tax amount as a float, or ``None`` if ``salary`` is ``None``.
    """
    # Spark passes SQL NULL to a Python UDF as None; comparing None with a
    # number raises TypeError, so propagate the null instead.
    if salary is None:
        return None
    if salary <= 50000:
        return salary * 0.1
    if salary <= 75000:
        return salary * 0.15
    return salary * 0.2

# Wrap the Python function as a Spark UDF that yields a double column.
calculate_tax_udf = udf(f=calculate_tax, returnType=DoubleType())

# Derive the 'tax' column from 'salary' and display the result.
df = df.withColumn(
    'tax',
    calculate_tax_udf('salary'),
)
df.show()

+-----+---+-------------+-------+-------+
| name|age|         city| salary|    tax|
+-----+---+-------------+-------+-------+
| John| 32|     New York|70000.0|10500.0|
| Jane| 28|  Los Angeles|65000.0| 9750.0|
| Mike| 45|      Chicago|80000.0|16000.0|
|Emily| 35|     New York|75000.0|11250.0|
|David| 29|San Francisco|90000.0|18000.0|
|Sarah| 41|      Chicago|85000.0|17000.0|
+-----+---+-------------+-------+-------+



In [5]:
# Using UDF with PySpark DataFrame withColumn()
def upperCase(str):
    """Return ``str`` converted to upper case.

    Args:
        str: The text to convert, or ``None`` when the source value is null.
            (The parameter name shadows the built-in ``str``; it is kept
            unchanged so existing keyword callers continue to work.)

    Returns:
        The upper-cased text, or ``None`` if the input is ``None``.
    """
    # A null column value arrives as None, which has no .upper(); pass it through.
    if str is None:
        return None
    return str.upper()

# Register upperCase as a string-returning UDF; the function is passed
# directly since it already takes exactly one argument.
upperCaseUDF = udf(upperCase, StringType())

# Add the upper-cased name as a new column and print it without truncation.
(
    df
    .withColumn("Cureated Name", upperCaseUDF(col("Name")))
    .show(truncate=False)
)

+-----+------------+-------------+
|Seqno|Name        |Cureated Name|
+-----+------------+-------------+
|1    |john jones  |JOHN JONES   |
|2    |tracey smith|TRACEY SMITH |
|3    |amy sanders |AMY SANDERS  |
+-----+------------+-------------+



In [9]:
# Create a UDF to capitalize the first letter of each name
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def capitalize_name(name):
    """Return ``name`` in title case (first letter of each word upper-cased).

    Args:
        name: The name to convert, or ``None`` when the source value is null.

    Returns:
        The title-cased name, or ``None`` if ``name`` is ``None``.
    """
    # A null column value arrives as None, which has no .title(); pass it through.
    if name is None:
        return None
    return name.title()

# Wrap the Python function as a Spark UDF that yields a string column.
capitalize_name_udf = udf(f=capitalize_name, returnType=StringType())

# Derive 'capitalized_name' from 'name' and display the result.
df = df.withColumn(
    'capitalized_name',
    capitalize_name_udf('name'),
)
df.show()

+-----+---+-------------+-------+-------+----------------+
| name|age|         city| salary|    tax|capitalized_name|
+-----+---+-------------+-------+-------+----------------+
| John| 32|     New York|70000.0|10500.0|            John|
| Jane| 28|  Los Angeles|65000.0| 9750.0|            Jane|
| Mike| 45|      Chicago|80000.0|16000.0|            Mike|
|Emily| 35|     New York|75000.0|11250.0|           Emily|
|David| 29|San Francisco|90000.0|18000.0|           David|
|Sarah| 41|      Chicago|85000.0|17000.0|           Sarah|
+-----+---+-------------+-------+-------+----------------+



In [10]:
# Create a UDF to calculate the age group based on the age
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def calculate_age_group(age):
    """Classify ``age`` into a coarse age group.

    Args:
        age: Numeric age, or ``None`` when the source value is null.

    Returns:
        ``'Young'`` for ages up to and including 30, ``'Adult'`` for ages
        above 30 up to and including 40, ``'Senior'`` otherwise, or ``None``
        if ``age`` is ``None``.
    """
    # Spark passes SQL NULL to a Python UDF as None; comparing None with a
    # number raises TypeError, so propagate the null instead.
    if age is None:
        return None
    if age <= 30:
        return 'Young'
    if age <= 40:
        return 'Adult'
    return 'Senior'

# Wrap the Python function as a Spark UDF that yields a string column.
calculate_age_group_udf = udf(f=calculate_age_group, returnType=StringType())

# Derive 'age_group' from 'age' and display the result.
df = df.withColumn(
    'age_group',
    calculate_age_group_udf('age'),
)
df.show()

+-----+---+-------------+-------+-------+----------------+---------+
| name|age|         city| salary|    tax|capitalized_name|age_group|
+-----+---+-------------+-------+-------+----------------+---------+
| John| 32|     New York|70000.0|10500.0|            John|    Adult|
| Jane| 28|  Los Angeles|65000.0| 9750.0|            Jane|    Young|
| Mike| 45|      Chicago|80000.0|16000.0|            Mike|   Senior|
|Emily| 35|     New York|75000.0|11250.0|           Emily|    Adult|
|David| 29|San Francisco|90000.0|18000.0|           David|    Young|
|Sarah| 41|      Chicago|85000.0|17000.0|           Sarah|   Senior|
+-----+---+-------------+-------+-------+----------------+---------+

