In [0]:
%run ../Includes/Copy-Datasets

Data catalog: workspace
Schema: bookstore_eng_pro


In [0]:
# Load the books_silver table as a DataFrame and preview its rows.
df_books = spark.table("books_silver")
display(df_books)

book_id,title,author,price,current,effective_date,end_date
B12,Big Data in Practice,Bernard Marr,30.0,True,2021-11-10T16:36:31.241Z,
B11,Business Intelligence for Dummies,Swain Scheps,38.0,True,2021-11-10T16:36:31.241Z,
B10,Beginning Database Design Solutions,Rod Stephens,44.0,True,2021-11-10T16:36:31.241Z,
B09,Advanced Data Structures,Peter Brass,24.0,True,2021-11-09T17:11:49.506Z,
B08,Quantum Computing for Everyone,Chris Bernhardt,41.0,True,2021-11-09T17:11:49.506Z,
B07,The Hundred-Page Machine Learning,Andriy Burkov,33.0,True,2021-11-09T17:11:49.506Z,
B06,Deep Learning with Python,François Chollet,22.0,True,2021-11-08T17:12:05.419Z,
B05,Fluent Python,Luciano Ramalho,47.0,True,2021-11-08T17:12:05.419Z,
B04,Robot Dynamics and Control,Mark W. Spong,20.0,True,2021-11-08T17:12:05.419Z,
B03,Make Your Own Neural Network,Tariq Rashid,35.0,True,2021-11-07T17:11:33.507Z,


In [0]:
def apply_discount(price, percentage):
    """Return ``price`` reduced by ``percentage`` percent.

    Args:
        price: Original price (numeric), or None.
        percentage: Discount expressed in percent (e.g. 20 for 20% off), or None.

    Returns:
        The discounted price as a float, or None when either input is None.
    """
    # When this function runs as a Spark UDF, NULL column values arrive as None.
    # Propagate the null (as SQL expressions do) instead of raising TypeError.
    if price is None or percentage is None:
        return None
    return price * (1 - percentage/100)

In [0]:
# Quick check of the plain Python function: 20% off a price of 100.
apply_discount(100, 20)

80.0

In [0]:
# Wrap the Python function as a Spark UDF for use with the DataFrame API.
# Declare the return type explicitly: without it, udf() defaults to a string
# return type, so the discounted price would not be a numeric column.
apply_discount_udf = udf(apply_discount, "double")

In [0]:
from pyspark.sql.functions import col, lit

# Apply the UDF row by row: the price column plus a literal 50 (percent) as arguments.
df_discounts = df_books.select("price", apply_discount_udf(col("price"), lit(50)))
display(df_discounts)

price,"apply_discount(price, 50)"
30.0,15.0
38.0,19.0
44.0,22.0
24.0,12.0
41.0,20.5
33.0,16.5
22.0,11.0
47.0,23.5
20.0,10.0
35.0,17.5


In [0]:
# Register the Python function under a SQL-callable name; the returned object can
# also be used from the DataFrame API. The return type is declared explicitly
# because register() defaults to a string return type for plain Python functions.
apply_discount_py_udf = spark.udf.register("apply_discount_sql_udf", apply_discount, "double")

In [0]:
# Use the object returned by spark.udf.register directly in the DataFrame API.
df_discounts = df_books.select("price", apply_discount_py_udf(col("price"), lit(50)))
display(df_discounts)

price,"apply_discount_sql_udf(price, 50)"
30.0,15.0
38.0,19.0
44.0,22.0
24.0,12.0
41.0,20.5
33.0,16.5
22.0,11.0
47.0,23.5
20.0,10.0
35.0,17.5


In [0]:
%sql
-- Call the UDF by the name it was registered under, applying a 50 percent discount.
SELECT price, apply_discount_sql_udf(price, 50) AS price_after_discount
FROM books_silver

price,price_after_discount
30.0,15.0
38.0,19.0
44.0,22.0
24.0,12.0
41.0,20.5
33.0,16.5
22.0,11.0
47.0,23.5
20.0,10.0
35.0,17.5


In [0]:
@udf("double")
def apply_discount_decorator_udf(price, percentage):
    """Spark UDF (declared via decorator) returning ``price`` less ``percentage`` percent.

    Args:
        price: Original price value for the row, or None for a NULL.
        percentage: Discount in percent (e.g. 50 for half price), or None.

    Returns:
        The discounted price (declared to Spark as ``double``), or None when
        either input is None.
    """
    # NULL column values arrive as None; return None rather than raising TypeError.
    if price is None or percentage is None:
        return None
    return price * (1 - percentage/100)

In [0]:
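# The @udf decorator rebinds this name to a Spark UDF object, so it is no longer a
# plain Python function; a direct call with Python numbers (below) is expected to fail.
# apply_discount_decorator_udf(100, 20)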

In [0]:
# Apply the decorator-defined UDF to the price column with a literal 50 percent discount.
df_discounts = df_books.select("price", apply_discount_decorator_udf(col("price"), lit(50)))
display(df_discounts)

price,"apply_discount_decorator_udf(price, 50)"
30.0,15.0
38.0,19.0
44.0,22.0
24.0,12.0
41.0,20.5
33.0,16.5
22.0,11.0
47.0,23.5
20.0,10.0
35.0,17.5


In [0]:
import pandas as pd
from pyspark.sql.functions import pandas_udf

def vectorized_udf(price: pd.Series, percentage: pd.Series) -> pd.Series:
    """Apply a percentage discount element-wise to a series of prices.

    Args:
        price: Series of original prices.
        percentage: Series of discounts in percent, aligned with ``price``.

    Returns:
        Series of discounted prices.
    """
    remaining_fraction = 1 - percentage / 100
    return price * remaining_fraction

# Wrap the pandas function as a vectorized (pandas) UDF that returns doubles.
vectorized_udf = pandas_udf(f=vectorized_udf, returnType="double")

In [0]:
@pandas_udf("double")
def vectorized_udf(price: pd.Series, percentage: pd.Series) -> pd.Series:
    """Pandas UDF (decorator form) applying a percentage discount element-wise.

    Defining it here rebinds the name ``vectorized_udf``.

    Args:
        price: Series of original prices.
        percentage: Series of discounts in percent, aligned with ``price``.

    Returns:
        Series of discounted prices (declared to Spark as ``double``).
    """
    remaining_fraction = 1 - percentage / 100
    return price * remaining_fraction

In [0]:
# Apply the pandas UDF to the price column with a literal 50 percent discount.
# NOTE(review): the name df_domains does not describe its contents (discounted prices);
# the equivalent cells use df_discounts. Consider renaming if nothing else references it.
df_domains = df_books.select("price", vectorized_udf(col("price"), lit(50)))
display(df_domains)

price,"vectorized_udf(price, 50)"
30.0,15.0
38.0,19.0
44.0,22.0
24.0,12.0
41.0,20.5
33.0,16.5
22.0,11.0
47.0,23.5
20.0,10.0
35.0,17.5


In [0]:
# Register the pandas UDF under a SQL-callable name.
spark.udf.register("sql_vectorized_udf", vectorized_udf)

<function __main__.vectorized_udf(price: pandas.core.series.Series, percentage: pandas.core.series.Series) -> pandas.core.series.Series>

In [0]:
%sql
-- Call the registered pandas UDF by its SQL name, applying a 50 percent discount.
SELECT price, sql_vectorized_udf(price, 50) AS price_after_discount
FROM books_silver

price,price_after_discount
30.0,15.0
38.0,19.0
44.0,22.0
24.0,12.0
41.0,20.5
33.0,16.5
22.0,11.0
47.0,23.5
20.0,10.0
35.0,17.5
