In [0]:
import pandas as pd

from pyspark.sql.functions import pandas_udf, col, PandasUDFType
from pyspark.sql.types import IntegerType

from typing import Iterator, Tuple

In [0]:
# Load every column of the `superstore_data` table into a Spark DataFrame that the cells below reuse.
# `spark` is not defined in this notebook; presumably it is the session object injected by the runtime.
superstore_df = spark.sql('select * from superstore_data')

# Render the DataFrame in the cell output.
superstore_df.display()

In [0]:
# Series to Series: the UDF receives a pandas Series per batch and returns a Series of the same length

In [0]:
def year(date: pd.Series) -> pd.Series:
  """Return the calendar year of each entry in ``date``.

  The entries are first parsed with ``pd.to_datetime``, so any value that
  function accepts (for example date strings) may be passed in.
  """
  parsed = pd.to_datetime(date)
  return parsed.dt.year
  
# Wrap the plain `year` function as a pandas UDF whose result is a Spark integer column.
year_pandas = pandas_udf(f=year, returnType=IntegerType())

In [0]:
# Add a Year column computed by the UDF and show it next to the source columns.
(
    superstore_df
    .withColumn('Year', year_pandas(col('Order Date')))
    .select('Order Date', 'Category', 'Year')
    .display()
)

In [0]:
# NOTE: this rebinds the name `year`, replacing the plain function of the same
# name defined in an earlier cell; re-running that earlier cell restores it.
# The UDF kind (scalar, Series -> Series) is inferred from the type hints, so
# the deprecated `PandasUDFType.SCALAR` argument is not passed.
@pandas_udf('integer')
def year(date: pd.Series) -> pd.Series:
  """Pandas UDF returning the calendar year of each value in ``date``.

  Args:
    date: batch of values parseable by ``pd.to_datetime``.

  Returns:
    A Series of the same length holding the year of each entry.
  """
  return pd.to_datetime(date).dt.year

# The UDF kind (scalar, Series -> Series) is inferred from the type hints, so
# the deprecated `PandasUDFType.SCALAR` argument is not passed.
@pandas_udf('integer')
def month(date: pd.Series) -> pd.Series:
  """Pandas UDF returning the month number of each value in ``date``.

  Args:
    date: batch of values parseable by ``pd.to_datetime``.

  Returns:
    A Series of the same length holding the month of each entry.
  """
  return pd.to_datetime(date).dt.month

# The UDF kind (scalar, Series -> Series) is inferred from the type hints, so
# the deprecated `PandasUDFType.SCALAR` argument is not passed.
@pandas_udf('integer')
def day(date: pd.Series) -> pd.Series:
  """Pandas UDF returning the day of the month of each value in ``date``.

  Args:
    date: batch of values parseable by ``pd.to_datetime``.

  Returns:
    A Series of the same length holding the day of each entry.
  """
  return pd.to_datetime(date).dt.day

In [0]:
# Break 'Order Date' into Year / Month / Day columns using the three UDFs.
order_date = col('Order Date')
(
    superstore_df
    .withColumn('Year', year(order_date))
    .withColumn('Month', month(order_date))
    .withColumn('Day', day(order_date))
    .select('Category', 'Order Date', 'Year', 'Month', 'Day')
    .display()
)

In [0]:
@pandas_udf('first string, last string')
def firstname_lastname(name: pd.Series) -> pd.DataFrame:
  """Pandas UDF splitting a full name into a ``(first, last)`` struct.

  The name is split on the first run of whitespace only, so everything after
  the first token ends up in ``last``. A name with a single token gets a null
  ``last``.

  Args:
    name: batch of full-name strings.

  Returns:
    A DataFrame with exactly two columns, ``first`` and ``last``, matching
    the struct schema declared in the decorator.
  """
  # n=1 caps the split at two pieces; an unbounded split would yield one
  # column per token and break the two-field schema for longer names.
  parts = name.str.split(n=1, expand=True)
  # Guarantee both columns exist even when no name in the batch has a second
  # token (or the batch is empty); keep object dtype so a padded column is
  # still treated as nullable strings rather than floats.
  parts = parts.reindex(columns=[0, 1]).astype(object)
  # Label the columns so they line up with the struct fields by name.
  parts.columns = ['first', 'last']
  return parts

In [0]:
# Apply the name-splitting UDF to 'Customer Name' and show the resulting struct column.
(
    superstore_df
    .select(firstname_lastname('Customer Name'))
    .display()
)

In [0]:
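# Iterator of Series to Iterator of Series: the UDF consumes batches lazily, so one-time setup can run before the loop instead of once per batch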

In [0]:
def expensive_operation_to_compute_discount():
  """Return the discount rate as a fraction (a fixed 0.05).

  Stand-in for a costly lookup, e.g. a REST API call.
  """
  discount_rate = 0.05
  return discount_rate

In [0]:
@pandas_udf("float")
def compute_discounted_sales_price(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Yield each batch of sales prices with the discount taken off.

    The discount rate is looked up a single time, ahead of the loop, and then
    reused for every batch drawn from ``iterator``.
    """
    rate = expensive_operation_to_compute_discount()

    for batch in iterator:
        reduction = rate * batch
        yield batch - reduction

In [0]:
# Show each product's sales next to its discounted price.
(
    superstore_df
    .select(
        'Product Name',
        'Sales',
        compute_discounted_sales_price('Sales').alias('Discounted Sales'),
    )
    .display()
)

In [0]:
# Iterator of multiple Series to Iterator of Series: each batch arrives as a tuple holding one Series per input column

In [0]:
@pandas_udf("string")
def combine_city_state(iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
    """Yield ``"<city> (<state>)"`` labels for every batch of city/state pairs."""
    for city_batch, state_batch in iterator:
        opening = city_batch + ' ('
        yield opening + state_batch + ')'

In [0]:
# Show City and State next to the combined "City (State)" label.
(
    superstore_df
    .select(
        'City',
        'State',
        combine_city_state('City', 'State').alias('City (State)'),
    )
    .display()
)

In [0]:
@pandas_udf("float")
def compute_cost(iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
    """Yield cost per row, computed as sales minus profit, batch by batch."""
    for batch in iterator:
        sales, profit = batch
        yield sales - profit

In [0]:
# Show Sales and Profit next to the derived Cost column.
(
    superstore_df
    .select(
        'Product Name',
        'Sales',
        'Profit',
        compute_cost('Sales', 'Profit').alias('Cost'),
    )
    .display()
)

In [0]:
# Expose the UDF to SQL queries under the name `compute_cost`.
spark.udf.register(name='compute_cost', f=compute_cost)

In [0]:
# Call `compute_cost` from SQL; this relies on the name having been registered via spark.udf.register.
spark.sql("""select `Product Name`, Sales, Profit, compute_cost(Sales, Profit) as Cost 
             from superstore_data""").display()

In [0]:
# Series to scalar: an aggregating UDF that reduces a whole Series to a single value

In [0]:
@pandas_udf('float')
def average(values: pd.Series) -> float:
  """Aggregate a Series down to its arithmetic mean."""
  mean_value = values.mean()
  return mean_value

In [0]:
# Aggregate the whole Sales column into one average value.
(
    superstore_df
    .select(average('Sales').alias('Average Sales'))
    .display()
)

In [0]:
@pandas_udf('float')
def median(values: pd.Series) -> float:
  """Aggregate a Series down to its median."""
  middle_value = values.median()
  return middle_value

In [0]:
# Aggregate the whole Sales column into one median value.
(
    superstore_df
    .select(median('Sales').alias('Median Sales'))
    .display()
)

In [0]:
# Expose the aggregating UDF to SQL queries under the name `average`.
spark.udf.register(name='average', f=average)

In [0]:
# Use the registered `average` UDF from SQL over the whole table.
(
    spark
    .sql('select average(profit) from superstore_data')
    .display()
)

In [0]:
# Use the registered `average` UDF from SQL as a per-category aggregate.
(
    spark
    .sql('select category, average(profit) from superstore_data group by category')
    .display()
)

In [0]:
# Keep only the State and Profit columns; this is the input for the per-state normalisation below.
order_profit_df = spark.sql('select State, Profit from superstore_data')

# Render the two-column DataFrame in the cell output.
order_profit_df.display()

In [0]:
# NOTE(review): PandasUDFType.GROUPED_MAP is the older grouped-map style; newer
# Spark releases document `applyInPandas` as its replacement. Confirm before migrating.
@pandas_udf('State string, Profit float, Normalized_Profit float', PandasUDFType.GROUPED_MAP)
def subtract_mean(df: pd.DataFrame) -> pd.DataFrame:
  """Return ``df`` with an extra ``Normalized_Profit`` column.

  The new column is each row's ``Profit`` minus the mean ``Profit`` of the
  frame passed in.
  """
  profit = df['Profit']
  centered = profit - profit.mean()
  return df.assign(Normalized_Profit=centered)

In [0]:
# Run `subtract_mean` once per State group and show the combined result.
(
    order_profit_df
    .groupby('State')
    .apply(subtract_mean)
    .display()
)