In [0]:
import pandas as pd

from pyspark.sql.functions import pandas_udf, col, PandasUDFType
from pyspark.sql.types import IntegerType

from typing import Iterator, Tuple

In [0]:
superstore_df = spark.sql('select * from superstore_data')

superstore_df.display()

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back",731.94,3,0.0,219.582
3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters by Universal,14.62,2,0.0,6.8714
4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164
6,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,FUR-FU-10001487,Furniture,Furnishings,"Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood",48.86,7,0.0,14.1694
7,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,OFF-AR-10002833,Office Supplies,Art,Newell 322,7.28,4,0.0,1.9656
8,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,TEC-PH-10002275,Technology,Phones,Mitel 5320 IP Phone VoIP phone,907.152,6,0.2,90.7152
9,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,OFF-BI-10003910,Office Supplies,Binders,DXL Angle-View Binders with Locking Rings by Samsill,18.504,3,0.2,5.7825
10,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,OFF-AP-10002892,Office Supplies,Appliances,Belkin F5C206VTEL 6 Outlet Surge,114.9,5,0.0,34.47


In [0]:
# Series to series

In [0]:
def year(date: pd.Series) -> pd.Series:
  return pd.to_datetime(date).dt.year
  
year_pandas = pandas_udf(year, returnType=IntegerType())

In [0]:
superstore_df.withColumn('Year', year_pandas(col('Order Date')))\
             .select('Order Date', 'Category', 'Year').display()

Order Date,Category,Year
11/8/2016,Furniture,2016
11/8/2016,Furniture,2016
6/12/2016,Office Supplies,2016
10/11/2015,Furniture,2015
10/11/2015,Office Supplies,2015
6/9/2014,Furniture,2014
6/9/2014,Office Supplies,2014
6/9/2014,Technology,2014
6/9/2014,Office Supplies,2014
6/9/2014,Office Supplies,2014


In [0]:
@pandas_udf('integer', PandasUDFType.SCALAR)
def year(date: pd.Series) -> pd.Series:
  return pd.to_datetime(date).dt.year

@pandas_udf('integer', PandasUDFType.SCALAR)
def month(date: pd.Series) -> pd.Series:
  return pd.to_datetime(date).dt.month

@pandas_udf('integer', PandasUDFType.SCALAR)
def day(date: pd.Series) -> pd.Series:
  return pd.to_datetime(date).dt.day

In [0]:
superstore_df.withColumn('Year', year(col('Order Date')))\
             .withColumn('Month', month(col('Order Date')))\
             .withColumn('Day', day(col('Order Date')))\
             .select('Category', 'Order Date', 'Year', 'Month', 'Day').display()

Category,Order Date,Year,Month,Day
Furniture,11/8/2016,2016,11,8
Furniture,11/8/2016,2016,11,8
Office Supplies,6/12/2016,2016,6,12
Furniture,10/11/2015,2015,10,11
Office Supplies,10/11/2015,2015,10,11
Furniture,6/9/2014,2014,6,9
Office Supplies,6/9/2014,2014,6,9
Technology,6/9/2014,2014,6,9
Office Supplies,6/9/2014,2014,6,9
Office Supplies,6/9/2014,2014,6,9


In [0]:
@pandas_udf('first string, last string')
def firstname_lastname(name: pd.Series) -> pd.Series:
  return name.str.split(expand=True)

In [0]:
superstore_df.select(firstname_lastname('Customer Name')).display()

firstname_lastname(Customer Name)
"List(Claire, Gute)"
"List(Claire, Gute)"
"List(Darrin, Van)"
"List(Sean, O'Donnell)"
"List(Sean, O'Donnell)"
"List(Brosina, Hoffman)"
"List(Brosina, Hoffman)"
"List(Brosina, Hoffman)"
"List(Brosina, Hoffman)"
"List(Brosina, Hoffman)"


In [0]:
# Iterator of series to iterator of series

In [0]:
def expensive_operation_to_compute_discount():
  # Assume this makes an expensive REST API call
  return 0.05

In [0]:
@pandas_udf("float")
def compute_discounted_sales_price(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    
    discount = expensive_operation_to_compute_discount()
    
    for sales_price in iterator:
        yield sales_price - discount * sales_price

In [0]:
superstore_df.select('Product Name', 'Sales', 
                     compute_discounted_sales_price('Sales').alias('Discounted Sales')).display()

Product Name,Sales,Discounted Sales
Bush Somerset Collection Bookcase,261.96,248.862
"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back",731.94,695.343
Self-Adhesive Address Labels for Typewriters by Universal,14.62,13.889
Bretford CR4500 Series Slim Rectangular Table,957.5775,909.6986
Eldon Fold 'N Roll Cart System,22.368,21.2496
"Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood",48.86,46.417
Newell 322,7.28,6.916
Mitel 5320 IP Phone VoIP phone,907.152,861.7944
DXL Angle-View Binders with Locking Rings by Samsill,18.504,17.5788
Belkin F5C206VTEL 6 Outlet Surge,114.9,109.155


In [0]:
# Iterator of multiple series to iterator of series

In [0]:
@pandas_udf("string")
def combine_city_state(iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:    
    for city, state in iterator:
        yield city + ' (' + state + ')'

In [0]:
superstore_df.select('City', 'State', combine_city_state('City', 'State').alias('City (State)')).display()

City,State,City (State)
Henderson,Kentucky,Henderson(Kentucky)
Henderson,Kentucky,Henderson(Kentucky)
Los Angeles,California,Los Angeles(California)
Fort Lauderdale,Florida,Fort Lauderdale(Florida)
Fort Lauderdale,Florida,Fort Lauderdale(Florida)
Los Angeles,California,Los Angeles(California)
Los Angeles,California,Los Angeles(California)
Los Angeles,California,Los Angeles(California)
Los Angeles,California,Los Angeles(California)
Los Angeles,California,Los Angeles(California)


In [0]:
@pandas_udf("float")
def compute_cost(iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:    
    for sales, profit in iterator:
        yield sales - profit

In [0]:
superstore_df.select('Product Name', 'Sales', 'Profit', compute_cost('Sales', 'Profit').alias('Cost')).display()

Product Name,Sales,Profit,Cost
Bush Somerset Collection Bookcase,261.96,41.9136,220.0464
"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back",731.94,219.582,512.358
Self-Adhesive Address Labels for Typewriters by Universal,14.62,6.8714,7.7486
Bretford CR4500 Series Slim Rectangular Table,957.5775,-383.031,1340.6085
Eldon Fold 'N Roll Cart System,22.368,2.5164,19.8516
"Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood",48.86,14.1694,34.6906
Newell 322,7.28,1.9656,5.3144
Mitel 5320 IP Phone VoIP phone,907.152,90.7152,816.4368
DXL Angle-View Binders with Locking Rings by Samsill,18.504,5.7825,12.7215
Belkin F5C206VTEL 6 Outlet Surge,114.9,34.47,80.43


In [0]:
spark.udf.register('compute_cost', compute_cost)

In [0]:
spark.sql("""select `Product Name`, Sales, Profit, compute_cost(Sales, Profit) as Cost 
             from superstore_data""").display()

Product Name,Sales,Profit,Cost
Bush Somerset Collection Bookcase,261.96,41.9136,220.0464
"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back",731.94,219.582,512.358
Self-Adhesive Address Labels for Typewriters by Universal,14.62,6.8714,7.7486
Bretford CR4500 Series Slim Rectangular Table,957.5775,-383.031,1340.6085
Eldon Fold 'N Roll Cart System,22.368,2.5164,19.8516
"Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood",48.86,14.1694,34.6906
Newell 322,7.28,1.9656,5.3144
Mitel 5320 IP Phone VoIP phone,907.152,90.7152,816.4368
DXL Angle-View Binders with Locking Rings by Samsill,18.504,5.7825,12.7215
Belkin F5C206VTEL 6 Outlet Surge,114.9,34.47,80.43


In [0]:
# Series to scalar

In [0]:
@pandas_udf('float')
def average(values: pd.Series) -> float:
  return values.mean()

In [0]:
superstore_df.select(average('Sales').alias('Average Sales')).display()

Average Sales
229.858


In [0]:
@pandas_udf('float')
def median(values: pd.Series) -> float:
  return values.median()

In [0]:
superstore_df.select(median('Sales').alias('Median Sales')).display()

Median Sales
54.49


In [0]:
spark.udf.register('average', average)

In [0]:
spark.sql('select average(profit) from superstore_data').display()

average(profit)
28.656897


In [0]:
spark.sql('select category, average(profit) from superstore_data group by category').display()

category,average(profit)
Furniture,8.699327
Office Supplies,20.32705
Technology,78.752


In [0]:
order_profit_df = spark.sql('select State, Profit from superstore_data')

order_profit_df.display()

State,Profit
Kentucky,41.9136
Kentucky,219.582
California,6.8714
Florida,-383.031
Florida,2.5164
California,14.1694
California,1.9656
California,90.7152
California,5.7825
California,34.47


In [0]:
@pandas_udf('State string, Profit float, Normalized_Profit float', PandasUDFType.GROUPED_MAP)
def subtract_mean(df: pd.DataFrame) -> pd.DataFrame:
  return df.assign(Normalized_Profit=df.Profit - df.Profit.mean())

In [0]:
order_profit_df.groupby('State').apply(subtract_mean).display()

State,Profit,Normalized_Profit
Alabama,56.2032,-38.66279
Alabama,8.0352,-86.83079
Alabama,274.386,179.52
Alabama,11.375,-83.49099
Alabama,5.616,-89.249985
Alabama,74.8524,-20.013588
Alabama,1.9629,-92.90309
Alabama,163.7874,68.92141
Alabama,34.7802,-60.08579
Alabama,45.84,-49.02599
