In [1]:
# Transform - broadcasting
import pandas as pd

# Let's say we have thermometer measurements, but we discover that they all
# read 1 degree too low. We can correct them like this.

# Sample thermometer readings: four named columns with five values each.
temperatures = pd.DataFrame({
  "north": [15, 10, 12, 19, -5],
  "east": [18, 12, 20, 10, 7],
  "south": [23, 22, 12, 27, 11],
  "west": [17, 13, 19, 8, 4],
})

def add1(col):
  """Return `col` with 1 added (the correction for readings that are 1 too low)."""
  corrected = col + 1
  return corrected

# Run add1 through transform; the bare expression is what the cell displays.
temperatures.transform(func=add1)

Unnamed: 0,north,east,south,west
0,16,19,24,18
1,11,13,23,14
2,13,21,13,20
3,20,11,28,9
4,-4,8,12,5


In [2]:
# Transform - complex
import pandas as pd

# Let's say we have thermometer measurements and we want to know how far off
# the average (for the region) each measurement is.

# Sample thermometer readings, one column per region.
temperatures = pd.DataFrame({
  "north": [15, 10, 12, 19, -5],
  "east": [18, 12, 20, 10, 7],
  "south": [23, 22, 12, 27, 11],
  "west": [17, 13, 19, 8, 4],
})

def deviation_from_average(col):
  """Return the absolute distance of every value in `col` from `col.mean()`."""
  offsets_from_mean = col - col.mean()
  return abs(offsets_from_mean)

# Apply the per-column deviation; the bare expression is the cell's output.
temperatures.transform(func=deviation_from_average)

Unnamed: 0,north,east,south,west
0,4.8,4.6,4.0,4.8
1,0.2,1.4,3.0,0.8
2,1.8,6.6,7.0,6.8
3,8.8,3.4,8.0,4.2
4,15.2,6.4,8.0,8.2


In [3]:
# Transform - passing a string

import pandas as pd

# Let's say we have thermometer measurements and we want to round them.

# Readings with fractional parts, so that rounding has a visible effect.
temperatures = pd.DataFrame({
  "north": [15.12, 10.32, 12.44, 19.01, -5],
  "east": [18, 12.44, 20.65, 10.67, 7.99],
  "south": [23.12, 22.5346, 12.124, 27.2356, 11.19],
  "west": [17.2534, 13.2534, 19.3645, 8.2374, 4.83472],
})

# The transformation is selected by name instead of by passing a callable.
temperatures.transform(func='round')

Unnamed: 0,north,east,south,west
0,15.0,18.0,23.0,17.0
1,10.0,12.0,23.0,13.0
2,12.0,21.0,12.0,19.0
3,19.0,11.0,27.0,8.0
4,-5.0,8.0,11.0,5.0


In [4]:
import pandas as pd

# Let's say we have thermometer measurements and we want to view the distance to
# the average temperature of the region and the distance to the average
# temperature of all measurements.

# Sample thermometer readings, one column per region.
temperatures = pd.DataFrame({
  "north": [15, 10, 12, 19, -5],
  "east": [18, 12, 20, 10, 7],
  "south": [23, 22, 12, 27, 11],
  "west": [17, 13, 19, 8, 4],
})

def deviation_from_average_per_region(col):
  """Return how far each value in `col` lies from the mean of `col` itself."""
  region_mean = col.mean()
  distance = abs(col - region_mean)
  return distance

def deviation_from_average_overall(col, overall_mean=None):
  """Return the absolute distance of `col` from the overall average.

  Args:
    col: Value(s) to compare against the overall average.
    overall_mean: Optional precomputed overall average. When omitted, it is
      derived from the module-level `temperatures` frame as the mean of its
      per-column means, which is what this function always did.

  Returns:
    abs(col - overall_mean).
  """
  if overall_mean is None:
    # Fallback keeps the one-argument call made by transform working, at the
    # cost of recomputing the average on every call.
    overall_mean = temperatures.mean().mean()
  return abs(col - overall_mean)


# Passing several functions yields one result column per (column, function) pair.
temperatures.transform(
  func=[
    deviation_from_average_per_region,
    deviation_from_average_overall,
  ]
)

Unnamed: 0_level_0,north,north,east,east,south,south,west,west
Unnamed: 0_level_1,deviation_from_average_per_region,deviation_from_average_overall,deviation_from_average_per_region,deviation_from_average_overall,deviation_from_average_per_region,deviation_from_average_overall,deviation_from_average_per_region,deviation_from_average_overall
0,4.8,1.3,4.6,4.3,4.0,9.3,4.8,3.3
1,0.2,3.7,1.4,1.7,3.0,8.3,0.8,0.7
2,1.8,1.7,6.6,6.3,7.0,1.7,6.8,5.3
3,8.8,5.3,3.4,3.7,8.0,13.3,4.2,5.7
4,15.2,18.7,6.4,6.7,8.0,2.7,8.2,9.7


In [5]:
import pandas as pd

# Let's say we have thermometer measurements. Each row in the dataframe contains
# the measurements of different regions taken at the same time.
# We now want to know the distance each measurement has to the average of all
# measurements at the same time (so the same row).
# We also want to know the distance each measurement has to the average of all
# measurements (so the whole dataframe).

# One column per region; every row groups readings taken at the same time.
temperatures = pd.DataFrame({
  "north": [15, 10, 12, 19, -5],
  "east": [18, 12, 20, 10, 7],
  "south": [23, 22, 12, 27, 11],
  "west": [17, 13, 19, 8, 4],
})


def deviation_from_average_per_moment(row):
  """Return how far each value in `row` lies from the mean of `row` itself."""
  moment_mean = row.mean()
  distance = abs(row - moment_mean)
  return distance

def deviation_from_average_overall(row, overall_mean=None):
  """Return the absolute distance of `row` from the overall average.

  Args:
    row: Value(s) to compare against the overall average.
    overall_mean: Optional precomputed overall average. When omitted, it is
      derived from the module-level `temperatures` frame as the mean of its
      per-column means, which is what this function always did.

  Returns:
    abs(row - overall_mean).
  """
  if overall_mean is None:
    # Fallback keeps the one-argument call made by transform working, at the
    # cost of recomputing the average on every call.
    overall_mean = temperatures.mean().mean()
  return abs(row - overall_mean)

def original(series):
  """Identity transform: hand back `series` exactly as it was received."""
  untouched = series
  return untouched

# axis=1 hands each function a row of the frame instead of a column.
temperatures.transform(
  func=[
    original,  # To view the original
    deviation_from_average_per_moment,
    deviation_from_average_overall,
  ],
  axis=1,
)

Unnamed: 0,Unnamed: 1,north,east,south,west
0,original,15.0,18.0,23.0,17.0
0,deviation_from_average_per_moment,3.25,0.25,4.75,1.25
0,deviation_from_average_overall,1.3,4.3,9.3,3.3
1,original,10.0,12.0,22.0,13.0
1,deviation_from_average_per_moment,4.25,2.25,7.75,1.25
1,deviation_from_average_overall,3.7,1.7,8.3,0.7
2,original,12.0,20.0,12.0,19.0
2,deviation_from_average_per_moment,3.75,4.25,3.75,3.25
2,deviation_from_average_overall,1.7,6.3,1.7,5.3
3,original,19.0,10.0,27.0,8.0


In [6]:
import pandas as pd

# Let's say we have a list of products in a dataframe. We want to capitalize the
# product name and at the same time increase the price by 10% and round it.

# Product catalogue: a name, a price and a barcode per product.
sales = pd.DataFrame({
  "product_name": ["tofu", "tempeh", "seitan", "jackfruit", "banana flower"],
  "price": [23, 22, 12, 27, 11],
  "barcode": ["ABC", "CDE", "EFG", "GHI", "JKL"],
})

# Show the untouched frame first, for comparison with the transformed one.
display(sales)

# One transformation per listed column; columns missing from the mapping
# (here: barcode) are not part of the result.
sales.transform(func={
  "product_name": str.capitalize,
  "price": lambda price: round(price * 1.1),
})

Unnamed: 0,product_name,price,barcode
0,tofu,23,ABC
1,tempeh,22,CDE
2,seitan,12,EFG
3,jackfruit,27,GHI
4,banana flower,11,JKL


Unnamed: 0,product_name,price
0,Tofu,25
1,Tempeh,24
2,Seitan,13
3,Jackfruit,30
4,Banana flower,12


In [7]:
# Transform - passing a dictionary, multiple transformations per column

import pandas as pd

# Let's say we have a list of products in a dataframe. We want to capitalize the
# product name and at the same time calculate the new price and the price before
# VAT.

# Product catalogue: a name and a price per product.
sales = pd.DataFrame({
  "product_name": ["tofu", "tempeh", "seitan", "jackfruit", "banana flower"],
  "price": [23, 22, 12, 27, 11],
})

# Show the untouched frame first, for comparison with the transformed one.
display(sales)

def old_price(price):
  """Identity transform: return `price` unchanged so it shows next to the new values."""
  unchanged = price
  return unchanged

def new_price(price):
  """Return `price` multiplied by 1.1 (a 10% increase), passed through `round`."""
  raised = price * 1.1
  return round(raised)

def price_before_vat(price):
  """Return `price` divided by 1.19, passed through `round`.

  VAT stands for value added tax; the divisor 1.19 implies a 19% rate.
  """
  net = price / 1.19
  return round(net)


# A list as the mapping value runs several transformations on that one column.
sales.transform(func={
  "product_name": str.capitalize,
  "price": [old_price, new_price, price_before_vat],
})

Unnamed: 0,product_name,price
0,tofu,23
1,tempeh,22
2,seitan,12
3,jackfruit,27
4,banana flower,11


Unnamed: 0_level_0,product_name,price,price,price
Unnamed: 0_level_1,capitalize,old_price,new_price,price_before_vat
0,Tofu,23,25,19
1,Tempeh,22,24,18
2,Seitan,12,13,10
3,Jackfruit,27,30,23
4,Banana flower,11,12,9
