In [1]:
import warnings

# Install a global 'ignore' filter so that no warnings are shown in the cell
# outputs below (presumably to keep the tutorial output uncluttered).
# Note: this also hides deprecation notices raised by pandas/numpy.
warnings.filterwarnings('ignore')

# 2 pandas中的transform

## 2.1 transform作用于Series

In [2]:
import pandas as pd
import numpy as np

# Load the penguins dataset from CSV and display it.
# No rows are dropped here, so missing values (NaN) remain in the frame.
penguins = pd.read_csv('penguins.csv')
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


- **单个变换函数**

In [3]:
# Natural-log transform of the bill length column; the result is a Series
# with the same index as the input.
(
    penguins['bill_length_mm']
    .transform(np.log)
)

0      3.666122
1      3.676301
2      3.696351
3           NaN
4      3.602777
         ...   
339    4.021774
340    3.772761
341    3.903991
342    3.927896
343    3.916015
Name: bill_length_mm, Length: 344, dtype: float64

In [4]:
# transform also accepts an anonymous function: here, add 1 to each value.
(
    penguins['bill_length_mm']
    .transform(lambda values: values + 1)
)

0      40.1
1      40.5
2      41.3
3       NaN
4      37.7
       ... 
339    56.8
340    44.5
341    50.6
342    51.8
343    51.2
Name: bill_length_mm, Length: 344, dtype: float64

- **多个变换函数**

In [5]:
# Passing a list of functions yields a DataFrame with one column per function
# (columns are labelled after the functions).
(
    penguins['bill_length_mm']
    .transform([np.log, lambda values: values + 1, np.sqrt])
)

Unnamed: 0,log,<lambda>,sqrt
0,3.666122,40.1,6.252999
1,3.676301,40.5,6.284903
2,3.696351,41.3,6.348228
3,,,
4,3.602777,37.7,6.058052
...,...,...,...
339,4.021774,56.8,7.469940
340,3.772761,44.5,6.595453
341,3.903991,50.6,7.042727
342,3.927896,51.8,7.127412


In [6]:
# Standardize the column with transform:
# subtract its mean, then divide by its standard deviation.
(
    penguins['bill_length_mm']
    .transform(lambda values: (values - values.mean()) / values.std())
)

0     -0.883205
1     -0.809939
2     -0.663408
3           NaN
4     -1.322799
         ...   
339    2.175637
340   -0.077282
341    1.040019
342    1.259816
343    1.149917
Name: bill_length_mm, Length: 344, dtype: float64

## 2.2 transform作用于DataFrame

In [7]:
# Standardize each column in the 'bill_length_mm'..'body_mass_g' label slice
# separately (each column uses its own mean and standard deviation).
penguins.loc[:, 'bill_length_mm':'body_mass_g'].transform(
    lambda values: (values - values.mean()) / values.std()
)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,-0.883205,0.784300,-1.416272,-0.563317
1,-0.809939,0.126003,-1.060696,-0.500969
2,-0.663408,0.429833,-0.420660,-1.186793
3,,,,
4,-1.322799,1.088129,-0.562890,-0.937403
...,...,...,...,...
339,2.175637,1.341320,0.432721,-0.251578
340,-0.077282,0.480471,0.077145,-0.999750
341,1.040019,0.531109,-0.562890,-0.532143
342,1.259816,0.936215,0.646066,-0.126883


In [8]:
# Apply a list of functions to every selected column; the result carries
# two-level column labels (original column, function name).
penguins.loc[:, 'bill_length_mm':'body_mass_g'].transform(
    [np.log, lambda values: values + 1]
)

Unnamed: 0_level_0,bill_length_mm,bill_length_mm,bill_depth_mm,bill_depth_mm,flipper_length_mm,flipper_length_mm,body_mass_g,body_mass_g
Unnamed: 0_level_1,log,<lambda_0>,log,<lambda_0>,log,<lambda_0>,log,<lambda_0>
0,3.666122,40.1,2.928524,19.7,5.198497,182.0,8.229511,3751.0
1,3.676301,40.5,2.856470,18.4,5.225747,187.0,8.242756,3801.0
2,3.696351,41.3,2.890372,19.0,5.273000,196.0,8.086410,3251.0
3,,,,,,,,
4,3.602777,37.7,2.960105,20.3,5.262690,194.0,8.146130,3451.0
...,...,...,...,...,...,...,...,...
339,4.021774,56.8,2.985682,20.8,5.332719,208.0,8.294050,4001.0
340,3.772761,44.5,2.895912,19.1,5.308268,203.0,8.131531,3401.0
341,3.903991,50.6,2.901422,19.2,5.262690,194.0,8.236156,3776.0
342,3.927896,51.8,2.944439,20.0,5.347108,211.0,8.318742,4101.0


In [9]:
# Use a dict to assign a different transform to each column; a list value
# applies several functions to that single column.
penguins.loc[:, 'bill_length_mm':'body_mass_g'].transform(
    {
        'bill_length_mm': np.log,
        'bill_depth_mm': lambda values: (values - values.mean()) / values.std(),
        'flipper_length_mm': np.log,
        'body_mass_g': [np.log, np.sqrt],
    }
)

Unnamed: 0_level_0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,body_mass_g
Unnamed: 0_level_1,log,<lambda>,log,log,sqrt
0,3.666122,0.784300,5.198497,8.229511,61.237244
1,3.676301,0.126003,5.225747,8.242756,61.644140
2,3.696351,0.429833,5.273000,8.086410,57.008771
3,,,,,
4,3.602777,1.088129,5.262690,8.146130,58.736701
...,...,...,...,...,...
339,4.021774,1.341320,5.332719,8.294050,63.245553
340,3.772761,0.480471,5.308268,8.131531,58.309519
341,3.903991,0.531109,5.262690,8.236156,61.441029
342,3.927896,0.936215,5.347108,8.318742,64.031242


## 2.3 transform作用于DataFrame的分组过程

In [10]:
# Group by species and fill missing measurements with the group's mean,
# rounded to 2 decimals; the result keeps the original row index.
penguins.groupby('species')[
    ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
].transform(lambda values: values.fillna(values.mean().round(2)))

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,39.10,18.70,181.00,3750.00
1,39.50,17.40,186.00,3800.00
2,40.30,18.00,195.00,3250.00
3,38.79,18.35,189.95,3700.66
4,36.70,19.30,193.00,3450.00
...,...,...,...,...
339,55.80,19.80,207.00,4000.00
340,43.50,18.10,202.00,3400.00
341,49.60,18.20,193.00,3775.00
342,50.80,19.00,210.00,4100.00
