## import所需的包

In [1]:
import pandas as pd
import numpy as np
import os
import random



## 设置随机种子

In [2]:
def seed_everything(seed=0):
    """Seed every source of randomness this notebook relies on.

    Args:
        seed: Integer seed handed to Python's ``random`` module and to
            NumPy's global generator; its string form is also stored in
            the ``PYTHONHASHSEED`` environment variable.

    Returns:
        None.
    """
    # NOTE: PYTHONHASHSEED is read when an interpreter starts, so setting it
    # here influences processes launched afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Fix the seed once, before any random data is drawn, so the generated frame is reproducible.
seed_everything(1026)

## 准备数据

In [3]:
# Build a 10-row demo frame with one continuous and one categorical column.
df = pd.DataFrame({
    # Scale to [0, 100] first and round afterwards: rounding to 2 decimals and
    # *then* multiplying by 100 can reintroduce binary floating-point error
    # (e.g. 28.999999999999996 instead of 29.0).
    "continuous_var": (np.random.random(10) * 100).round(),
    # Integer codes 1..3 (upper bound of randint is exclusive).
    "discrete_var": np.random.randint(1, 4, size=10),
})
# Replace the integer codes with string category labels.
df['discrete_var'] = df['discrete_var'].map({1: 'A', 2: 'B', 3: 'C'})

In [4]:
df

Unnamed: 0,continuous_var,discrete_var
0,35.0,B
1,84.0,B
2,96.0,C
3,19.0,A
4,39.0,C
5,29.0,B
6,69.0,C
7,8.0,B
8,4.0,B
9,84.0,B


## 连续型变量

### 分桶特征

In [5]:
# Split the value range into 10 equal-width bins; labels=False stores the
# integer bin index instead of an Interval label.
df['continuous_var_bin'] = pd.cut(
    x=df['continuous_var'],
    bins=10,
    labels=False,
)

In [6]:
df

Unnamed: 0,continuous_var,discrete_var,continuous_var_bin
0,35.0,B,3
1,84.0,B,8
2,96.0,C,9
3,19.0,A,1
4,39.0,C,3
5,29.0,B,2
6,69.0,C,7
7,8.0,B,0
8,4.0,B,0
9,84.0,B,8


### box-cox变换

In [7]:
import scipy.stats as st

# Box-Cox power transform. With no lambda supplied, scipy fits one and returns
# it next to the transformed values. The fitted lambda is kept under a real
# name because it is needed to invert the transform or to apply the same
# transform to new data; binding it to `_` would also collide with the `_`
# variable IPython uses for the last cell output.
# NOTE: st.boxcox only accepts strictly positive input.
continuous_var_boxcox, continuous_var_boxcox_lambda = st.boxcox(df['continuous_var'])
df['continuous_var_box-cox'] = continuous_var_boxcox

In [8]:
df

Unnamed: 0,continuous_var,discrete_var,continuous_var_bin,continuous_var_box-cox
0,35.0,B,3,9.436134
1,84.0,B,8,15.47757
2,96.0,C,9,16.644178
3,19.0,A,1,6.500577
4,39.0,C,3,10.052343
5,29.0,B,2,8.438842
6,69.0,C,7,13.890122
7,8.0,B,0,3.577364
8,4.0,B,0,1.972265
9,84.0,B,8,15.47757


#### log变换

In [9]:
# log(1 + x) transform of the continuous column.
df['continuous_var_log1p'] = df['continuous_var'].pipe(np.log1p)

In [10]:
df

Unnamed: 0,continuous_var,discrete_var,continuous_var_bin,continuous_var_box-cox,continuous_var_log1p
0,35.0,B,3,9.436134,3.583519
1,84.0,B,8,15.47757,4.442651
2,96.0,C,9,16.644178,4.574711
3,19.0,A,1,6.500577,2.995732
4,39.0,C,3,10.052343,3.688879
5,29.0,B,2,8.438842,3.401197
6,69.0,C,7,13.890122,4.248495
7,8.0,B,0,3.577364,2.197225
8,4.0,B,0,1.972265,1.609438
9,84.0,B,8,15.47757,4.442651


In [11]:
# Inverse operation: np.expm1 undoes np.log1p, so subtracting the original
# column should leave only floating-point round-off.
df['continuous_var_log1p'].pipe(np.expm1).sub(df['continuous_var'])

0    0.000000e+00
1    1.421085e-14
2    0.000000e+00
3   -3.552714e-15
4    0.000000e+00
5    7.105427e-15
6    2.842171e-14
7    1.776357e-15
8   -4.440892e-16
9    1.421085e-14
dtype: float64

### 排序特征

In [12]:
# Ascending rank of each value; tied values share the average of their ranks
# (pandas' default method, spelled out here).
df['continuous_var_rank'] = df['continuous_var'].rank(method='average', ascending=True)

In [13]:
df

Unnamed: 0,continuous_var,discrete_var,continuous_var_bin,continuous_var_box-cox,continuous_var_log1p,continuous_var_rank
0,35.0,B,3,9.436134,3.583519,5.0
1,84.0,B,8,15.47757,4.442651,8.5
2,96.0,C,9,16.644178,4.574711,10.0
3,19.0,A,1,6.500577,2.995732,3.0
4,39.0,C,3,10.052343,3.688879,6.0
5,29.0,B,2,8.438842,3.401197,4.0
6,69.0,C,7,13.890122,4.248495,7.0
7,8.0,B,0,3.577364,2.197225,2.0
8,4.0,B,0,1.972265,1.609438,1.0
9,84.0,B,8,15.47757,4.442651,8.5


## 离散型变量

### 计数特征

In [14]:
# Count feature: number of rows sharing each category, broadcast back onto
# every row of that category.
df['discrete_var_count'] = (
    df.groupby(by='discrete_var')['discrete_var']
      .transform('count')
)

In [15]:
df

Unnamed: 0,continuous_var,discrete_var,continuous_var_bin,continuous_var_box-cox,continuous_var_log1p,continuous_var_rank,discrete_var_count
0,35.0,B,3,9.436134,3.583519,5.0,6
1,84.0,B,8,15.47757,4.442651,8.5,6
2,96.0,C,9,16.644178,4.574711,10.0,3
3,19.0,A,1,6.500577,2.995732,3.0,1
4,39.0,C,3,10.052343,3.688879,6.0,3
5,29.0,B,2,8.438842,3.401197,4.0,6
6,69.0,C,7,13.890122,4.248495,7.0,3
7,8.0,B,0,3.577364,2.197225,2.0,6
8,4.0,B,0,1.972265,1.609438,1.0,6
9,84.0,B,8,15.47757,4.442651,8.5,6


### 离散型变量的排序特征

In [16]:
# Ordinal feature: hand-assigned order for the categories (A < B < C).
df['discrete_var_rank'] = df['discrete_var'].map({
    'A': 1,
    'B': 2,
    'C': 3,
})

In [17]:
df

Unnamed: 0,continuous_var,discrete_var,continuous_var_bin,continuous_var_box-cox,continuous_var_log1p,continuous_var_rank,discrete_var_count,discrete_var_rank
0,35.0,B,3,9.436134,3.583519,5.0,6,2
1,84.0,B,8,15.47757,4.442651,8.5,6,2
2,96.0,C,9,16.644178,4.574711,10.0,3,3
3,19.0,A,1,6.500577,2.995732,3.0,1,1
4,39.0,C,3,10.052343,3.688879,6.0,3,3
5,29.0,B,2,8.438842,3.401197,4.0,6,2
6,69.0,C,7,13.890122,4.248495,7.0,3,3
7,8.0,B,0,3.577364,2.197225,2.0,6,2
8,4.0,B,0,1.972265,1.609438,1.0,6,2
9,84.0,B,8,15.47757,4.442651,8.5,6,2


### label-encoding

In [18]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the category column, then replace each label with its
# integer code. `le` stays available afterwards as the fitted encoder.
# NOTE(review): scikit-learn documents LabelEncoder as meant for target
# labels; OrdinalEncoder is the documented choice for feature columns.
le = LabelEncoder().fit(df['discrete_var'])
df['discrete_var_LabelEncoder'] = le.transform(df['discrete_var'])

In [19]:
df

Unnamed: 0,continuous_var,discrete_var,continuous_var_bin,continuous_var_box-cox,continuous_var_log1p,continuous_var_rank,discrete_var_count,discrete_var_rank,discrete_var_LabelEncoder
0,35.0,B,3,9.436134,3.583519,5.0,6,2,1
1,84.0,B,8,15.47757,4.442651,8.5,6,2,1
2,96.0,C,9,16.644178,4.574711,10.0,3,3,2
3,19.0,A,1,6.500577,2.995732,3.0,1,1,0
4,39.0,C,3,10.052343,3.688879,6.0,3,3,2
5,29.0,B,2,8.438842,3.401197,4.0,6,2,1
6,69.0,C,7,13.890122,4.248495,7.0,3,3,2
7,8.0,B,0,3.577364,2.197225,2.0,6,2,1
8,4.0,B,0,1.972265,1.609438,1.0,6,2,1
9,84.0,B,8,15.47757,4.442651,8.5,6,2,1


### one-hot

In [20]:
# Preview the one-hot encoding of the category column. dtype=int pins 0/1
# integer indicators; the default dtype of get_dummies differs between pandas
# versions (bool in pandas 2.x), which would render True/False instead.
pd.get_dummies(df['discrete_var'], prefix='discrete_var', dtype=int)

Unnamed: 0,discrete_var_A,discrete_var_B,discrete_var_C
0,0,1,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,0,1
5,0,1,0
6,0,0,1
7,0,1,0
8,0,1,0
9,0,1,0


In [21]:
# Append the one-hot indicator columns to the frame. dtype=int keeps them as
# 0/1 integers regardless of the pandas version's default.
discrete_var_onehot = pd.get_dummies(df['discrete_var'], prefix='discrete_var', dtype=int)
# Drop indicator columns left over from a previous run of this cell first, so
# re-running it does not append duplicate columns.
df = pd.concat(
    [df.drop(columns=discrete_var_onehot.columns, errors='ignore'), discrete_var_onehot],
    axis=1,
)

In [22]:
df

Unnamed: 0,continuous_var,discrete_var,continuous_var_bin,continuous_var_box-cox,continuous_var_log1p,continuous_var_rank,discrete_var_count,discrete_var_rank,discrete_var_LabelEncoder,discrete_var_A,discrete_var_B,discrete_var_C
0,35.0,B,3,9.436134,3.583519,5.0,6,2,1,0,1,0
1,84.0,B,8,15.47757,4.442651,8.5,6,2,1,0,1,0
2,96.0,C,9,16.644178,4.574711,10.0,3,3,2,0,0,1
3,19.0,A,1,6.500577,2.995732,3.0,1,1,0,1,0,0
4,39.0,C,3,10.052343,3.688879,6.0,3,3,2,0,0,1
5,29.0,B,2,8.438842,3.401197,4.0,6,2,1,0,1,0
6,69.0,C,7,13.890122,4.248495,7.0,3,3,2,0,0,1
7,8.0,B,0,3.577364,2.197225,2.0,6,2,1,0,1,0
8,4.0,B,0,1.972265,1.609438,1.0,6,2,1,0,1,0
9,84.0,B,8,15.47757,4.442651,8.5,6,2,1,0,1,0
