## import所需的包

In [1]:
import pandas as pd
import numpy as np
import os
import random



## 设置随机种子

In [2]:
def seed_everything(seed=0):
    """Seed the random sources this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator
    with ``seed``, and stores ``str(seed)`` in the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
            Defaults to 0.

    NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so
    the assignment below presumably only influences processes launched
    afterwards, not the hash randomisation of the running kernel - confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


# Fix the seed once, before any random data is generated.
seed_everything(0)

## 准备数据

In [3]:
# Build a 12-row toy frame with two discrete and two continuous columns:
#   discrete_var1   - the labels A-D, each repeated three times
#   discrete_var2   - random codes 1..3, mapped to 'a'/'b'/'c' below
#   continuous_var* - random integers in [0, 1000)
# The rows are then sorted by discrete_var1 and the index renumbered 0..n-1.
df = (
    pd.DataFrame({
        'discrete_var1': ['A', 'B', 'C', 'D'] * 3,
        'discrete_var2': np.random.randint(low=1, high=4, size=12),
        'continuous_var1': [np.random.randint(0, 1000) for _ in range(12)],
        'continuous_var2': [np.random.randint(0, 1000) for _ in range(12)],
    })
    .sort_values(by='discrete_var1')
    .assign(discrete_var2=lambda frame: frame['discrete_var2'].map({1: 'a', 2: 'b', 3: 'c'}))
    .reset_index(drop=True)
)

In [4]:
# Display the prepared frame (a cell's last expression is rendered as its output).
df

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2
0,A,a,705,916
1,A,b,174,709
2,A,a,537,850
3,B,b,486,115
4,B,c,600,847
5,B,a,845,99
6,C,a,551,976
7,C,a,849,431
8,C,a,72,984
9,D,b,87,755


## 离散特征×连续特征

### 计算 最大值、最小值、中位数、均值、标准差、和

In [5]:
# Per-group maximum, minimum, median, mean, standard deviation and sum of
# continuous_var1, grouped by discrete_var1. Note that 'std' is the standard
# deviation, not the variance. The table is only displayed; df is not modified.
df.groupby('discrete_var1').agg(
    {'continuous_var1': ['max', 'min', 'median', 'mean', 'std', 'sum']}
)

Unnamed: 0_level_0,continuous_var1,continuous_var1,continuous_var1,continuous_var1,continuous_var1,continuous_var1
Unnamed: 0_level_1,max,min,median,mean,std,sum
discrete_var1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,379,21,207.0,202.333333,179.045618,607
B,625,42,319.0,328.666667,291.620187,986
C,607,39,104.0,250.0,310.874573,750
D,929,222,774.0,641.666667,371.61315,1925


In [20]:
# Attach the per-group statistics of continuous_var1 to every row as
# 'continuous_var1-<stat>' columns.
#
# groupby(...).transform(stat) returns a Series aligned to df's own index with
# the group value repeated on each member row, so no intermediate table and no
# merge are needed. assign() overwrites a column that already exists, which
# makes this cell safe to re-run: repeating the earlier aggregate-then-merge
# version on a frame that already held these columns produced '_x'/'_y'
# suffixed duplicates instead.
#
# 'std' is the standard deviation (not the variance).
grouped_var1 = df.groupby('discrete_var1')['continuous_var1']
df = df.assign(**{
    f'continuous_var1-{stat}': grouped_var1.transform(stat)
    for stat in ['max', 'min', 'median', 'mean', 'std', 'sum']
})

In [21]:
# Inspect df after the per-group statistics of continuous_var1 were added.
df

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,continuous_var1-max,continuous_var1-min,continuous_var1-median,continuous_var1-mean,continuous_var1-std,continuous_var1-sum
0,A,b,207,265,379,21,207.0,202.333333,179.045618,607
1,A,b,379,209,379,21,207.0,202.333333,179.045618,607
2,A,c,21,908,379,21,207.0,202.333333,179.045618,607
3,B,c,625,254,625,42,319.0,328.666667,291.620187,986
4,B,c,319,279,625,42,319.0,328.666667,291.620187,986
5,B,a,42,949,625,42,319.0,328.666667,291.620187,986
6,C,a,607,433,607,39,104.0,250.0,310.874573,750
7,C,c,39,230,607,39,104.0,250.0,310.874573,750
8,C,a,104,751,607,39,104.0,250.0,310.874573,750
9,D,c,774,949,929,222,774.0,641.666667,371.61315,1925


### 计算偏度和峰度

In [22]:
# Per-group skewness and kurtosis of continuous_var1, using the scipy.stats
# functions as aggregators. The table is only displayed; df is not modified.
# NOTE(review): the rendered result shows kurtosis = -1.5 for every group, so
# with groups this small the kurtosis column looks constant - confirm it
# carries any signal on real data.
from scipy.stats import kurtosis, skew
df.groupby('discrete_var1').agg(
    {'continuous_var1': [skew, kurtosis]}
)

Unnamed: 0_level_0,continuous_var1,continuous_var1
Unnamed: 0_level_1,skew,kurtosis
discrete_var1,Unnamed: 1_level_2,Unnamed: 2_level_2
A,-0.04785,-1.5
B,0.06083,-1.5
C,0.672488,-1.5
D,-0.571246,-1.5


In [23]:
# Attach the per-group skewness and kurtosis of continuous_var1 to every row
# as 'continuous_var1-skew' and 'continuous_var1-kurtosis'.
#
# `skew` and `kurtosis` are the scipy.stats functions imported in the cell
# that displays these statistics. transform() calls each function once per
# group and broadcasts the scalar it returns over the group's rows, aligned to
# df's index, so no intermediate table or merge is needed. assign() overwrites
# existing columns, so re-running this cell does not create '_x'/'_y'
# duplicates the way repeating an aggregate-then-merge step would.
grouped_var1 = df.groupby('discrete_var1')['continuous_var1']
df = df.assign(**{
    f'continuous_var1-{label}': grouped_var1.transform(func)
    for label, func in (('skew', skew), ('kurtosis', kurtosis))
})

In [24]:
# Inspect df after the per-group skewness and kurtosis columns were added.
df

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,continuous_var1-max,continuous_var1-min,continuous_var1-median,continuous_var1-mean,continuous_var1-std,continuous_var1-sum,continuous_var1-skew,continuous_var1-kurtosis
0,A,b,207,265,379,21,207.0,202.333333,179.045618,607,-0.04785,-1.5
1,A,b,379,209,379,21,207.0,202.333333,179.045618,607,-0.04785,-1.5
2,A,c,21,908,379,21,207.0,202.333333,179.045618,607,-0.04785,-1.5
3,B,c,625,254,625,42,319.0,328.666667,291.620187,986,0.06083,-1.5
4,B,c,319,279,625,42,319.0,328.666667,291.620187,986,0.06083,-1.5
5,B,a,42,949,625,42,319.0,328.666667,291.620187,986,0.06083,-1.5
6,C,a,607,433,607,39,104.0,250.0,310.874573,750,0.672488,-1.5
7,C,c,39,230,607,39,104.0,250.0,310.874573,750,0.672488,-1.5
8,C,a,104,751,607,39,104.0,250.0,310.874573,750,0.672488,-1.5
9,D,c,774,949,929,222,774.0,641.666667,371.61315,1925,-0.571246,-1.5


### 排序特征

In [26]:
# Rank feature: position of each continuous_var1 value inside its
# discrete_var1 group, smallest value first; tied values share the average
# of their positions (both settings are the pandas defaults, spelled out).
df['continuous_var1-rank'] = (
    df.groupby('discrete_var1')['continuous_var1']
      .rank(method='average', ascending=True)
)

In [27]:
# Inspect df after the within-group rank column was added.
df

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,continuous_var1-max,continuous_var1-min,continuous_var1-median,continuous_var1-mean,continuous_var1-std,continuous_var1-sum,continuous_var1-skew,continuous_var1-kurtosis,continuous_var1-rank
0,A,b,207,265,379,21,207.0,202.333333,179.045618,607,-0.04785,-1.5,2.0
1,A,b,379,209,379,21,207.0,202.333333,179.045618,607,-0.04785,-1.5,3.0
2,A,c,21,908,379,21,207.0,202.333333,179.045618,607,-0.04785,-1.5,1.0
3,B,c,625,254,625,42,319.0,328.666667,291.620187,986,0.06083,-1.5,3.0
4,B,c,319,279,625,42,319.0,328.666667,291.620187,986,0.06083,-1.5,2.0
5,B,a,42,949,625,42,319.0,328.666667,291.620187,986,0.06083,-1.5,1.0
6,C,a,607,433,607,39,104.0,250.0,310.874573,750,0.672488,-1.5,3.0
7,C,c,39,230,607,39,104.0,250.0,310.874573,750,0.672488,-1.5,1.0
8,C,a,104,751,607,39,104.0,250.0,310.874573,750,0.672488,-1.5,2.0
9,D,c,774,949,929,222,774.0,641.666667,371.61315,1925,-0.571246,-1.5,2.0


### 百分比

In [28]:
# Percentage feature: each row's continuous_var1 as a share (0-100) of the
# total continuous_var1 of its discrete_var1 group.
group_total = df.groupby('discrete_var1')['continuous_var1'].transform('sum')
df['percentage'] = 100 * df['continuous_var1'] / group_total

In [29]:
# Inspect df after the percentage column was added.
df

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,continuous_var1-max,continuous_var1-min,continuous_var1-median,continuous_var1-mean,continuous_var1-std,continuous_var1-sum,continuous_var1-skew,continuous_var1-kurtosis,continuous_var1-rank,percentage
0,A,b,207,265,379,21,207.0,202.333333,179.045618,607,-0.04785,-1.5,2.0,34.102142
1,A,b,379,209,379,21,207.0,202.333333,179.045618,607,-0.04785,-1.5,3.0,62.438221
2,A,c,21,908,379,21,207.0,202.333333,179.045618,607,-0.04785,-1.5,1.0,3.459638
3,B,c,625,254,625,42,319.0,328.666667,291.620187,986,0.06083,-1.5,3.0,63.387424
4,B,c,319,279,625,42,319.0,328.666667,291.620187,986,0.06083,-1.5,2.0,32.352941
5,B,a,42,949,625,42,319.0,328.666667,291.620187,986,0.06083,-1.5,1.0,4.259635
6,C,a,607,433,607,39,104.0,250.0,310.874573,750,0.672488,-1.5,3.0,80.933333
7,C,c,39,230,607,39,104.0,250.0,310.874573,750,0.672488,-1.5,1.0,5.2
8,C,a,104,751,607,39,104.0,250.0,310.874573,750,0.672488,-1.5,2.0,13.866667
9,D,c,774,949,929,222,774.0,641.666667,371.61315,1925,-0.571246,-1.5,2.0,40.207792


## 离散特征×离散特征

### 分组内的种类数

In [6]:
# Number of distinct discrete_var2 values inside each discrete_var1 group.
# The table is only displayed; df is not modified.
df.groupby('discrete_var1').agg(
    {'discrete_var2': ['nunique']}
)

Unnamed: 0_level_0,discrete_var2
Unnamed: 0_level_1,nunique
discrete_var1,Unnamed: 1_level_2
A,2
B,3
C,1
D,2


In [7]:
# Attach to every row the number of distinct discrete_var2 values found in
# its discrete_var1 group, as 'discrete_var2-nunique'.
#
# transform('nunique') returns the group count repeated on each member row,
# aligned to df's index, so no intermediate table or merge is needed.
# assign() overwrites the column if it already exists, so re-running this
# cell does not create '_x'/'_y' duplicates the way repeating an
# aggregate-then-merge step would.
df = df.assign(**{
    'discrete_var2-nunique': df.groupby('discrete_var1')['discrete_var2'].transform('nunique')
})

In [8]:
# Inspect df after the distinct-count column was added.
df

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,discrete_var2-nunique
0,A,a,705,916,2
1,A,b,174,709,2
2,A,a,537,850,2
3,B,b,486,115,3
4,B,c,600,847,3
5,B,a,845,99,3
6,C,a,551,976,1
7,C,a,849,431,1
8,C,a,72,984,1
9,D,b,87,755,2


### 离散特征拼接

In [15]:
# Cross feature: join the two discrete columns into a single label of the
# form '<discrete_var1>-<discrete_var2>'. Both sides are cast to str first.
left_label = df['discrete_var1'].astype(str)
right_label = df['discrete_var2'].astype(str)
df['discrete_var1-var2'] = left_label + '-' + right_label

In [16]:
# Inspect df after the concatenated discrete feature was added.
df

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,discrete_var2-nunique,discrete_var1__discrete_var2,discrete_var1-var2
0,A,a,705,916,2,A-a,A-a
1,A,b,174,709,2,A-b,A-b
2,A,a,537,850,2,A-a,A-a
3,B,b,486,115,3,B-b,B-b
4,B,c,600,847,3,B-c,B-c
5,B,a,845,99,3,B-a,B-a
6,C,a,551,976,1,C-a,C-a
7,C,a,849,431,1,C-a,C-a
8,C,a,72,984,1,C-a,C-a
9,D,b,87,755,2,D-b,D-b


## 连续特征×连续特征

### 双目运算

In [20]:
# Pairwise arithmetic features between the two continuous columns: sum,
# difference, product, true-division quotient and remainder, stored as
# 'continuous_var1_<op>_continuous_var2'.
# NOTE(review): continuous_var2 is drawn from randint(0, 1000) and may be 0;
# the div and mod columns then hold non-finite values - confirm how
# downstream code is expected to handle that.
lhs = df['continuous_var1']
rhs = df['continuous_var2']
for op_name in ('add', 'sub', 'mul', 'div', 'mod'):
    df[f'continuous_var1_{op_name}_continuous_var2'] = getattr(lhs, op_name)(rhs)

In [21]:
# Inspect df after the pairwise arithmetic features were added.
df

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,discrete_var2-nunique,discrete_var1__discrete_var2,discrete_var1-var2,continuous_var1_add_continuous_var2,continuous_var1_sub_continuous_var2,continuous_var1_mul_continuous_var2,continuous_var1_div_continuous_var2,continuous_var1_mod_continuous_var2
0,A,a,705,916,2,A-a,A-a,1621,-211,645780,0.769651,705
1,A,b,174,709,2,A-b,A-b,883,-535,123366,0.245416,174
2,A,a,537,850,2,A-a,A-a,1387,-313,456450,0.631765,537
3,B,b,486,115,3,B-b,B-b,601,371,55890,4.226087,26
4,B,c,600,847,3,B-c,B-c,1447,-247,508200,0.708383,600
5,B,a,845,99,3,B-a,B-a,944,746,83655,8.535354,53
6,C,a,551,976,1,C-a,C-a,1527,-425,537776,0.564549,551
7,C,a,849,431,1,C-a,C-a,1280,418,365919,1.969838,418
8,C,a,72,984,1,C-a,C-a,1056,-912,70848,0.073171,72
9,D,b,87,755,2,D-b,D-b,842,-668,65685,0.115232,87


### 多列特征计算统计值

In [29]:
# Row-wise statistics across the two continuous columns (axis=1): max, min,
# median, mean, standard deviation and sum, stored as
# 'continuous_var1_var2_<stat>'.
# With only two input columns the median equals the mean on every row, so
# those two features carry the same information.
value_pair = df[['continuous_var1', 'continuous_var2']]
for stat_name in ('max', 'min', 'median', 'mean', 'std', 'sum'):
    df[f'continuous_var1_var2_{stat_name}'] = getattr(value_pair, stat_name)(axis=1)

In [30]:
# Inspect df after the row-wise statistics were added.
df

Unnamed: 0,discrete_var1,discrete_var2,continuous_var1,continuous_var2,discrete_var2-nunique,discrete_var1__discrete_var2,discrete_var1-var2,continuous_var1_add_continuous_var2,continuous_var1_sub_continuous_var2,continuous_var1_mul_continuous_var2,continuous_var1_div_continuous_var2,continuous_var1_mod_continuous_var2,continuous_var1_var2_mean,continuous_var1_var2_max,continuous_var1_var2_min,continuous_var1_var2_median,continuous_var1_var2_std,continuous_var1_var2_sum
0,A,a,705,916,2,A-a,A-a,1621,-211,645780,0.769651,705,810.5,916,705,810.5,149.199531,1621
1,A,b,174,709,2,A-b,A-b,883,-535,123366,0.245416,174,441.5,709,174,441.5,378.302128,883
2,A,a,537,850,2,A-a,A-a,1387,-313,456450,0.631765,537,693.5,850,537,693.5,221.324423,1387
3,B,b,486,115,3,B-b,B-b,601,371,55890,4.226087,26,300.5,486,115,300.5,262.336616,601
4,B,c,600,847,3,B-c,B-c,1447,-247,508200,0.708383,600,723.5,847,600,723.5,174.655375,1447
5,B,a,845,99,3,B-a,B-a,944,746,83655,8.535354,53,472.0,845,99,472.0,527.501659,944
6,C,a,551,976,1,C-a,C-a,1527,-425,537776,0.564549,551,763.5,976,551,763.5,300.520382,1527
7,C,a,849,431,1,C-a,C-a,1280,418,365919,1.969838,418,640.0,849,431,640.0,295.570635,1280
8,C,a,72,984,1,C-a,C-a,1056,-912,70848,0.073171,72,528.0,984,72,528.0,644.881384,1056
9,D,b,87,755,2,D-b,D-b,842,-668,65685,0.115232,87,421.0,755,87,421.0,472.34733,842
