### 1. 当数值型特征的取值在某区间内时取值为1，否则为0

In [1]:
import numpy as np
import pandas as pd 


rdg = np.random.RandomState(2017)
age = rdg.randint(1, 78, 20)
fare = rdg.uniform(10, 100, 20)
df = pd.DataFrame({'Age':age, 'Fare':fare}).round(2)
df.head()

Unnamed: 0,Age,Fare
0,60,76.21
1,10,71.77
2,71,24.76
3,14,71.5
4,43,43.19


In [2]:
df['Age_above_50'] = df['Age'].map(lambda x: 1 if x>=50 else 0)
df.head()

Unnamed: 0,Age,Fare,Age_above_50
0,60,76.21,1
1,10,71.77,0
2,71,24.76,1
3,14,71.5,0
4,43,43.19,0


### 2. 对数值型特征分段、离散化，并构造哑变量

#### 2.1 运用np.digitize对对年龄分段

In [3]:
# 将年龄分为3个区间
bins = np.linspace(df['Age'].min(), df['Age'].max()+1, 4)
bins

array([ 1.        , 25.33333333, 49.66666667, 74.        ])

In [4]:
age_bins = np.digitize(df['Age'], bins)
age_bins

array([3, 1, 3, 1, 2, 1, 3, 3, 1, 2, 1, 1, 3, 1, 2, 2, 2, 1, 3, 2],
      dtype=int64)

In [5]:
age_dummies = pd.get_dummies(age_bins, prefix='AgeBin')
df = df.join(age_dummies)
df.head()

Unnamed: 0,Age,Fare,Age_above_50,AgeBin_1,AgeBin_2,AgeBin_3
0,60,76.21,1,0,0,1
1,10,71.77,0,1,0,0
2,71,24.76,1,0,0,1
3,14,71.5,0,1,0,0
4,43,43.19,0,0,1,0


#### 2.2 自定义区间对费用分段

In [6]:
def fare_rate_func(x):
    if x <= np.percentile(df['Fare'], 25):
        return 'high'
    elif np.percentile(df['Fare'],25) < x <= np.percentile(df['Fare'], 75):
        return 'middle'
    else:
        return 'low'
    
df['fare_rate'] = df['Fare'].apply(fare_rate_func)
fare_dummies = pd.get_dummies(df['fare_rate'], prefix='Fare')
df.drop(['fare_rate'], axis=1, inplace=True)
df = df.join(fare_dummies)
df.head()

Unnamed: 0,Age,Fare,Age_above_50,AgeBin_1,AgeBin_2,AgeBin_3,Fare_high,Fare_low,Fare_middle
0,60,76.21,1,0,0,1,0,1,0
1,10,71.77,0,1,0,0,0,0,1
2,71,24.76,1,0,0,1,1,0,0
3,14,71.5,0,1,0,0,0,0,1
4,43,43.19,0,0,1,0,1,0,0
