# 实例归一化

In [3]:
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Toy dataset: one inner list per sample, two feature values per sample.
data = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]

In [5]:
# Min-max normalization in two explicit steps.
# Construct the scaler and fit it; fit() records each column's min and max
# and returns the scaler itself, so the two calls can be chained.
scaler = MinMaxScaler().fit(data)
# Apply the learned scaling to obtain the normalized array.
result = scaler.transform(data)

In [6]:
result

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [7]:
# Reverse the normalization: map the scaled values back to the original scale.
scaler.inverse_transform(result)

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [8]:
# Fit and export the transformed result in a single call.
scaler.fit_transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [9]:
# Use MinMaxScaler's feature_range parameter to normalize the data into a
# range other than [0, 1].
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

# feature_range is documented as a (min, max) tuple. Recent scikit-learn
# releases validate parameter types and reject a list here, so pass a tuple;
# older releases accept the tuple as well.
scaler = MinMaxScaler(feature_range = (5, 10))
result = scaler.fit_transform(data)

result

array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

# 使用numpy实现归一化

In [10]:
import numpy as np
# The same toy data as a NumPy array (mixing ints and floats yields a float array).
X = np.array(
    [
        [-1, 2],
        [-0.5, 6],
        [0, 10],
        [1, 18],
    ]
)
X

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [11]:
# Min-max normalization by hand: subtract the per-column minimum, then
# divide by the per-column range (max - min).
col_min = X.min(axis = 0)
col_max = X.max(axis = 0)
X_nor = (X - col_min) / (col_max - col_min)
X_nor

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [12]:
# Reverse the normalization: multiply by the per-column range and add the
# per-column minimum back.
X_returned = X_nor*(X.max(axis = 0) - X.min(axis = 0)) + X.min(axis = 0)
X_returned

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

# 数据标准化

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# Toy dataset for the standardization demo: four rows, two values per row.
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

In [15]:
scaler = StandardScaler()          # instantiate
scaler = scaler.fit(data)          # fit: in essence this computes the mean and variance

In [16]:
scaler.mean_

array([-0.125,  9.   ])

In [17]:
scaler.var_

array([ 0.546875, 35.      ])

In [18]:
# Apply the fitted scaler to obtain the standardized data.
x_std = scaler.transform(data)
x_std

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [19]:
# Mean over every entry of the array: no axis is given, so both columns are pooled.
x_std.mean()

0.0

In [20]:
# Variance over every entry of the array: no axis is given, so both columns are pooled.
x_std.var()

1.0

In [21]:
# Map the standardized values back to the original scale.
scaler.inverse_transform(x_std)

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

# 缺失值

In [22]:
import pandas as pd

In [23]:
# Load the data.
# index_col = n selects which column of the file is used as the row index.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run on another machine as-is; consider a path relative to
# a configurable data directory.
data = pd.read_csv(r"C:\MyCode\Sklearn\03_DataPreprocessing\Narrativedata.csv",index_col = 0)

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [25]:
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [26]:
# Impute Age: pull the column out and reshape it to a single-column 2-D array.
Age = data.loc[:,"Age"].values.reshape(-1, 1)
# Peek at the first 20 rows.
Age[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [nan],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [nan],
       [31.],
       [nan]])

In [27]:
# SimpleImputer
from sklearn.impute import SimpleImputer

In [28]:
# Instantiate; the default strategy fills missing values with the mean.
imp_mean = SimpleImputer()

# Instantiate; fill missing values with the median.
imp_median = SimpleImputer(strategy = "median")

# Instantiate; fill missing values with the constant 0.
imp_zero = SimpleImputer(strategy = "constant", fill_value = 0)

In [29]:
# Fit and transform in one step.
# Fresh imputers are constructed here instead of rebinding the imputer
# objects to their own output: with the rebinding form, running this cell a
# second time fails because the names then hold ndarrays, which have no
# fit_transform. The names still end up holding the imputed arrays, which is
# what the cells that follow read.
imp_mean = SimpleImputer().fit_transform(Age)

imp_median = SimpleImputer(strategy = "median").fit_transform(Age)

imp_zero = SimpleImputer(strategy = "constant", fill_value = 0).fit_transform(Age)

In [30]:
imp_mean[:20]

array([[22.        ],
       [38.        ],
       [26.        ],
       [35.        ],
       [35.        ],
       [29.69911765],
       [54.        ],
       [ 2.        ],
       [27.        ],
       [14.        ],
       [ 4.        ],
       [58.        ],
       [20.        ],
       [39.        ],
       [14.        ],
       [55.        ],
       [ 2.        ],
       [29.69911765],
       [31.        ],
       [29.69911765]])

In [31]:
imp_median[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [28.],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [28.],
       [31.],
       [28.]])

In [32]:
imp_zero[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [ 0.],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [ 0.],
       [31.],
       [ 0.]])

In [33]:
# Use the median-imputed values to fill the Age column.
data.loc[:,"Age"] = imp_median

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [35]:
# Fill Embarked with its most frequent value (the mode).
# The column is reshaped to a single-column 2-D array before being imputed.
Embarked = data.loc[:,"Embarked"].values.reshape(-1, 1)
imp_mode = SimpleImputer(strategy = "most_frequent")
data.loc[:,"Embarked"] = imp_mode.fit_transform(Embarked)

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  891 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


# 编码与哑变量

## 1.preprocessing.LabelEncoder：标签专用，能够将分类转换为分类数值

In [39]:
data.head(10)

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No
5,28.0,male,Q,No
6,54.0,male,S,No
7,2.0,male,S,No
8,27.0,female,S,Yes
9,14.0,female,C,Yes


In [41]:
# 导入模块
from sklearn.preprocessing import LabelEncoder

# Pull out the labels (last column). The encoder takes labels rather than a
# feature matrix, so one-dimensional input is allowed.
y = data.iloc[:, -1]

# Instantiate the encoder and fit it on the labels in one expression;
# fit() returns the encoder itself.
le = LabelEncoder().fit(y)

# Obtain the encoded labels through the transform interface.
label = le.transform(y)


In [42]:
# Inspect the encoded result, label.
label

array([0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 0, 1, 0, 2, 0, 2, 1, 2,
       2, 2, 0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 1,
       2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2,
       2, 0, 2, 0, 0, 0, 0, 0, 2, 1, 0, 1, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 1, 0,
       0, 2, 0, 0, 2, 0, 0, 0, 1, 1, 2, 0, 0, 0, 2, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 0, 0,
       1, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 2, 1, 2, 1, 0, 0, 2, 2, 0, 2, 0,
       2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 1, 0, 2, 2, 2, 2, 2, 0, 2, 0, 1,
       0, 0, 1, 2, 2, 1, 0, 2, 2, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0, 2, 0, 0,
       2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2,

In [43]:
# See which classes the labels contain.
le.classes_

array(['No', 'Unknown', 'Yes'], dtype=object)

In [44]:
# Replace the original labels (last column) with the generated numeric labels.
data.iloc[:, -1] = label

In [46]:
# Inspect the data after the labels have been replaced.
data.head(10)

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0
5,28.0,male,Q,0
6,54.0,male,S,0
7,2.0,male,S,0
8,27.0,female,S,2
9,14.0,female,C,2


In [47]:
# 简略写法
# from sklearn.preprocessing import LabelEncoder
# data.iloc[:, -1] = LabelEncoder().fit_transform(data.iloc[:, -1])

## 2.preprocessing.OrdinalEncoder：特征专用，能够将分类特征转换为分类数值

In [48]:
# Copy the data and inspect the copy.
data_ = data.copy()
data_

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0
...,...,...,...,...
886,27.0,male,S,0
887,19.0,female,S,2
888,28.0,female,S,0
889,26.0,male,C,1


In [49]:
# 导入模块
from sklearn.preprocessing import OrdinalEncoder

In [50]:
# See which categories each feature contains (columns between the first and the last).
OrdinalEncoder().fit(data_.iloc[:, 1:-1]).categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [51]:
# Replace the original categorical features with the generated numeric codes.
data_.iloc[:, 1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:, 1:-1])

In [52]:
# Inspect the data after the replacement.
data_

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,1.0,2.0,0
1,38.0,0.0,0.0,2
2,26.0,0.0,2.0,2
3,35.0,0.0,2.0,2
4,35.0,1.0,2.0,0
...,...,...,...,...
886,27.0,1.0,2.0,0
887,19.0,0.0,2.0,2
888,28.0,0.0,2.0,0
889,26.0,1.0,0.0,1


## 3.preprocessing.OneHotEncoder：独热编码，创建哑变量（用于名义变量）

In [53]:
# Inspect the data.
data

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0
...,...,...,...,...
886,27.0,male,S,0
887,19.0,female,S,2
888,28.0,female,S,0
889,26.0,male,C,1


In [54]:
# 导入模块
from sklearn.preprocessing import OneHotEncoder

In [56]:
# Take out the feature columns to encode: every column between the first and the last.
X = data.iloc[:, 1:-1]


In [62]:
X

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
886,male,S
887,female,S
888,female,S
889,male,C


In [58]:
# Instantiate the encoder, then fit it on the features.
enc = OneHotEncoder(categories = "auto")
enc = enc.fit(X)

# Transform the features and convert the output to an array.
encoded = enc.transform(X)
result = encoded.toarray()

In [59]:
# Inspect the encoded array.
result

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [60]:
# Look up the names of the generated one-hot features.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new method when the installed
# version has it and fall back to the old one otherwise.
enc.get_feature_names_out() if hasattr(enc, "get_feature_names_out") else enc.get_feature_names()

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [61]:
# Restore the original categories from the one-hot array.
pd.DataFrame(enc.inverse_transform(result))

Unnamed: 0,0,1
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
886,male,S
887,female,S
888,female,S
889,male,C


In [63]:
# Merge the original data with the newly generated one-hot columns.
# The one-hot frame is given data's index explicitly: concat(axis = 1) aligns
# rows by index label, so a default 0..n-1 index would only line up when
# data's own index happens to be 0..n-1 and would otherwise yield NaN-padded,
# misaligned rows.
newdata = pd.concat([data, pd.DataFrame(result, index = data.index)], axis = 1)

In [64]:
# Inspect the merged data.
newdata

Unnamed: 0,Age,Sex,Embarked,Survived,0,1,2,3,4
0,22.0,male,S,0,0.0,1.0,0.0,0.0,1.0
1,38.0,female,C,2,1.0,0.0,1.0,0.0,0.0
2,26.0,female,S,2,1.0,0.0,0.0,0.0,1.0
3,35.0,female,S,2,1.0,0.0,0.0,0.0,1.0
4,35.0,male,S,0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,27.0,male,S,0,0.0,1.0,0.0,0.0,1.0
887,19.0,female,S,2,1.0,0.0,0.0,0.0,1.0
888,28.0,female,S,0,1.0,0.0,0.0,0.0,1.0
889,26.0,male,C,1,0.0,1.0,1.0,0.0,0.0


In [65]:
# Drop the non-numeric columns that have been replaced by the one-hot columns.
# NOTE(review): inplace = True mutates newdata, so re-running this cell alone
# raises a KeyError because the columns are already gone.
newdata.drop(["Sex", "Embarked"], axis = 1, inplace = True)

In [66]:
# Name the newly generated numeric columns.
# The five one-hot names are written by hand and must follow the encoder's
# column order (female, male, C, Q, S).
newdata.columns = ["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"]

In [67]:
# Inspect the newly generated data.
newdata

Unnamed: 0,Age,Survived,Female,Male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,0,0.0,1.0,0.0,0.0,1.0
1,38.0,2,1.0,0.0,1.0,0.0,0.0
2,26.0,2,1.0,0.0,0.0,0.0,1.0
3,35.0,2,1.0,0.0,0.0,0.0,1.0
4,35.0,0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
886,27.0,0,0.0,1.0,0.0,0.0,1.0
887,19.0,2,1.0,0.0,0.0,0.0,1.0
888,28.0,0,1.0,0.0,0.0,0.0,1.0
889,26.0,1,0.0,1.0,1.0,0.0,0.0


## 4.二值化与分段

In [68]:
# Binarize age.
# Work on a copy of the data.
data_2 = data.copy()

In [69]:
data_2

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0
...,...,...,...,...
886,27.0,male,S,0
887,19.0,female,S,2
888,28.0,female,S,0
889,26.0,male,C,1


In [70]:
# 导入模块
from sklearn.preprocessing import Binarizer

In [71]:
# Take out the age column (the class is meant for features, so a 1-D array cannot be used).
X = data_2.iloc[:, 0].values.reshape(-1, 1)

In [72]:
# Split into two classes using 30 as the cut point.
# Note: despite its name, `transformer` holds the transformed array, not the Binarizer object.
transformer = Binarizer(threshold = 30).fit_transform(X)

In [75]:
# Inspect the binarized data.
transformer

array([[0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],

In [76]:
# 分箱数据
# 导入模块
from sklearn.preprocessing import KBinsDiscretizer

In [77]:
# Take out the age column (it must not be one-dimensional).
X = data.iloc[:, 0].values.reshape(-1, 1)

In [78]:
# Instantiate the binning (discretizer) module.
# "ordinal": every bin of every feature is encoded as an integer; the result is a matrix with one column
#   per feature, each holding the integer codes of that feature's bins.
# "uniform": equal-width binning, i.e. within each feature the maximum values of adjacent bins differ by
#   (feature.max() - feature.min()) / n_bins.
est = KBinsDiscretizer(n_bins = 3, encode = 'ordinal', strategy = 'uniform')

# Fit on the data and export the transformed result.
est.fit_transform(X)

array([[0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [0.],
       [2.],
       [0.],
       [1.],
       [0.],
       [2.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [2.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [2.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],

In [79]:
# Inspect the bins after the transform: a single column holding three bins.
set(est.fit_transform(X).ravel())           # flatten to 1-D to view the distinct values

{0.0, 1.0, 2.0}

In [80]:
# Instantiate the binning (discretizer) module.
# "onehot": build dummy variables and return a sparse matrix; each column is one category (bin) of a
#   feature, with 1 for samples that fall in that category and 0 for those that do not.
est = KBinsDiscretizer(n_bins = 3, encode = 'onehot', strategy= 'uniform')

In [81]:
# Inspect the bins after the transform: they have become dummy variables.
est.fit_transform(X).toarray()

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])