In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import KBinsDiscretizer

In [2]:
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
data

[[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

In [3]:
pd.DataFrame(data)

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


# 归一化

In [4]:
scaler = MinMaxScaler() # 实例化
scaler = scaler.fit(data) # fit，计算 min(x) 和 max(x)
result = scaler.transform(data) # 导出结果
result

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [5]:
result_ = scaler.fit_transform(data) # 训练和导出结果一步完成
result_

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [6]:
scaler.inverse_transform(result)

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [7]:
# feature_range 参数
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler(feature_range=[5, 10])

In [8]:
result = scaler.fit_transform(data)
result

array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

In [9]:
# numpy 实现归一化
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
X = np.array(data)

# 归一化
X_nor = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_nor

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [10]:
# 逆归一化
X_returned = X_nor * (X.max(axis=0) - X.min(axis=0)) + X.min(axis=0)
X_returned

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

# 标准化

In [11]:
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
print(data)

scaler = StandardScaler()
scaler.fit(data) # 求均值和方差

print(scaler.mean_)
print(scaler.var_)

x_std = scaler.transform(data) # 导出结果
print(x_std)

print(x_std.mean())
print(x_std.std())

[[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
[-0.125  9.   ]
[ 0.546875 35.      ]
[[-1.18321596 -1.18321596]
 [-0.50709255 -0.50709255]
 [ 0.16903085  0.16903085]
 [ 1.52127766  1.52127766]]
0.0
1.0


In [12]:
print(scaler.fit_transform(data))
print(scaler.inverse_transform(x_std))

[[-1.18321596 -1.18321596]
 [-0.50709255 -0.50709255]
 [ 0.16903085  0.16903085]
 [ 1.52127766  1.52127766]]
[[-1.   2. ]
 [-0.5  6. ]
 [ 0.  10. ]
 [ 1.  18. ]]


# 缺失值

In [13]:
data = pd.read_csv("data/Narrativedata.csv", index_col=0)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [15]:
#填补年龄
Age = data.loc[:,"Age"].values.reshape(-1,1)
print(Age[:20].T)

#sklearn当中特征矩阵必须是二维
imp_mean = SimpleImputer() #实例化,默认均值填补
imp_median = SimpleImputer(strategy="median") #用中位数填补
imp_0 = SimpleImputer(strategy="constant",fill_value=0) #用0填补
imp_mf = SimpleImputer(strategy="most_frequent",fill_value=0) #用众数填补

#fit_transform一步完成调取结果
imp_mean = imp_mean.fit_transform(Age)
imp_median = imp_median.fit_transform(Age)
imp_0 = imp_0.fit_transform(Age)
imp_mf = imp_mf.fit_transform(Age)

#在这里我们使用众数填补Age
data.loc[:,"Age"] = imp_mf
print("----------------------")
data.info()

#使用众数填补Embarked
Embarked = data.loc[:,"Embarked"].values.reshape(-1,1)
imp_mode = SimpleImputer(strategy = "most_frequent")
data.loc[:,"Embarked"] = imp_mode.fit_transform(Embarked)
print("----------------------")
data.info()

[[22. 38. 26. 35. 35. nan 54.  2. 27. 14.  4. 58. 20. 39. 14. 55.  2. nan
  31. nan]]
----------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
----------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  891 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


# 处理分类型特征
# 1. 标签：文字型转数值型

In [16]:
data

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No
...,...,...,...,...
886,27.0,male,S,No
887,19.0,female,S,Yes
888,24.0,female,S,No
889,26.0,male,C,Unknown


In [17]:
# 选取最后一列
y = data.iloc[:, -1]
y

0           No
1          Yes
2          Yes
3          Yes
4           No
        ...   
886         No
887        Yes
888         No
889    Unknown
890         No
Name: Survived, Length: 891, dtype: object

In [18]:
le = LabelEncoder()
le.fit(y)
label = le.transform(y)

In [19]:
le.classes_

array(['No', 'Unknown', 'Yes'], dtype=object)

In [20]:
# 获取的结果 label
data.iloc[:, -1] = label
data

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0
...,...,...,...,...
886,27.0,male,S,0
887,19.0,female,S,2
888,24.0,female,S,0
889,26.0,male,C,1


# 特征，分类特征转分类数值

In [21]:
data_ = data.copy()
data_

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0
...,...,...,...,...
886,27.0,male,S,0
887,19.0,female,S,2
888,24.0,female,S,0
889,26.0,male,C,1


In [22]:
OrdinalEncoder().fit(data_.iloc[:, 0:-1]).categories_

[array([ 0.42,  0.67,  0.75,  0.83,  0.92,  1.  ,  2.  ,  3.  ,  4.  ,
         5.  ,  6.  ,  7.  ,  8.  ,  9.  , 10.  , 11.  , 12.  , 13.  ,
        14.  , 14.5 , 15.  , 16.  , 17.  , 18.  , 19.  , 20.  , 20.5 ,
        21.  , 22.  , 23.  , 23.5 , 24.  , 24.5 , 25.  , 26.  , 27.  ,
        28.  , 28.5 , 29.  , 30.  , 30.5 , 31.  , 32.  , 32.5 , 33.  ,
        34.  , 34.5 , 35.  , 36.  , 36.5 , 37.  , 38.  , 39.  , 40.  ,
        40.5 , 41.  , 42.  , 43.  , 44.  , 45.  , 45.5 , 46.  , 47.  ,
        48.  , 49.  , 50.  , 51.  , 52.  , 53.  , 54.  , 55.  , 55.5 ,
        56.  , 57.  , 58.  , 59.  , 60.  , 61.  , 62.  , 63.  , 64.  ,
        65.  , 66.  , 70.  , 70.5 , 71.  , 74.  , 80.  ]),
 array(['female', 'male'], dtype=object),
 array(['C', 'Q', 'S'], dtype=object)]

In [23]:
data_.iloc[:, :-1] = OrdinalEncoder().fit_transform(data_.iloc[:, 0:-1])
data_

Unnamed: 0,Age,Sex,Embarked,Survived
0,28.0,1.0,2.0,0
1,51.0,0.0,0.0,2
2,34.0,0.0,2.0,2
3,47.0,0.0,2.0,2
4,47.0,1.0,2.0,0
...,...,...,...,...
886,35.0,1.0,2.0,0
887,24.0,0.0,2.0,2
888,31.0,0.0,2.0,0
889,34.0,1.0,0.0,1


# 独热编码

In [24]:
X = data.iloc[:, 1:-1]
X

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
886,male,S
887,female,S
888,female,S
889,male,C


In [25]:
enc = OneHotEncoder(categories="auto").fit(X)
result = enc.transform(X)

In [26]:
# 可以还原
pd.DataFrame(enc.inverse_transform(result))

Unnamed: 0,0,1
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
886,male,S
887,female,S
888,female,S
889,male,C


In [27]:
type(result)

scipy.sparse.csr.csr_matrix

In [28]:
# 返回每一个稀疏矩阵中的列的名字
enc.get_feature_names()

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [29]:
result.shape

(891, 5)

In [30]:
new_data = pd.concat([data, pd.DataFrame(result)], axis=1)
new_data = new_data.drop(["Sex", "Embarked"], axis=1)
print(new_data.shape)
cols = list(new_data)
cols.insert(1, cols.pop(2))
new_data = new_data.loc[:,cols]
new_data

(891, 3)


Unnamed: 0,Age,0,Survived
0,22.0,"(0, 1)\t1.0\n (0, 4)\t1.0",0
1,38.0,"(0, 0)\t1.0\n (0, 2)\t1.0",2
2,26.0,"(0, 0)\t1.0\n (0, 4)\t1.0",2
3,35.0,"(0, 0)\t1.0\n (0, 4)\t1.0",2
4,35.0,"(0, 1)\t1.0\n (0, 4)\t1.0",0
...,...,...,...
886,27.0,"(0, 1)\t1.0\n (0, 4)\t1.0",0
887,19.0,"(0, 0)\t1.0\n (0, 4)\t1.0",2
888,24.0,"(0, 0)\t1.0\n (0, 4)\t1.0",0
889,26.0,"(0, 1)\t1.0\n (0, 2)\t1.0",1


# 处理连续性特征
# 二值化

In [31]:
data_2 = data.copy()
data_2

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0
...,...,...,...,...
886,27.0,male,S,0
887,19.0,female,S,2
888,24.0,female,S,0
889,26.0,male,C,1


In [32]:
X = data_2.iloc[:, 0].values.reshape(-1, 1)
X.shape

(891, 1)

In [33]:
transformer = Binarizer(threshold=30).fit_transform(X)
print(transformer[:20].T)

[[0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0.]]


# 分箱

In [34]:
X = data.iloc[:, 0].values.reshape(-1, 1)

In [35]:
est1 = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="uniform")
est1.fit_transform(X)
set(est1.fit_transform(X).ravel())

{0.0, 1.0, 2.0}

In [36]:
est2 = KBinsDiscretizer(n_bins=3, encode="onehot", strategy="uniform")
est2.fit_transform(X)

<891x3 sparse matrix of type '<class 'numpy.float64'>'
	with 891 stored elements in Compressed Sparse Row format>

In [37]:
est3 = KBinsDiscretizer(n_bins=3, encode="onehot-dense", strategy="uniform")
est3.fit_transform(X)

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])