# 数据预处理

In [1]:
import numpy as np

In [2]:
def f(df):
    # 求均值
    mean = np.mean(df)
    # 求标准差
    std = np.std(df, ddof=1)
    result = (df - mean) / std
    return result

In [3]:
raw_sample = np.array([[3.0, -100.0, 2.0],
                       [0.0, 400.0, 3.0],
                       [1.0, -400.0, 2.0]])

In [4]:
std_sample = raw_sample.copy()
std_samples = raw_sample.copy()

In [5]:
for col in std_sample.T:
    print(col)
    col_mean = col.mean()
    print(col_mean)
    col_std = col.std()
    print(col_std)
    col -= col_mean  #这一步数据平均值变成0
    col /= col_std
    print(col)


[3. 0. 1.]
1.3333333333333333
1.247219128924647
[ 1.33630621 -1.06904497 -0.26726124]
[-100.  400. -400.]
-33.333333333333336
329.98316455372213
[-0.20203051  1.31319831 -1.1111678 ]
[2. 3. 2.]
2.3333333333333335
0.4714045207910317
[-0.70710678  1.41421356 -0.70710678]


In [6]:
print(f([8, 2, 5]))

[ 1. -1.  0.]


In [7]:
for col in std_sample.T:
    col_mean = col.mean()
    col_std = col.std()
    col -= col_mean  #这一步数据平均值变成0
    col /= col_std

print(std_sample)

[[ 1.33630621 -0.20203051 -0.70710678]
 [-1.06904497  1.31319831  1.41421356]
 [-0.26726124 -1.1111678  -0.70710678]]


In [8]:
for col in std_samples:
    col_mean = col.mean()
    col_std = col.std()
    col -= col_mean  #这一步数据平均值变成0
    col /= col_std

print(std_samples)

[[ 0.71742908 -1.41416309  0.69673401]
 [-0.71507648  1.41418351 -0.69910703]
 [ 0.7044634  -1.41421027  0.70974687]]


In [9]:
for col in std_sample.T:
    print(col)

[ 1.33630621 -1.06904497 -0.26726124]
[-0.20203051  1.31319831 -1.1111678 ]
[-0.70710678  1.41421356 -0.70710678]


In [10]:
for col in std_sample:
    print(col)

[ 1.33630621 -0.20203051 -0.70710678]
[-1.06904497  1.31319831  1.41421356]
[-0.26726124 -1.1111678  -0.70710678]


In [11]:
std_sample

array([[ 1.33630621, -0.20203051, -0.70710678],
       [-1.06904497,  1.31319831,  1.41421356],
       [-0.26726124, -1.1111678 , -0.70710678]])

In [12]:
import sklearn.preprocessing as sp

In [13]:
std_samples = sp.scale(raw_sample)  # 求标准移除
#这种方法是将原数据进行标准移除
#具体方法就是原始数据每列数据转化为均值为0标准差为1的数据
print(std_sample)
print(std_sample.mean(axis=0))
print(std_sample.std(axis=0))

[[ 1.33630621 -0.20203051 -0.70710678]
 [-1.06904497  1.31319831  1.41421356]
 [-0.26726124 -1.1111678  -0.70710678]]
[3.70074342e-17 0.00000000e+00 0.00000000e+00]
[1. 1. 1.]


In [14]:
#范围缩放
raw_sample = np.array([[1.0, 2.0, 3.0],
                       [4.0, 5.0, 6.0],
                       [7.0, 8.0, 9.0]])

In [15]:
mms_samples = raw_sample.copy()

In [16]:
for col in mms_samples.T:
    col_min = col.min()
    col_max = col.max()
    col -= col_min
    col /= (col_max - col_min)
print(mms_samples)

[[0.  0.  0. ]
 [0.5 0.5 0.5]
 [1.  1.  1. ]]


In [17]:
# 根据给定范围创建一个范围缩放器对象
mms = sp.MinMaxScaler(feature_range=(0, 1))  # 定义对象(修改范围观察现象)
# 使用范围缩放器实现特征值范围缩放
mms_samples = mms.fit_transform(raw_sample)  # 缩放
#这里可以先fit，然后再transform分开写也行
print(mms_samples)

[[0.  0.  0. ]
 [0.5 0.5 0.5]
 [1.  1.  1. ]]


In [18]:
# 样本数据
raw_samples = np.array([
    [10.0, 20.0, 5.0],
    [8.0, 10.0, 1.0]
])
print(raw_samples)
nor_samples = raw_samples.copy()  # 复制样本数据

for row in nor_samples:
    row /= abs(row).sum()  # 先对行求绝对值，再求和，再除以绝对值之和

print(nor_samples)  # 打印结果

[[10. 20.  5.]
 [ 8. 10.  1.]]
[[0.28571429 0.57142857 0.14285714]
 [0.42105263 0.52631579 0.05263158]]


In [19]:
nor_samples = sp.normalize(raw_samples, norm='l1')
print(nor_samples)  # 打印结果

[[0.28571429 0.57142857 0.14285714]
 [0.42105263 0.52631579 0.05263158]]


In [20]:
raw_samples = np.array([[65.5, 89.0, 73.0],
                        [55.0, 99.0, 98.5],
                        [45.0, 22.5, 60.0]])
bin_samples = raw_samples.copy()  # 复制数组

bin_samples = raw_samples.copy()  # 复制数组
# 生成掩码数组
mask1 = bin_samples < 60  #一定要先判断小于60的
mask2 = bin_samples >= 60

bin_samples[mask1] = 0
bin_samples[mask2] = 1
print(bin_samples)  # 打印结果

[[1. 1. 1.]
 [0. 1. 1.]
 [0. 0. 1.]]


In [21]:
res = np.where(bin_samples > 60, 1.0, 0.0)  #这有隐含if else
res

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [22]:
bin = sp.Binarizer(threshold=59)  # 创建二值化对象(注意边界值)
bin_samples = bin.transform(raw_samples)  # 二值化预处理
print(bin_samples)

[[1. 1. 1.]
 [0. 1. 1.]
 [0. 0. 1.]]


In [31]:
raw_samples = np.array([[1, 3, 2],
                        [7, 5, 4],
                        [1, 8, 6],
                        [7, 3, 9]])

one_hot_encoder = sp.OneHotEncoder(
    sparse=False, # 是否采用稀疏格式
    dtype="int32",
    categories="auto")# 自动编码
oh_samples = one_hot_encoder.fit_transform(raw_samples) # 执行独热编码
print(oh_samples)

[[1 0 1 0 0 1 0 0 0]
 [0 1 0 1 0 0 1 0 0]
 [1 0 0 0 1 0 0 1 0]
 [0 1 1 0 0 0 0 0 1]]


In [32]:
oh_samples

array([[1, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 1]], dtype=int32)

In [33]:
print(one_hot_encoder.inverse_transform(oh_samples)) # 解码 一定要使用原编码器解码

[[1 3 2]
 [7 5 4]
 [1 8 6]
 [7 3 9]]


In [34]:
raw_samples = np.array(['audi', 'ford', 'audi',
                        'bmw','ford', 'bmw'])

lb_encoder = sp.LabelEncoder() # 定义标签编码对象
lb_samples = lb_encoder.fit_transform(raw_samples) # 执行标签编码
print(lb_samples)

print(lb_encoder.inverse_transform(lb_samples)) # 逆向转换 # 解码 一定要使用原编码器解码

[0 2 0 1 2 1]
['audi' 'ford' 'audi' 'bmw' 'ford' 'bmw']
