### 最大值最小值归一化

In [1]:
import numpy as np

x_1 = np.random.randint(0,10,size = 10) # 一维
x_2 = np.random.randint(1000,5000,size = 10) # 一维

# c_合并成二维
X = np.c_[x_1,x_2]

X

array([[   7, 3594],
       [   0, 2698],
       [   6, 1491],
       [   9, 4106],
       [   8, 1569],
       [   6, 2302],
       [   0, 1316],
       [   1, 1065],
       [   8, 4892],
       [   6, 4583]], dtype=int32)

In [2]:
X_norm = (X - X.min(axis = 0))/(X.max(axis = 0) - X.min(axis = 0))
X_norm.round(2)

array([[0.78, 0.66],
       [0.  , 0.43],
       [0.67, 0.11],
       [1.  , 0.79],
       [0.89, 0.13],
       [0.67, 0.32],
       [0.  , 0.07],
       [0.11, 0.  ],
       [0.89, 1.  ],
       [0.67, 0.92]])

#### 演示离群点

In [3]:
X[6,1] = 123456789
X

array([[        7,      3594],
       [        0,      2698],
       [        6,      1491],
       [        9,      4106],
       [        8,      1569],
       [        6,      2302],
       [        0, 123456789],
       [        1,      1065],
       [        8,      4892],
       [        6,      4583]], dtype=int32)

In [4]:
X_norm = (X - X.min(axis = 0))/(X.max(axis = 0) - X.min(axis = 0))
X_norm.round(2)

array([[0.78, 0.  ],
       [0.  , 0.  ],
       [0.67, 0.  ],
       [1.  , 0.  ],
       [0.89, 0.  ],
       [0.67, 0.  ],
       [0.  , 1.  ],
       [0.11, 0.  ],
       [0.89, 0.  ],
       [0.67, 0.  ]])

#### sklearn方法调用

In [5]:
from sklearn.preprocessing import MinMaxScaler

In [6]:
np.set_printoptions(suppress=True)

mms = MinMaxScaler()

mms.fit_transform(X).round(2)

array([[0.78, 0.  ],
       [0.  , 0.  ],
       [0.67, 0.  ],
       [1.  , 0.  ],
       [0.89, 0.  ],
       [0.67, 0.  ],
       [0.  , 1.  ],
       [0.11, 0.  ],
       [0.89, 0.  ],
       [0.67, 0.  ]])

### Z-Score归一化（标准化）

#### 根据公式进行计算

In [13]:
import numpy as np

x1 = np.random.randint(0,10,size = (10,1))

x2 = np.random.randint(1000,10000,size = (10,1))

X = np.concatenate([x1,x2],axis = 1)
X

array([[   8, 9245],
       [   9, 6186],
       [   6, 9786],
       [   3, 1383],
       [   5, 5846],
       [   8, 6619],
       [   5, 2208],
       [   5, 8573],
       [   6, 5401],
       [   8, 6216]])

In [16]:
X_norm = (X - X.mean(axis = 0))/X.std(axis = 0)
X_norm # 归一化的结果：正态分布，0,1

array([[ 0.94884747,  1.18993453],
       [ 1.50699304,  0.01524523],
       [-0.16744367,  1.39768441],
       [-1.84188039, -1.82915904],
       [-0.72558924, -0.11531847],
       [ 0.94884747,  0.18152194],
       [-0.72558924, -1.51235006],
       [-0.72558924,  0.93187921],
       [-0.16744367, -0.28620331],
       [ 0.94884747,  0.02676556]])

In [17]:
X_norm.mean(axis = 0)

array([ 0., -0.])

In [18]:
X_norm.std(axis = 0)

array([1., 1.])

#### 使用sklearn库

In [19]:
# preprocessing 预处理，数据
from sklearn.preprocessing import StandardScaler

standard = StandardScaler()

# 第一步，进行训练
standard.fit(X)

# 第二步，转换
X_norm2 = standard.transform(X)

# standard.fit_transform(X) # 一步转换
X_norm2

array([[ 0.94884747,  1.18993453],
       [ 1.50699304,  0.01524523],
       [-0.16744367,  1.39768441],
       [-1.84188039, -1.82915904],
       [-0.72558924, -0.11531847],
       [ 0.94884747,  0.18152194],
       [-0.72558924, -1.51235006],
       [-0.72558924,  0.93187921],
       [-0.16744367, -0.28620331],
       [ 0.94884747,  0.02676556]])

### 归一化实战

#### 不进行归一化

In [26]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

df = pd.read_csv('./zhengqi_train.txt',sep = '\t')
X_train = df.iloc[:,:-1]
y_train = df['target']

X_test = pd.read_csv('./zhengqi_test.txt',sep = '\t')

# display(X_train.head(),y_train.head(),X_test.head())

model = LinearRegression()

model.fit(X_train,y_train)

y_ = model.predict(X_test)

np.savetxt('./result1.txt',y_)
y_[:15]

array([ 0.26825457,  0.2600059 , -0.06905626,  0.09156779,  0.27141095,
        0.19877399, -0.13506734,  0.23545488, -0.0660262 ,  0.28739887,
       -0.57255987, -0.55241158, -0.33249774,  0.09408069, -0.08724635])

#### 最大值最小值归一化

In [31]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('./zhengqi_train.txt',sep = '\t')
X_train = df.iloc[:,:-1]
y_train = df[['target']]

X_test = pd.read_csv('./zhengqi_test.txt',sep = '\t')

display(X_train.head())

# 归一化，数据，特征：工业蒸汽量，生产工艺参数
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
y_train_norm = mms.fit_transform(y_train)
X_test_norm = mms.fit_transform(X_test)

# 训练和预测
model = LinearRegression()
model.fit(X_train_norm, y_train_norm)
y_pred = model.predict(X_test_norm)
np.savetxt('./result2.txt',y_pred)
y_pred[:15]

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37
0,0.566,0.016,-0.143,0.407,0.452,-0.901,-1.812,-2.36,-0.436,-2.114,-0.94,-0.307,-0.073,0.55,-0.484,0.0,-1.707,-1.162,-0.573,-0.991,0.61,-0.4,-0.063,0.356,0.8,-0.223,0.796,0.168,-0.45,0.136,0.109,-0.615,0.327,-4.627,-4.789,-5.101,-2.608,-3.508
1,0.968,0.437,0.066,0.566,0.194,-0.893,-1.566,-2.36,0.332,-2.114,0.188,-0.455,-0.134,1.109,-0.488,0.0,-0.977,-1.162,-0.571,-0.836,0.588,-0.802,-0.063,0.357,0.801,-0.144,1.057,0.338,0.671,-0.128,0.124,0.032,0.6,-0.843,0.16,0.364,-0.335,-0.73
2,1.013,0.568,0.235,0.37,0.112,-0.797,-1.367,-2.36,0.396,-2.114,0.874,-0.051,-0.072,0.767,-0.493,-0.212,-0.618,-0.897,-0.564,-0.558,0.576,-0.477,-0.063,0.355,0.961,-0.067,0.915,0.326,1.287,-0.009,0.361,0.277,-0.116,-0.843,0.16,0.364,0.765,-0.589
3,0.733,0.368,0.283,0.165,0.599,-0.679,-1.2,-2.086,0.403,-2.114,0.011,0.102,-0.014,0.769,-0.371,-0.162,-0.429,-0.897,-0.574,-0.564,0.272,-0.491,-0.063,0.352,1.435,0.113,0.898,0.277,1.298,0.015,0.417,0.279,0.603,-0.843,-0.065,0.364,0.333,-0.112
4,0.684,0.638,0.26,0.209,0.337,-0.454,-1.073,-2.086,0.314,-2.114,-0.251,0.57,0.199,-0.349,-0.342,-0.138,-0.391,-0.897,-0.572,-0.394,0.106,0.309,-0.259,0.352,0.881,0.221,0.386,0.332,1.289,0.183,1.078,0.328,0.418,-0.843,-0.215,0.364,-0.28,-0.028


array([[0.69801031],
       [0.6861968 ],
       [0.65270343],
       [0.67012411],
       [0.69424164],
       [0.68954379],
       [0.63941183],
       [0.69726306],
       [0.63169109],
       [0.69597451],
       [0.57275288],
       [0.57584871],
       [0.58303058],
       [0.64112287],
       [0.6590907 ]])

#### Z-score归一化

In [32]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('./zhengqi_train.txt',sep = '\t')
X_train = df.iloc[:,:-1]
y_train = df[['target']]

X_test = pd.read_csv('./zhengqi_test.txt',sep = '\t')

# 归一化，数据，特征：工业蒸汽量，生产工艺参数
standard = StandardScaler()
X_train_norm = standard.fit_transform(X_train)
y_train_norm = standard.fit_transform(y_train)
X_test_norm = standard.fit_transform(X_test)


# 训练和预测
model = LinearRegression()
model.fit(X_train_norm, y_train_norm)
y_pred = model.predict(X_test_norm)
np.savetxt('./result3.txt',y_pred)
y_pred[:15]

array([[ 0.52564964],
       [ 0.48973218],
       [ 0.32666665],
       [ 0.36998727],
       [ 0.5268688 ],
       [ 0.46154774],
       [ 0.22804888],
       [ 0.48236749],
       [ 0.21097507],
       [ 0.49360554],
       [-0.10965964],
       [-0.09819487],
       [-0.02267627],
       [ 0.26229629],
       [ 0.24690629]])