### Machine Learning for Engineers: [ScaleData](https://www.apmonitor.com/pds/index.php/Main/ScaleData)
- [Scale Data for Machine Learning](https://www.apmonitor.com/pds/index.php/Main/ScaleData)
 - Source Blocks: 12
 - Description: Scaling data to a range of 0 to 1 can improve machine learning performance for certain algorithms such as neural networks.
- [Course Overview](https://apmonitor.com/pds)
- [Course Schedule](https://apmonitor.com/pds/index.php/Main/CourseSchedule)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Draw 1000 samples from a normal distribution (mean 4, stdev 0.5)
x = 4 + 0.5*np.random.randn(1000)

# Standard scaler: subtract the mean, then divide by the standard deviation
y = (x - x.mean()) / x.std()

# Min-max scaler: shift by the minimum and divide by the range -> values in [0, 1]
x_lo, x_hi = x.min(), x.max()
z = (x - x_lo) / (x_hi - x_lo)

# Overlay the three histograms; the scaled ones are semi-transparent
plt.figure(figsize=(8, 4))
plt.hist(x, bins=30, label='original')
for scaled, name in ((y, 'standard scaler'), (z, 'minmax scaler')):
    plt.hist(scaled, bins=30, alpha=0.7, label=name)
plt.legend()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the per-column scaling parameters from the training set,
# then apply them to that same training set
s = StandardScaler().fit(train)
s_train = s.transform(train)

In [None]:
# Report the parameters held by the fitted scaler `s`.
# Each value gets its own header line, matching the MinMaxScaler report
# later in the notebook (the header for `a` was missing here).
# For a StandardScaler the transform is (x - mean_) / scale_.
print('Scaler scale')
print('a: ', s.scale_)
print('Scaler mean')
print('b: ', s.mean_)

In [None]:
# Scale the test set with the parameters already learned from the training
# set. Calling fit_transform here would re-fit the scaler on the test data,
# so train and test would be scaled differently and the stored
# s.mean_ / s.scale_ from training would be overwritten.
s_test = s.transform(test)

In [None]:
# convert scaled values back to dataframe
# pandas is first imported in a later cell of this notebook; import it here
# so this cell does not fail with a NameError when cells run in order.
import pandas as pd

# Keep both the column labels and the row index of the source frames so the
# scaled rows can still be matched to the original (shuffled) rows.
s_train_df = pd.DataFrame(s_train, columns=train.columns, index=train.index)
s_test_df = pd.DataFrame(s_test, columns=test.columns, index=test.index)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the 0-1 scaler on the training data only, then apply that same
# fitted scaling to both the training and the test sets
s = MinMaxScaler(feature_range=(0, 1)).fit(train)
s_train = s.transform(train)
s_test = s.transform(test)

In [None]:
# Print each fitted parameter of scaler `s` under its own heading
for heading, tag, values in (
    ('Scaler multipliers', 'a: ', s.scale_),
    ('Scaler minimum', 'b: ', s.min_),
):
    print(heading)
    print(tag, values)

In [None]:
# Undo the scaling: pass scaled values `y` through the fitted scaler `s`
# to recover values in the original units (this rebinds `x`).
# NOTE(review): in notebook order the only `y` defined above is the 1-D array
# from the first cell, while `s` was fit on `train`; scikit-learn scalers
# expect a 2-D array with the fitted number of columns -- confirm which
# array is intended here (e.g. s_train or s_test).
x = s.inverse_transform(y)

In [None]:
import pandas as pd
# Read the data file and index the rows by the 'Time' column
url = 'http://apmonitor.com/pds/uploads/Main/tclab_data6.txt'
data = pd.read_csv(url).set_index('Time')

# Histogram of the columns with 30 bins, semi-transparent so overlaps show
data.plot.hist(alpha=0.7, bins=30, figsize=(8, 4))

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit a standard scaler on the full data set; `s` stays available so the
# scaling can be inverted in a later cell
s = StandardScaler().fit(data)

# Wrap the scaled values in a dataframe with the original labels and index
sdata = pd.DataFrame(
    s.transform(data),
    index=data.index,
    columns=data.columns.values,
)

# Histogram of the scaled columns (10 bins)
sdata.plot.hist(alpha=0.7, bins=10, figsize=(8, 4))

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# split into training (80%) and testing (20%)
train, test = train_test_split(sdata, test_size=0.2, shuffle=True)
train=train.copy(); test=test.copy()

# train neural network: predict scaled T2 from scaled Q1 and T1
# (no backslash needed: the call continues inside the parentheses)
nn = MLPRegressor(hidden_layer_sizes=(3,3),activation='tanh',
                  solver='lbfgs',max_iter=5000)
model = nn.fit(train[['Q1','T1']],train['T2'])

# test neural network: copy the scaled test set and overwrite T2 with the
# prediction, so both frames keep every column the scaler was fit on
predict = test.copy()
predict['T2'] = nn.predict(test[['Q1','T1']])

# unscale data
# Look up the T2 column by name rather than assuming it is the last column;
# a positional -1 would silently pick the wrong variable if the column
# order of the data ever changed.
t2_col = test.columns.get_loc('T2')
d1 = s.inverse_transform(test)
d2 = s.inverse_transform(predict)
test_results = pd.DataFrame({'T2':d1[:,t2_col],'T2p':d2[:,t2_col]})

# plot results: predicted vs. measured T2 with a y = x reference line
# drawn from 15 to 28 on both axes
test_results.plot(x='T2',y='T2p',kind='scatter')
plt.plot([15,28],[15,28],'r-')
plt.savefig('results.png',dpi=600)
plt.show()

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw (unscaled) data and index the rows by the 'Time' column
data = pd.read_csv('http://apmonitor.com/pds/uploads/Main/tclab_data6.txt').set_index('Time')

# Shuffled 80% / 20% split; each part is copied so it can be modified freely
train, test = (part.copy() for part in
               train_test_split(data, test_size=0.2, shuffle=True))

# Fit the network on the unscaled inputs (Q1, T1) to predict T2
inputs, target = ['Q1', 'T1'], 'T2'
nn = MLPRegressor(hidden_layer_sizes=(3, 3), activation='tanh',
                  solver='lbfgs', max_iter=5000)
model = nn.fit(train[inputs], train[target])

# Store the test-set predictions next to the measured values
test['T2p'] = nn.predict(test[inputs])

# Predicted vs. measured T2 with a y = x reference line from 15 to 28
test.plot.scatter(x='T2', y='T2p')
plt.plot([15, 28], [15, 28], 'r-')
plt.show()