# Problem description

The aim of this project is to investigate different techniques of feature scaling and their influence on different regression models quality.  
The investigation is based on "MatNavi Mechanical properties of low-alloy steels" dataset: https://www.kaggle.com/datasets/konghuanqing/matnavi-mechanical-properties-of-lowalloy-steels?resource=download which was previously cleaned of otliers.

# 1. Imports and loading data

In [1]:
import urllib.request
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
#url = 'https://raw.githubusercontent.com/ElenaNKn/feature_importance_methods/master/mechanical_properties_low-alloy_steels_cleaned.csv'
#filename = 'mechanical_properties_low-alloy_steels_cleaned.csv'
#urllib.request.urlretrieve(url, filename)

In [3]:
df = pd.read_csv('mechanical_properties_low-alloy_steels_cleaned.csv', header=0, sep=',')

In [4]:
df.head()

Unnamed: 0,Alloy code,C,Si,Mn,P,S,Ni,Cr,Mo,Cu,V,Al,N,Ceq,Nb + Ta,Temperature (°C),0.2% Proof Stress (MPa),Tensile Strength (MPa),Elongation (%),Reduction in Area (%)
0,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,0.0,27,342,490,30,71
1,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,0.0,100,338,454,27,72
2,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,0.0,200,337,465,23,69
3,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,0.0,300,346,495,21,70
4,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,0.0,400,316,489,26,79


In [5]:
# handle headers of a dataframe

df.columns = df.columns.str.lower().str.lstrip()
df.columns = df.columns.str.replace('+', 'and', regex=True).str.replace('°c', 'celcius', regex=True)
df.columns = df.columns.str.replace('0.2% ', '', regex=True).str.replace('%', 'perc', regex=True)
df.columns = df.columns.str.replace(' ', '_').str.replace('+', 'and', regex=True)
df.columns = df.columns.str.replace('(', '', regex=True).str.replace(')', '', regex=True)
df.columns = df.columns.str.lower().str.replace('ta', 'ti')

In [6]:
del df['ceq']
del df['alloy_code']
del df['tensile_strength_mpa']
del df['elongation_perc']
del df['reduction_in_area_perc']

In [7]:
df.head()

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb_and_ti,temperature_celcius,proof_stress_mpa
0,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,27,342
1,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,100,338
2,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,200,337
3,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,300,346
4,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,400,316


# 2. Setting up the validation framework

In [8]:
# splitting into train - val - test datasets
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# create dataframes for feature variables

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# create dataframes for target variables

dy_train = df_train.proof_stress_mpa
dy_val = df_val.proof_stress_mpa
dy_test = df_test.proof_stress_mpa

df_train.drop('proof_stress_mpa', axis=1, inplace=True)
df_val.drop('proof_stress_mpa', axis=1, inplace=True)
df_test.drop('proof_stress_mpa', axis=1, inplace=True)

# 3. Feature scaling

There are different scaling methods. We'll consider "Min-max normalization", "Max normalization" and "Standardization" (also known as "Z-score normalization") as the most common used.

In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler

In [10]:
scalings = ['without_scaling', 'min-max', 'max', 'standard']

In [11]:
x = {}
x_val = {}

In [12]:
x['without_scaling'] = df_train
x_val['without_scaling'] = df_val

## 3.1. Min-max normalization

In [13]:
min_max_scaler = MinMaxScaler()
my_features = min_max_scaler.fit_transform(df_train).reshape(-1, 14)
x_1 = pd.DataFrame(my_features, columns = df_train.columns)
x_1.head()

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb_and_ti,temperature_celcius
0,0.28,0.852941,0.188679,0.166667,0.368421,0.316667,0.770992,0.680297,0.6,0.8,0.020833,0.688,0.0,1.0
1,0.76,0.0,0.311321,0.25,0.315789,0.533333,0.763359,0.925651,0.56,0.9,0.0,0.52,0.0,0.919743
2,0.16,0.147059,0.226415,0.166667,0.368421,0.133333,0.045802,0.375465,0.52,0.0,0.041667,0.256,0.0,0.75923
3,0.44,0.382353,0.160377,0.5,0.315789,0.0,0.035115,0.004461,0.2,0.0,0.083333,0.248,0.0,0.277689
4,1.0,0.117647,0.264151,0.416667,0.473684,0.233333,0.076336,0.018587,0.8,0.0,0.0,0.656,0.0,0.598716


In [14]:
my_features_val1 = min_max_scaler.transform(df_val).reshape(-1, 14)
x_val1 = pd.DataFrame(my_features_val1, columns = df_val.columns)

In [15]:
x['min-max'] = x_1
x_val['min-max'] = x_val1

## 3.2. Max normalization

In [16]:
max_scaler = MaxAbsScaler(copy=True)
my_features2 = max_scaler.fit_transform(df_train).reshape(-1, 14)
x_2 = pd.DataFrame(my_features2, columns = df_train.columns)
x_2.head()

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb_and_ti,temperature_celcius
0,0.470588,0.903846,0.418919,0.333333,0.454545,0.316667,0.770992,0.681481,0.6,0.8,0.06,0.74,0.0,1.0
1,0.823529,0.346154,0.506757,0.4,0.409091,0.533333,0.763359,0.925926,0.56,0.9,0.04,0.6,0.0,0.923077
2,0.382353,0.442308,0.445946,0.333333,0.454545,0.133333,0.045802,0.377778,0.52,0.0,0.08,0.38,0.0,0.769231
3,0.588235,0.596154,0.398649,0.6,0.409091,0.0,0.035115,0.008148,0.2,0.0,0.12,0.373333,0.0,0.307692
4,1.0,0.423077,0.472973,0.533333,0.545455,0.233333,0.076336,0.022222,0.8,0.0,0.04,0.713333,0.0,0.615385


In [17]:
my_features_val2 = max_scaler.transform(df_val).reshape(-1, 14)
x_val2 = pd.DataFrame(my_features_val2, columns = df_train.columns)

In [18]:
x['max'] = x_2
x_val['max'] = x_val2

## 3.3. Standartization

In [19]:
st_scaler = StandardScaler()
my_features3 = st_scaler.fit_transform(df_train).reshape(-1, 14)
x_3 = pd.DataFrame(my_features3, columns = df_train.columns)
x_3.head()

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb_and_ti,temperature_celcius
0,-0.229658,1.769609,-0.629272,-0.866808,-0.172429,0.198012,1.289252,1.180537,1.205158,1.787769,-0.767287,1.523274,-0.136209,1.529597
1,1.841228,-1.532154,-0.260058,-0.489726,-0.412082,0.923146,1.267311,1.987846,1.034017,2.094381,-0.839134,0.60659,-0.136209,1.266625
2,-0.74738,-0.962885,-0.515668,-0.866808,-0.172429,-0.415562,-0.795072,0.177517,0.862876,-0.665126,-0.695439,-0.833913,-0.136209,0.740681
3,0.460637,-0.052053,-0.714475,0.64152,-0.412082,-0.861798,-0.825789,-1.043232,-0.506254,-0.665126,-0.551745,-0.877564,-0.136209,-0.837152
4,2.876672,-1.076739,-0.402064,0.264438,0.306879,-0.080885,-0.707311,-0.99675,2.060864,-0.665126,-0.839134,1.348667,-0.136209,0.214737


In [20]:
my_features_val3 = st_scaler.transform(df_val).reshape(-1, 14)
x_val3 = pd.DataFrame(my_features_val3, columns = df_train.columns)

In [21]:
x['standard'] = x_3
x_val['standard'] = x_val3

# 4. Modeling

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

In [23]:
scores = []

## 4.1. Linear regression model

In [24]:
for s in scalings:
    lr = LinearRegression()
    lr.fit(x[s], dy_train)
    mse = np.sqrt(mean_squared_error(lr.predict(x_val[s]), dy_val))
    scores.append(('mse1_linear', s, mse))

## 4.2. Ridge linear regression model

In [25]:
for s in scalings:
    rlr = Ridge(random_state=42)
    rlr.fit(x[s], dy_train)
    mse = np.sqrt(mean_squared_error(rlr.predict(x_val[s]), dy_val))
    scores.append(('mse2_ridge', s, mse))

## 4.3. Lasso linear regression model

In [26]:
for s in scalings:
    lassoreg = Lasso(random_state=42)
    lassoreg.fit(x[s], dy_train)
    mse = np.sqrt(mean_squared_error(lassoreg.predict(x_val[s]), dy_val))
    scores.append(('mse3_lasso', s, mse))

## 4.4. Decision tree model

In [27]:
for s in scalings:
    dt = DecisionTreeRegressor(random_state=42)
    dt.fit(x[s], dy_train)
    mse = np.sqrt(mean_squared_error(dt.predict(x_val[s]), dy_val))
    scores.append(('mse4_tree', s, mse))

## 4.5. CNN model

In [28]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from tensorflow.keras.layers import Dropout
from numpy.random import seed
from tensorflow.keras.utils import set_random_seed

In [29]:
n_inputs = df_train.shape[1]
n_outputs = 1

loss = 'mse'
optimizer = Adam(learning_rate=0.01)

# settingt the seed
seed(0)
set_random_seed(0)

In [30]:
for s in scalings:
    model = Sequential()
    model.add(Dense(128, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, kernel_initializer='he_uniform', activation='relu'))
    model.add(Dense(n_outputs))
    model.compile(loss=loss, optimizer=optimizer)
    history = model.fit(x[s], dy_train, verbose = 0, epochs=300)
    mse = np.sqrt(model.evaluate(x_val[s], dy_val, verbose=0))
    scores.append(('mse5_cnn', s, mse))

# 5. Results

In [31]:
columns = ['model', 'scaling', 'mse']
df_scores = pd.DataFrame(scores, columns=columns)

df_scores_pivot = df_scores.pivot(index='scaling', columns=['model'], values=['mse'])
df_scores_pivot.round(3)

Unnamed: 0_level_0,mse,mse,mse,mse,mse
model,mse1_linear,mse2_ridge,mse3_lasso,mse4_tree,mse5_cnn
scaling,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
max,56.636,55.956,58.15,36.548,36.992
min-max,56.636,56.207,57.469,36.767,32.598
standard,56.636,56.584,56.426,36.759,37.828
without_scaling,56.636,58.14,61.145,36.712,55.385


From the results obtained we see, that scaling doesn't influence at linear regression model. But it helt to improve ridge and lasso models.

And scaling is extremely important for neural networks, min-max scaler being preferable.

According to the algorithm, decision tree model shouldn't be affected with feature scaling. Variance of mean squared error values, that we see for tree model in the pivot table, is concerned with floating point variables rounding issues (I plan to describe this problem in a future article).