# This notebook documents and contains the code for engineering the data and saving it properly for model training

First, let's load the data produced in the previous part.

In [None]:
# necessary packages
!pip install pandas 
!pip install numpy
!pip install scikit-learn


In [51]:
import pandas as pd

# Read the CSV written by the previous stage into a DataFrame.
etl_csv_path = 'data/post_ETL.csv'
df = pd.read_csv(etl_csv_path)

# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,1,163,1,0.6,2,0,2,1


Two processing steps will be applied:
* Normalize the features to the range [0, 1]
* Add a bias term


In [52]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to [0, 1]. The final column ('output') holds the
# labels, which are already in {0, 1}, so it is excluded from scaling.
feature_names = df.columns[:-1]
feature_matrix = df.values[:, :-1]

scaler = MinMaxScaler()
scaler.fit(feature_matrix)

# Rebuild a labelled frame from the scaled array and re-attach the untouched labels.
scaled_df = pd.DataFrame(scaler.transform(feature_matrix), columns=feature_names)
scaled_df['output'] = df['output']
df = scaled_df

df.head()


Unnamed: 0,age,sex,cp,trtbps,chol,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,0.708333,1.0,1.0,0.481132,0.244292,0.0,0.603053,0.0,0.370968,0.0,0.0,0.333333,1
1,0.166667,1.0,0.666667,0.339623,0.283105,0.5,0.885496,0.0,0.564516,0.0,0.0,0.666667,1
2,0.25,0.0,0.333333,0.339623,0.178082,0.0,0.770992,0.0,0.225806,1.0,0.0,0.666667,1
3,0.5625,1.0,0.333333,0.245283,0.251142,0.5,0.816794,0.0,0.129032,1.0,0.0,0.666667,1
4,0.583333,0.0,0.0,0.245283,0.520548,0.5,0.70229,1.0,0.096774,1.0,0.0,0.666667,1


In [53]:
# The fitted scaler is needed again at inference time to transform new
# datapoints the same way as the training data, so persist it to disk.
from pickle import dump

# Use a context manager so the file is flushed and closed deterministically,
# even if pickling raises.
with open('model/scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [54]:
# Add a constant bias (intercept) feature of 1.0 as the first column.
# The guard makes this cell safe to re-run: DataFrame.insert raises ValueError
# when a column with the same name already exists.
if 'bias' not in df.columns:
    df.insert(0, 'bias', 1.)
df.head()

Unnamed: 0,bias,age,sex,cp,trtbps,chol,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,1.0,0.708333,1.0,1.0,0.481132,0.244292,0.0,0.603053,0.0,0.370968,0.0,0.0,0.333333,1
1,1.0,0.166667,1.0,0.666667,0.339623,0.283105,0.5,0.885496,0.0,0.564516,0.0,0.0,0.666667,1
2,1.0,0.25,0.0,0.333333,0.339623,0.178082,0.0,0.770992,0.0,0.225806,1.0,0.0,0.666667,1
3,1.0,0.5625,1.0,0.333333,0.245283,0.251142,0.5,0.816794,0.0,0.129032,1.0,0.0,0.666667,1
4,1.0,0.583333,0.0,0.0,0.245283,0.520548,0.5,0.70229,1.0,0.096774,1.0,0.0,0.666667,1


In [55]:
# Final step: mark the binary label column as categorical.
label_as_category = df['output'].astype('category')
df['output'] = label_as_category

# Write the engineered dataset to disk without the row index.
# NOTE: CSV is plain text, so the category dtype itself is not recorded in the
# file; a reader has to re-apply it after loading.
fe_csv_path = 'data/post_FE.csv'
df.to_csv(fe_csv_path, index=False)