# Simple Model Build Example
### Part II -- Next up Simple Model Deploy on K8s
2022 June 1

### The job is to predict the level of adult earnings per year

The data we have available labels incomes as above or below $50K, based on demographics.

![alt text](../img/model_grid.png "Model Grid")

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from model.training_data import *

%matplotlib inline

In [None]:
# Show the data location and list what is stored there.
# NOTE(review): `file_path` is not assigned in the visible cells; presumably it
# comes from the star import of model.training_data -- confirm.
print(file_path)
# IPython shell escape: `$file_path` is expanded from the Python variable.
!ls $file_path

In [None]:
# Fetch the header entries from the project helper and show them one per line.
header = get_header()
column_listing = "\n".join(header)
print(column_listing)

In [None]:
# Load the training data through the project helper and preview the first
# 10 rows (the trailing expression is what the notebook displays).
df = get_training_dataframe()
df.head(10)

## Explore the Data

In [None]:
# Structural overview of the frame (presumably a pandas DataFrame: per-column
# dtype and non-null count).
df.info()

In [None]:
# Summary statistics; with pandas defaults this covers the numeric columns only.
df.describe()

In [None]:
# For every header column with fewer than 20 distinct values, print the
# column name, its number of distinct values and the per-value counts.
for h in header:
    counts = df[h].value_counts()
    n = len(counts)
    if n >= 20:
        # Too many distinct values to be worth listing.
        continue
    print("##################################")
    print(f"{h}: {n}")
    print(counts)

### Encoding classes will help a classifier include all the information available
Start with an easy one...Sex in this data set has only 2 values.

In [None]:
# input encodings
df["sex-val"] = df["sex"].apply(lambda x: 1 if "M" in x else 0)
df[["sex", "sex-val"]].head(10)

### Start modeling with a simple case -- only encoded or continuous data

In [None]:
# Feature matrix: the already-numeric columns plus the encoded sex flag.
feature_columns = [
    "age",
    "fnlwgt",
    "sex-val",
    "education-num",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]
dft = df[feature_columns].copy()
# Target: the raw class label column.
y = df["class"]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree

### Testing and Training sets help us evaluate performance

![alt text](../img/test_train_split.png "Test Train Split")


In [None]:
# Hold out part of the data for evaluation (scikit-learn's default test size).
# A fixed seed keeps the split -- and therefore the reported scores and the
# model saved at the end of the notebook -- reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(dft, y, random_state=42)

In [None]:
# Fit a default decision tree on the training split and score it on the
# held-out split.
dec_tree = DecisionTreeClassifier().fit(X_train, y_train)
a = dec_tree.score(X_test, y_test)
# `a` is a fraction; format it as a percentage. The previous message printed
# a stray "%" before the number ("Accuracy is: % 81.2...").
print(f"Accuracy is: {a:.2%}")

In [None]:
# Per-class metrics for the decision tree on the held-out split.
y_preds = dec_tree.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

### Let's look at the decision tree...

High dimensionality, so maybe too complex to be explanatory? It may also generate some skepticism about how well it will generalize.

In [None]:
# Render the fitted tree as indented text. Passing the training column names
# labels every split with the real feature name instead of the anonymous
# "feature_<i>" placeholders, which is what makes the dump readable.
text_representation = tree.export_text(dec_tree, feature_names=list(dft.columns))
print(text_representation)

### Does it help to normalize continuous values?

In [None]:
# Normalizing the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then apply that same transform to
# the test split.
# NOTE: X_train / X_test are rebound to the scaled values here, so later cells
# see scaled data and re-running this cell scales them again.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Fit a second default tree on the current X_train (the scaled features when
# the cells are run in order); `dec_tree` itself is left untouched.
test_dec_tree=DecisionTreeClassifier().fit(X_train, y_train)

In [None]:
# Score the tree trained on scaled features on the held-out split.
a = test_dec_tree.score(X_test, y_test)
# `a` is a fraction; format it as a percentage. The previous message printed
# a stray "%" before the number.
print(f"Accuracy is: {a:.2%}")

In [None]:
# Per-class metrics for the tree trained on scaled features.
y_preds = test_dec_tree.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

## Try Deep Learning Model

Requires "1-hot" class encoding

In [None]:
# output encodings
# One indicator column per income class, derived from the raw label text:
# "gt50k" is 1 when the label contains ">", "lte50k" is 1 when it contains "<=".
df["gt50k"] = df["class"].map(lambda label: int(">" in label))
df["lte50k"] = df["class"].map(lambda label: int("<=" in label))
df.head(10)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler

In [None]:
# Testing/Training Split
# Targets are the two indicator columns, in the order [gt50k, lte50k]; that
# order fixes which output unit of the network corresponds to which class.
dy = df[["gt50k", "lte50k"]]
# A fixed seed keeps the split, and the metrics derived from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(dft, dy, random_state=42)

In [None]:
# Normalizing the data
# A separate scaler instance for the neural network; it is fitted on the
# training split only and then reused for the test split.
nnsc = StandardScaler()
X_train = nnsc.fit_transform(X_train)
X_test = nnsc.transform(X_test)

In [None]:
# Neural network: two hidden ReLU layers (20 and 16 units) and a 2-unit
# softmax output, one unit per target indicator column.
# Take the input width from the data rather than hard-coding 7, so the cell
# keeps working if the feature list changes.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(20, input_dim=n_features, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(2, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() emitted a stray "None" line.
model.summary()

In [None]:
# Categorical cross-entropy matches the 2-unit softmax output and the
# two-column indicator targets; accuracy is tracked for the history plots.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 100 epochs with batches of 64 samples, evaluating on the held-out
# split after each epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=64,
)

In [None]:
# Network outputs for the held-out split: one row per sample, one softmax
# score per output unit.
y_pred = model.predict(X_test)

In [None]:
# Converting 1-hot predictions to class label
# The label is the index of the highest-scoring output unit in each row.
# A single vectorised argmax over axis 1 replaces the manual index loop.
pred = list(np.argmax(y_pred, axis=1))

In [None]:
# The target columns are ordered [gt50k, lte50k], so an argmax class index of
# 1 means "lte50k" and 0 means "gt50k". The lte50k indicator (0 or 1) is
# therefore numerically identical to the true class index.
test = list(y_test.lte50k)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred); pass the ground truth first so the
# call agrees with the classification_report call on the same data.
a = accuracy_score(test, pred)
# `a` is a fraction; format it as a percentage. The previous message printed
# a stray "%" before the number.
print(f"Accuracy is: {a:.2%}")

In [None]:
# Per-class metrics for the network; class 0 is "gt50k" and class 1 is
# "lte50k", following the order of the target columns.
print(classification_report(test, pred))

In [None]:
import matplotlib.pyplot as plt
# Accuracy per epoch: training curve first, then the validation
# (held-out split) curve, matching the legend order.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch: training curve first, then the validation (held-out
# split) curve, matching the legend order.
metrics_by_epoch = history.history
plt.plot(metrics_by_epoch['loss'])
plt.plot(metrics_by_epoch['val_loss'])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

## Save trained models for use in prediction algorithms

In [None]:
from joblib import dump, load
# Persist the decision tree that was fitted on the unscaled features.
dump(dec_tree, "decision_tree.pkl")
# NOTE(review): this pickles the Keras model through joblib. Whether that
# round-trips depends on the installed Keras version; Keras documents
# model.save() as its own serialization path -- confirm the consumer of
# "neural_net.pkl" can load it.
dump(model, "neural_net.pkl")
# The tree's scaler `sc` is intentionally not saved: `dec_tree` was fitted
# before any scaling was applied, so it expects raw feature values.
# The network was trained on features transformed by `nnsc`, so that scaler is
# saved alongside it.
dump(nnsc, "neural_net_scaler.pkl")