## First, testing our Azure endpoint

In [1]:
import requests

In [2]:
test_data = {
    'age': 45.0,
    'sex': 1.0,
    'chest_pain': 4.0,
    'blood_pressure': 115.0,
    'serum_cholestoral': 260.0,
    'fasting_blood_sugar': 0.0,
    'electrocardiographic': 2.0,
    'max_heart_rate': 185.0,
    'induced_angina': 0.0,
    'ST_depression': 0.0,
    'slope': 1.0,
    'vessels': 0.0,
    'thal': 3.0,
}

In [3]:
#base_url = 'https://mytestmlappnkf.azurewebsites.net'
base_url = 'https://mytestmlappcd.azurewebsites.net'

In [4]:
r = requests.get(base_url)

In [5]:
r.text

'App is Healthy'

In [6]:
#neural_url = 'https://mytestmlappnkf.azurewebsites.net/neural'
neural_url = 'https://mytestmlappcd.azurewebsites.net/neural'

In [7]:
r = requests.post(neural_url, json = test_data)

In [8]:
r.json() 

0

## Data for Mini-Project

In [9]:
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', None)

%config Completer.use_jedi = False

https://www.kaggle.com/adammaus/predicting-churn-for-bank-customers?select=Churn_Modelling.csv

In [10]:
data_path = pathlib.Path('data', 'train.csv')
data = pd.read_csv(data_path)

In [11]:
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,597,Germany,Female,35,8,131101.04,1,1,1,192852.67,0
1,523,France,Female,40,2,102967.41,1,1,0,128702.10,1
2,706,Spain,Female,42,8,95386.82,1,1,1,75732.25,0
3,788,France,Male,32,4,112079.58,1,0,0,89368.59,0
4,706,Germany,Male,38,5,163034.82,2,1,1,135662.17,0
...,...,...,...,...,...,...,...,...,...,...,...
9965,790,Spain,Male,20,8,0.00,2,1,0,168152.76,0
9966,658,France,Female,39,4,0.00,1,1,1,147530.06,0
9967,505,Germany,Female,37,10,122453.97,2,1,1,52693.99,0
9968,751,France,Female,30,6,0.00,2,1,0,15766.10,0


In [12]:
data.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [13]:
data.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [14]:

cols_to_consider = ['Geography', 'Gender']
for col in cols_to_consider:
    data = pd.concat([data, pd.get_dummies(data[col])], axis = 1)
    data = data.drop(columns = [col])
#### The JSON format that your Azure endpoint should expect follows is shown below.

target = 'Exited'
features = [col for col in data.columns if col != target]
data.loc[0, features].to_dict()

{'CreditScore': 597.0,
 'Age': 35.0,
 'Tenure': 8.0,
 'Balance': 131101.04,
 'NumOfProducts': 1.0,
 'HasCrCard': 1.0,
 'IsActiveMember': 1.0,
 'EstimatedSalary': 192852.67,
 'France': 0.0,
 'Germany': 1.0,
 'Spain': 0.0,
 'Female': 1.0,
 'Male': 0.0}

In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
import seaborn as sns
sns.set_style('whitegrid')

from bokeh.layouts import gridplot, column
from bokeh.models import (BasicTicker, ColorBar, ColumnDataSource, 
                          HoverTool, LabelSet, LinearColorMapper, NumeralTickFormatter)
from bokeh.palettes import brewer, RdBu, Reds
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import transform

%config Completer.use_jedi = False
output_notebook()

In [16]:
def plot_confusion_matrix(y_true, y_predicted):
    
    from sklearn import metrics
    
    accuracy = np.round(100*(y_true == y_predicted).astype(int).sum()/len(y_predicted), 2)
    
    confusion = pd.DataFrame(metrics.confusion_matrix(y_true, y_predicted))
    confusion.index.name = "True"
    confusion.columns.name = "Predicted"
    confusion = confusion.stack().rename("value").reset_index()
    confusion['True'] = confusion['True'].astype(str)
    confusion['Predicted'] = confusion['Predicted'].astype(str)

    source = ColumnDataSource(confusion)

    values = sorted(list(confusion['True'].unique()))

    palette = brewer['RdBu'][10]
    color_mapper = LinearColorMapper(
        palette = palette, 
    )

    p = figure(
        plot_width = 300, 
        plot_height = 300, 
        title = f'Confusion Matrix: Overall accuracy = {accuracy}%',
        x_range = ['0', '1'], 
        y_range = ['0', '1'],
        x_axis_label = 'Predicted',
        y_axis_label = 'True',
        tools = 'hover', 
        x_axis_location="below",
    )

    p.rect(
        x = 'Predicted', 
        y = 'True', 
        width = 1, 
        height = 1, 
        source = source,
        line_color = 'grey', 
        fill_color = transform('value', color_mapper),
    )

    hover = p.hover.tooltips = [
        ("True", "@{True}"),
        ("Predicted", "@{Predicted}"),
        ("Count", "@value"),
    ]

    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "14px"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 1.0
    
    labels = LabelSet(x='Predicted', y='True', text='value',
                      render_mode='canvas', text_color = 'white',
                      x_offset = 50, y_offset = 50, source=source,)

    p.add_layout(labels)

    show(p)

In [18]:
data.describe(percentiles = [0.5]).transpose()

Unnamed: 0,count,mean,std,min,50%,max
CreditScore,9970.0,650.5796,96.6353,350.0,652.0,850.0
Age,9970.0,38.9257,10.4905,18.0,37.0,92.0
Tenure,9970.0,5.0135,2.8918,0.0,5.0,10.0
Balance,9970.0,76485.4634,62400.275,0.0,97221.52,250898.09
NumOfProducts,9970.0,1.53,0.5817,1.0,1.0,4.0
HasCrCard,9970.0,0.7054,0.4559,0.0,1.0,1.0
IsActiveMember,9970.0,0.515,0.4998,0.0,1.0,1.0
EstimatedSalary,9970.0,100069.8759,57510.557,11.58,100168.24,199992.48
Exited,9970.0,0.2038,0.4029,0.0,0.0,1.0
France,9970.0,0.5016,0.5,0.0,1.0,1.0


In [19]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaled_data = scaler.fit(data[features])
scaled_data = scaler.fit_transform(data[features])
scaled_data = pd.DataFrame(scaled_data, columns = features)
scaled_data[target] = data[target]

In [20]:
scaled_data.describe(percentiles = [0.5]).transpose()

Unnamed: 0,count,mean,std,min,50%,max
CreditScore,9970.0,0.0,1.0001,-3.1106,0.0147,2.0637
Age,9970.0,0.0,1.0001,-1.9948,-0.1836,5.0595
Tenure,9970.0,-0.0,1.0001,-1.7338,-0.0047,1.7245
Balance,9970.0,0.0,1.0001,-1.2258,0.3323,2.7952
NumOfProducts,9970.0,-0.0,1.0001,-0.9111,-0.9111,4.2462
HasCrCard,9970.0,-0.0,1.0001,-1.5475,0.6462,0.6462
IsActiveMember,9970.0,0.0,1.0001,-1.0306,0.9703,0.9703
EstimatedSalary,9970.0,-0.0,1.0001,-1.7399,0.0017,1.7376
France,9970.0,0.0,1.0001,-1.0032,0.9968,0.9968
Germany,9970.0,-0.0,1.0001,-0.5788,-0.5788,1.7277


In [21]:
pd.value_counts(scaled_data[target])/pd.value_counts(scaled_data[target]).sum()

0   0.7962
1   0.2038
Name: Exited, dtype: float64

In [23]:
from sklearn.model_selection import train_test_split

for i in range(1000):
    train, test = train_test_split(scaled_data, random_state = i)
    temp = pd.value_counts(train[target])/pd.value_counts(train[target]).sum()
    if temp[0] < temp[1]:
        print(i)

In [24]:
for i in range(1000):
    train, test = train_test_split(scaled_data, stratify = scaled_data[target], random_state = i)
    temp = pd.value_counts(train[target])/pd.value_counts(train[target]).sum()
    if temp[0] < temp[1]:
        print(i)

In [25]:
train, test = train_test_split(scaled_data, stratify = scaled_data[target], random_state = 809)
pd.value_counts(train[target])/pd.value_counts(train[target]).sum()

0   0.7962
1   0.2038
Name: Exited, dtype: float64

In [26]:
train, test = train_test_split(scaled_data, stratify = scaled_data[target], random_state = 809)
x_train, y_train = train[features], train[target]
x_test, y_test = test[features], test[target]

## Linear Regression 80.47 Accuracy

In [27]:
import statsmodels.formula.api as smf

formula = f"{target} ~ {' + '.join(features)}"

model = smf.ols(
    formula = formula, 
    data = train)

fit_model = model.fit()

fit_model.summary()

predictions = fit_model.predict(test[features])

predictions =(predictions > 0.5).astype(int)
 
predictions

predictions = fit_model.predict(test[features])

predictions = (predictions > 0.5).astype(int)

plot_confusion_matrix(y_test.values, predictions.values)

## Logistic Regression 80.14 Accuracy

In [29]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver = 'newton-cg')

clf = clf.fit(x_train, y_train)

clf.score(x_test, y_test)

predictions = clf.predict(x_test)
plot_confusion_matrix(y_test, predictions)

## Decision Tree 83.31 Accuracy

In [31]:
from sklearn import tree

clf = tree.DecisionTreeClassifier()

clf = clf.fit(x_train, y_train)

clf.score(x_test, y_test)

for max_depth in range(1, 10):
    clf = tree.DecisionTreeClassifier(max_depth = max_depth, random_state = 0)

    clf = clf.fit(x_train, y_train)
    
    score = clf.score(x_test, y_test)
    
    print(f'Max depth = {max_depth}: {score}') 
    
clf = tree.DecisionTreeClassifier(max_depth = 3, random_state = 0)

clf = clf.fit(x_train, y_train)

score = clf.score(x_test, y_test)

print(tree.export_text(clf, feature_names = features))

predictions = clf.predict(x_test)
plot_confusion_matrix(y_test, predictions)

Max depth = 1: 0.7962294424388288
Max depth = 2: 0.8215002005615724
Max depth = 3: 0.8331327717609306
Max depth = 4: 0.8435619735258725
Max depth = 5: 0.8435619735258725
Max depth = 6: 0.8515844364219816
Max depth = 7: 0.8427597272362616
Max depth = 8: 0.8447653429602888
Max depth = 9: 0.8363417569193743
|--- Age <= 0.34
|   |--- NumOfProducts <= 1.67
|   |   |--- NumOfProducts <= -0.05
|   |   |   |--- class: 0
|   |   |--- NumOfProducts >  -0.05
|   |   |   |--- class: 0
|   |--- NumOfProducts >  1.67
|   |   |--- Balance <= -0.31
|   |   |   |--- class: 1
|   |   |--- Balance >  -0.31
|   |   |   |--- class: 1
|--- Age >  0.34
|   |--- IsActiveMember <= -0.03
|   |   |--- Age <= 1.10
|   |   |   |--- class: 0
|   |   |--- Age >  1.10
|   |   |   |--- class: 1
|   |--- IsActiveMember >  -0.03
|   |   |--- NumOfProducts <= 1.67
|   |   |   |--- class: 0
|   |   |--- NumOfProducts >  1.67
|   |   |   |--- class: 1



## Random Forest 85 Accuracy 

In [32]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state = 0)

clf = clf.fit(x_train, y_train)

clf.score(x_test, y_test)

y_hat = clf.predict(x_test)
plot_confusion_matrix(y_test, y_hat)

## Gradient Boosted Trees 85.92 Accuracy 

In [36]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state = 0)

clf = clf.fit(x_train, y_train)

clf.score(x_test, y_test)

predictions = clf.predict(x_test)
plot_confusion_matrix(y_test, predictions)

## AdaBoost 84.16 Accuracy

In [37]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(random_state = 0)

clf = clf.fit(x_train, y_train)

clf.score(x_test, y_test)

predictions = clf.predict(x_test)
plot_confusion_matrix(y_test, predictions)

## Hyperparameter Tuning 84.76 Accuracy 

In [38]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [5, 10, 50, 100],
    'learning_rate': [0.001, 0.01, 0.1, 1, 10],
}

adaboost = AdaBoostClassifier(random_state = 0)

clf = GridSearchCV(adaboost, params, error_score=0)
search = clf.fit(x_train, y_train)
best_params = search.best_params_
best_params

clf = AdaBoostClassifier(random_state = 0, **best_params)
clf = clf.fit(x_train, y_train)
clf.score(x_test, y_test)

predictions = clf.predict(x_test)
plot_confusion_matrix(y_test, predictions)

summary = pd.DataFrame(search.cv_results_)
param_columns = [col for col in summary.columns if col.startswith('param') and (col != 'params')]

metric_col = 'mean_test_score'
summary = summary[param_columns + [metric_col]]
summary = summary.dropna()
for col in summary.columns:
    summary[col] = pd.to_numeric(summary[col])

formula = f"{metric_col} ~ {'*'.join(param_columns)}"

model = smf.ols(
    formula = formula, 
    data = summary)

fit_model = model.fit()

fit_model.summary()

run_cell = True
int_step = 2
float_delta = 0.1
float_steps = 4

if run_cell:
    params = {
        'n_estimators': [5, 10, 50, 100],
        'learning_rate': [0.001, 0.01, 0.1, 1, 10],
    }

    adaboost = AdaBoostClassifier(random_state = 0)

    print('Starting course search')
    clf = GridSearchCV(adaboost, params)
    search = clf.fit(x_train, y_train)
    print(f'Best params from course search: {search.best_params_}')

    fine_params = {}
    for param in params:
        if isinstance(search.best_params_[param], int):
            min_val = search.best_params_[param] - int_step
            max_val = search.best_params_[param] + int_step + 1
            fine_params[param] = [i for i in range(min_val, max_val)]
        else:
            min_val = search.best_params_[param]*(1 - float_delta)
            max_val = search.best_params_[param]*(1 + float_delta)
            fine_params[param] = np.linspace(min_val, max_val, float_steps)

    print('Starting fine search')
    clf = GridSearchCV(adaboost, fine_params, error_score=0)
    search = clf.fit(x_train, y_train)
    print(f'Best params from fine search: {search.best_params_}')

    clf = AdaBoostClassifier(random_state = 0, **search.best_params_)
    clf = clf.fit(x_train, y_train)
    clf.score(x_test, y_test)

    predictions = clf.predict(x_test)
    plot_confusion_matrix(y_test, predictions)

Starting course search
Best params from course search: {'learning_rate': 0.1, 'n_estimators': 100}
Starting fine search
Best params from fine search: {'learning_rate': 0.11000000000000001, 'n_estimators': 101}


## Neural Networks Accuracy

In [44]:
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense, Dropout, Input

In [45]:
train, test = train_test_split(scaled_data, stratify = scaled_data[target], random_state = 809)
x_train, y_train = train[features], train[target]
x_test, y_test = test[features], test[target]


tf.random.set_seed(0)

model = Sequential()
model.add(Dense(9, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(6, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', 
              optimizer = 'adam', 
              metrics = ['accuracy'])

X, y = train[features].values, train[target].values
history = model.fit(X, y, 
          epochs = 100, 
          batch_size = 10, 
          verbose = 0,
          validation_split = 0.2);

plot_history(history)

NameError: name 'plot_history' is not defined

In [None]:
import pickle
from sklearn.ensemble import AdaBoostClassifier

adaboost_params = {
    'learning_rate': 0.01, 
    'n_estimators': 100,
}

clf = AdaBoostClassifier(random_state = 0, **adaboost_params)
clf = clf.fit(x_train, y_train)
with open('adaboost.pkl', 'wb') as f:
    pickle.dump(clf, f)
    

In [47]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Input

tf.random.set_seed(0)

model = Sequential()
model.add(Dense(9, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(6, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', 
              optimizer = 'adam', 
              metrics = ['accuracy'])

X, y = train[features].values, train[target].values
history = model.fit(X, y, 
          epochs = 100, 
          batch_size = 10, 
          verbose = 0,
          validation_split = 0.2);

model.save('neural.tf')
model2 = load_model('neural.tf')

INFO:tensorflow:Assets written to: neural.tf\assets


In [48]:
import requests

neural_url = 'http://127.0.0.1:5000/neural'
adaboost_url = 'http://127.0.0.1:5000/adaboost'

In [49]:
index = 100
data_dict = data.loc[index, features].to_dict()
data_dict

{'CreditScore': 639.0,
 'Age': 22.0,
 'Tenure': 4.0,
 'Balance': 0.0,
 'NumOfProducts': 2.0,
 'HasCrCard': 1.0,
 'IsActiveMember': 0.0,
 'EstimatedSalary': 28188.96,
 'France': 1.0,
 'Germany': 0.0,
 'Spain': 0.0,
 'Female': 0.0,
 'Male': 1.0}

In [50]:
requests.post(neural_url, json = data_dict)

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /neural (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000019CB5128790>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [None]:
response = requests.post(neural_url, json = data_dict)
response.json()

In [None]:
response = requests.post(neural_url, json = data_dict)
nn_prediction = response.json()

response = requests.post(adaboost_url, json = data_dict)
ab_prediction = response.json()

print(f'{index}: NN -> {nn_prediction}, Ada -> {ab_prediction}')   

In [None]:
prediction_comparison = {}
for index in data.index:
    if (index % 25) == 0:
        print(f'Starting index {index}')
    data_dict = data.loc[index, features].to_dict()

    response = requests.post(neural_url, json = data_dict)
    nn_prediction = response.json()

    response = requests.post(adaboost_url, json = data_dict)
    ab_prediction = response.json()
    
    prediction_comparison[index] = {
        'NN': nn_prediction,
        'AdaBoost': ab_prediction,
        'Actual': data.loc[index, target]
    }

prediction_comparison = pd.DataFrame().from_dict(prediction_comparison, orient = 'index')   

In [None]:
(prediction_comparison['NN'] == prediction_comparison['Actual']).sum()/len(data)

In [None]:
(prediction_comparison['AdaBoost'] == prediction_comparison['Actual']).sum()/len(data)