# Workflow for doing ML on cloud

1. Logistic regression
2. Random Forest/Decision Tree
     2.1. Decision Tree
     2.2. Random Forest
     2.3. Imbalanced Random Forest
     2.4. Best + Hyper parameterization
3. Feed Forward 

In [18]:
import pandas as pd
import numpy as np

In [20]:
!pip install --upgrade hyperas

Requirement already up-to-date: hyperas in /Users/dspaande/.local/lib/python3.8/site-packages (0.4.1)
Requirement already up-to-date: imbalanced-learn in /Users/dspaande/.local/lib/python3.8/site-packages (0.7.0)
Requirement already up-to-date: scikit-learn in /Users/dspaande/.local/lib/python3.8/site-packages (0.23.1)


In [22]:
import json

with open('../Data/Ongoing/result.json') as json_file:
    data = json.load(json_file)

In [23]:
data.keys()

dict_keys(['input_data', 'output_data', 'input_data_supplements', 'output_data_supplements', 'input_data_neural', 'output_data_neural'])

In [24]:
X = np.array(data['input_data_supplements'])
y = np.array(data['output_data_supplements'])
X_neural = np.array(data['input_data_neural'])
y_neural = np.array(data['output_data_neural'])

## Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, accuracy_score
from sklearn import metrics

In [26]:
x_train, x_test, y_train , y_test = train_test_split(X, y, 
                                                     test_size=0.2, random_state=0)

#Create model
logisticRegr = LogisticRegression(max_iter=1000000, random_state=0)

#Fit model
logisticRegr.fit(x_train, y_train)

#Predictions
predictions = logisticRegr.predict(x_test)

#Print
print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))
print('\n')
print(f'Accuracy score: {accuracy_score(y_test, predictions)*100:.2f}%')
print(f"Balanced accuracy score : {balanced_accuracy_score(y_test, predictions)*100:.2f}%")

[[527  22]
 [ 45  71]]


              precision    recall  f1-score   support

           0       0.92      0.96      0.94       549
           1       0.76      0.61      0.68       116

    accuracy                           0.90       665
   macro avg       0.84      0.79      0.81       665
weighted avg       0.89      0.90      0.89       665



Accuracy score: 89.92%
Balanced accuracy score : 78.60%


## Random Forest/Decision Tree

### Decision Tree

In [27]:
# With normal train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, accuracy_score
from collections import Counter

x_train, x_test, y_train , y_test = train_test_split(X, y, 
                                                     test_size=0.2, random_state=0)

dtree = DecisionTreeClassifier(random_state=0)
dtree.fit(x_train, y_train)
predictions = dtree.predict(x_test)

print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))
print('\n')
print(f'Accuracy score: {accuracy_score(y_test, predictions)*100:.2f}%')
print(f"Balanced accuracy score : {balanced_accuracy_score(y_test, predictions)*100:.2f}%")

[[506  43]
 [ 43  73]]


              precision    recall  f1-score   support

           0       0.92      0.92      0.92       549
           1       0.63      0.63      0.63       116

    accuracy                           0.87       665
   macro avg       0.78      0.78      0.78       665
weighted avg       0.87      0.87      0.87       665



Accuracy score: 87.07%
Balanced accuracy score : 77.55%


In [28]:
# With stratified Kfold split
from sklearn.model_selection import cross_val_predict, cross_val_score, StratifiedKFold
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=0)

dtree = DecisionTreeClassifier(random_state=0)

predictions = cross_val_predict(dtree, X, y, cv=skf, n_jobs=4)

print(confusion_matrix(y, predictions))
print('\n')
print(classification_report(y, predictions))
print('\n')
print(f'Accuracy score: {accuracy_score(y, predictions)*100:.2f}%')
print(f"Balanced accuracy score : {balanced_accuracy_score(y, predictions)*100:.2f}%")

[[2593  177]
 [ 215  338]]


              precision    recall  f1-score   support

           0       0.92      0.94      0.93      2770
           1       0.66      0.61      0.63       553

    accuracy                           0.88      3323
   macro avg       0.79      0.77      0.78      3323
weighted avg       0.88      0.88      0.88      3323



Accuracy score: 88.20%
Balanced accuracy score : 77.37%


### Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

rf = RandomForestClassifier(n_estimators=500, random_state=0)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
predictions_proba = rf.predict_proba(X_test)

print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))
print('\n')
print(f'Accuracy score: {accuracy_score(y_test, predictions)*100:.2f}%')
print(f"Balanced accuracy score : {balanced_accuracy_score(y_test, predictions)*100:.2f}%")

[[524  25]
 [ 54  62]]


              precision    recall  f1-score   support

           0       0.91      0.95      0.93       549
           1       0.71      0.53      0.61       116

    accuracy                           0.88       665
   macro avg       0.81      0.74      0.77       665
weighted avg       0.87      0.88      0.87       665



Accuracy score: 88.12%
Balanced accuracy score : 74.45%


In [30]:
# With stratified Kfold split
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=0)

rf = RandomForestClassifier(n_estimators=500, random_state=0)

predictions = cross_val_predict(rf, X, y, cv=skf, n_jobs=4)

print(confusion_matrix(y, predictions))
print('\n')
print(classification_report(y, predictions))
print('\n')
print(f'Accuracy score: {accuracy_score(y, predictions)*100:.2f}%')
print(f"Balanced accuracy score : {balanced_accuracy_score(y, predictions)*100:.2f}%")

[[2634  136]
 [ 253  300]]


              precision    recall  f1-score   support

           0       0.91      0.95      0.93      2770
           1       0.69      0.54      0.61       553

    accuracy                           0.88      3323
   macro avg       0.80      0.75      0.77      3323
weighted avg       0.88      0.88      0.88      3323



Accuracy score: 88.29%
Balanced accuracy score : 74.67%


### Imbalanced Random Forest

In [35]:
from imblearn.ensemble import BalancedRandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0, n_jobs=4)

brf.fit(X_train, y_train)
predictions = brf.predict(X_test)

print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))
print('\n')
print(f'Accuracy score: {accuracy_score(y_test, predictions)*100:.2f}%')
print(f"Balanced accuracy score : {balanced_accuracy_score(y_test, predictions)*100:.2f}%")

ModuleNotFoundError: No module named 'imblearn'

In [None]:
import matplotlib.pyplot as plt
import numpy as np; np.random.seed(1)
plt.rcParams["figure.figsize"] = 10,4

x_p = np.linspace(-3,3, num=len(brf.feature_importances_))
y_p = brf.feature_importances_

fig, (ax,ax2) = plt.subplots(nrows=2, sharex=True)

extent = [x_p[0]-(x_p[1]-x_p[0])/2., x_p[-1]+(x_p[1]-x_p[0])/2.,0,1]
ax.imshow(y_p[np.newaxis,:], cmap="plasma", aspect="auto", extent=extent)
ax.set_yticks([])
ax.set_xlim(extent[0], extent[1])

ax2.plot(x_p,y_p)

plt.tight_layout()
plt.show()

### Best + Hyper parameterization

In [36]:
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'log2'],
    'max_depth' : [None],
    'criterion' :['gini', 'entropy'],
    'oob_score': [True, False],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

## Feed Forward Neural Network

### Shallow neural net

In [38]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
model = Sequential()
model.add(Dense(16, input_dim=len(X[0]), activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model.fit(X_train, y_train, epochs=100, batch_size=64)

In [None]:
predictions = model.predict(X_test)
predictions = np.ravel(predictions)
predictions = [0 if x < 0.5 else 1 for x in predictions]

print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))
print('\n')
print(f'Accuracy score: {accuracy_score(y_test, predictions)*100:.2f}%')
print(f"Balanced accuracy score : {balanced_accuracy_score(y_test, predictions)*100:.2f}%")

### Somewhat deeper neural network

In [None]:
model = Sequential()
model.add(Dense(312, input_dim=len(X[0]), activation='relu'))
model.add(Dense(156, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model.fit(X_train, y_train, epochs=100, batch_size=64)

In [None]:
predictions = model.predict(X_test)
predictions = np.ravel(predictions)
predictions = [0 if x < 0.5 else 1 for x in predictions]

print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))
print('\n')
print(f'Accuracy score: {accuracy_score(y_test, predictions)*100:.2f}%')
print(f"Balanced accuracy score : {balanced_accuracy_score(y_test, predictions)*100:.2f}%")

### With parameter search

In [17]:
from __future__ import print_function
import numpy as np

from hyperopt import Trials, STATUS_OK, tpe
from keras.datasets import mnist
from keras.layers.core import Dense, Dropout, Activation
from keras.models import Sequential
from keras.utils import np_utils

from hyperas import optim
from hyperas.distributions import choice, uniform

ModuleNotFoundError: No module named 'hyperopt'

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [24]:
def create_model(x_train, y_train, x_test, y_test):
    """
    Model providing function:

    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.
    """
    model = Sequential()
    model.add(Dense(512, input_shape=(1110,)))
    model.add(Activation('relu'))
    model.add(Dropout({{uniform(0, 1)}}))
    model.add(Dense({{choice([256, 512, 1024])}}))
    model.add(Activation({{choice(['relu', 'sigmoid'])}}))
    model.add(Dropout({{uniform(0, 1)}}))

    # If we choose 'four', add an additional fourth layer
    if {{choice(['three', 'four'])}} == 'four':
        model.add(Dense(100))

        # We can also choose between complete sets of layers

        model.add({{choice([Dropout(0.5), Activation('linear')])}})
        model.add(Activation('relu'))

    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', metrics=['accuracy'],
                  optimizer={{choice(['rmsprop', 'adam', 'sgd'])}})

    result = model.fit(x_train, y_train,
              batch_size={{choice([64, 128])}},
              epochs=2,
              verbose=2,
              validation_split=0.1)
    
    #get the highest validation accuracy of the training epochs
    predictions = model.predict(X_test)
    predictions = np.ravel(predictions)
    predictions = [0 if x < 0.5 else 1 for x in predictions]
    recall = classification_report(y_test, predictions, output_dict=True)["1"]["recall"]
    
    return {'loss': -recall, 'status': STATUS_OK, 'model': model}

In [34]:
best_run, best_model = optim.minimize(model=create_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials(),
                                          notebook_name='ML_cloud_computing')

>>> Imports:
#coding=utf-8

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import json
except:
    pass

try:
    from sklearn.linear_model import LogisticRegression
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, accuracy_score
except:
    pass

try:
    from sklearn import metrics
except:
    pass

try:
    from sklearn.tree import DecisionTreeClassifier
except:
    pass

try:
    from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, accuracy_score
except:
    pass

try:
    from collections import Counter
except:
    pass

try:
    from sklearn.model_selection import cross_val_predict, cross_val_score, StratifiedKFold
except:
    pass

try:
    from sklearn.ensemble import RandomForestClassifier
except:
    pass

try:
    from imblearn.ens

TypeError: module, class, method, function, traceback, frame, or code object was expected, got dict

In [None]:
!python3 random.py