# MACHINE LEARNING MODELS FOR FLOW PATTERN CLASSIFICATION


In [None]:
# Relative path of the CSV database that the data-loading cell reads with
# pd.read_csv; it is resolved against the notebook's working directory.
PATH_DATA = '../Databases/ShohamDB.csv' 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import time as tm

#Visualizers
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
import matplotlib.pyplot as plt

#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import log_loss
from sklearn.metrics import zero_one_loss
from sklearn.metrics import matthews_corrcoef
from sklearn import model_selection

#Classifiers
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
def classifier_metrics():
    """Fit and score every estimator in the global ``classifiers`` list.

    For each estimator this prints its repr, the wall-clock training and
    prediction times, and six hold-out metrics: accuracy, Cohen's kappa,
    log loss, zero-one loss, Hamming loss and the Matthews correlation
    coefficient.

    Reads the module-level names ``classifiers``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``, which must be defined before the call.
    Returns nothing; every result is written to stdout.
    """

    def metrics(model):
        """Train ``model`` on the training split and print its test metrics."""
        start_time = tm.time()
        model.fit(X_train, y_train)
        TIME = tm.time() - start_time
        print("Training Time: {0:.4f} [seconds]".format(TIME))

        start_time = tm.time()
        y_pred = model.predict(X_test)
        TIME = tm.time() - start_time
        print("Prediction Time: {0:.4f} [seconds]".format(TIME))

        # Log loss needs class probabilities. Estimators that do not expose
        # predict_proba raise AttributeError; in that case report NaN rather
        # than 0, because 0 would read as a perfect log loss.
        # Passing labels=model.classes_ keeps the probability columns aligned
        # even when the test split happens to lack one of the trained classes.
        try:
            y_prob = model.predict_proba(X_test)
            log_metric = log_loss(y_test, y_prob, labels=model.classes_)
        except (AttributeError, ValueError):
            log_metric = float('nan')

        acc_score = accuracy_score(y_test, y_pred)
        c_k_s = cohen_kappa_score(y_test, y_pred)
        zero_met = zero_one_loss(y_test, y_pred)
        hl = hamming_loss(y_test, y_pred)
        mc = matthews_corrcoef(y_test, y_pred)
        print('accuracy_score: {0:.4f}'.format(acc_score))
        print('cohen_kappa_score: {0:.4f}'.format(c_k_s))
        print('log_loss: {0:.4f}'.format(log_metric))
        print('zero_one_loss: {0:.4f}'.format(zero_met))
        print('hamming_loss: {0:.4f}'.format(hl))
        print('matthews_corrcoef: {0:.4f}'.format(mc))

    for model in classifiers:
        # The estimator's repr doubles as the section header.
        print(str(model))
        metrics(model)
        print()
        print ("---------------------------------------------------------------------------------\n") 

In [None]:
## Loading Data

# Velocity, Viscosity, Density, Surface Tension, Angle and Diameter
# Select the columns by label so that a missing or misspelled column raises
# KeyError. Passing `columns=` to the DataFrame constructor would instead
# reindex and silently create an all-NaN column.
dataset = pd.read_csv(PATH_DATA)[['Vsl', 'Vsg', 'VisL', 'VisG', 'DenL', 'DenG', 'ST', 'Ang', 'ID', 'Flow Pattern']]

# Summarize the Dataset
print("shape of initial data =",dataset.shape) 
# Class Distribution
print(dataset.groupby('Flow Pattern').size()) 
# Keep only the variables selected for training; drop the rest
dataset = dataset.drop(['VisG', 'VisL','DenG', 'ST', 'DenL'], axis=1)
print("shape of selected data =",dataset.shape) 

print(dataset.head()) 

# Split-out validation dataset
# After the drop the remaining columns are Vsl, Vsg, Ang, ID and the label
# 'Flow Pattern', in that order, so the label is always the last column.
array = dataset.values
X = array[:, :-1]  # Data or features: every column except the trailing label
Y = array[:, -1]   # Label or classes: the 'Flow Pattern' column
validation_size = 0.20  # fraction of the rows held out as the test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=validation_size, random_state=510)

print("\ntrain data shape =",X_train.shape) 
print("train labels shape =",y_train.shape) 
print("test data shape =",X_test.shape) 
print("test labels shape =",y_test.shape) 


In [None]:
# Class labels. Presumably these are the abbreviations stored in the
# 'Flow Pattern' column -- TODO confirm against the CSV.
# This list is not referenced by any other cell visible in this notebook.
classes = ['A', 'SS', 'DB', 'I', 'B', 'SW']


In [None]:
# Estimators evaluated on the hold-out split by classifier_metrics(), each
# built with fixed hyperparameters.
# NOTE(review): algorithm='SAMME.R' is deprecated in recent scikit-learn
# releases -- confirm the installed version still accepts it.
classifiers = [
    ExtraTreesClassifier(
        n_estimators=112,
        max_depth=31,
        min_samples_split=5,
        random_state=28000001,
    ),
    RandomForestClassifier(
        n_estimators=95,
        ccp_alpha=0.00046,
        criterion='entropy',
        random_state=277,
    ),
    svm.SVC(C=59000, gamma=0.0302),
    GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=126,
        max_depth=4,
    ),
    DecisionTreeClassifier(random_state=1, min_samples_split=8),
    KNeighborsClassifier(
        n_neighbors=1,
        weights='uniform',
        leaf_size=2,
        p=1,
        metric='minkowski',
    ),
    QuadraticDiscriminantAnalysis(reg_param=786e-9),
    GaussianNB(var_smoothing=5e-9),
    AdaBoostClassifier(
        n_estimators=6,
        learning_rate=0.96,
        algorithm='SAMME.R',
    ),
]


In [None]:
# Fit every estimator in `classifiers` on the training split and print its
# training/prediction times and hold-out metrics.
classifier_metrics() 

## Cross Validation

In [None]:
# Re-join the train and test splits (train rows first) so that
# cross-validation can run over the complete dataset.
X_train_ = np.concatenate((X_train, X_test))
y_train_ = np.concatenate((y_train, y_test))
print('Data shape: {}'.format(X_train_.shape))
print('labels shape: {}'.format(y_train_.shape))

In [None]:
# (short name, estimator) pairs scored with 10-fold cross-validation. The
# hyperparameters are the same as those in the `classifiers` list.
models = [
    ('ET', ExtraTreesClassifier(n_estimators=112, max_depth=31, min_samples_split=5, random_state=28000001)),
    ('RF', RandomForestClassifier(n_estimators=95, ccp_alpha=0.00046, criterion='entropy', random_state=277)),
    ('SVM', svm.SVC(C=59000, gamma=0.0302)),
    ('GB', GradientBoostingClassifier(learning_rate=0.1, n_estimators=126, max_depth=4)),
    ('DT', DecisionTreeClassifier(random_state=1, min_samples_split=8)),
    ('KNN', KNeighborsClassifier(n_neighbors=1, weights='uniform', leaf_size=2, p=1, metric='minkowski')),
    ('QDA', QuadraticDiscriminantAnalysis(reg_param=786e-9)),
    ('GNB', GaussianNB(var_smoothing=5e-9)),
    ('AB', AdaBoostClassifier(n_estimators=6, learning_rate=0.96, algorithm='SAMME.R')),
]

# KFold only uses random_state when shuffle=True. scikit-learn >= 0.24 raises
# ValueError if a seed is passed without shuffling, and earlier releases
# ignored it. Dropping the seed therefore keeps the contiguous, unshuffled
# folds this cell originally produced. The rows of X_train_ already come from
# a seeded train_test_split, so they are not in file order.
# The splitter does not depend on the model, so it is built once.
kfold = model_selection.KFold(n_splits=10)

results = []  # one array of per-fold accuracies per model
names = []    # short model names, parallel to `results`
for name, model in models:
    start_time = tm.time()
    cv_results = model_selection.cross_val_score(model, X_train_, y_train_, cv=kfold, scoring='accuracy')
    TIME = tm.time() - start_time
    print("Time: {0:.4f} [seconds]".format(TIME))
    results.append(cv_results)
    names.append(name)
    # Mean accuracy over the folds, with its standard deviation in parentheses.
    msg = "%s: %f (%f)\n" % (name, cv_results.mean(), cv_results.std())
    print(msg)