# THE BEST MODEL FOR FLOW PATTERN CLASSIFICATION

In [None]:
# Locations used throughout the notebook: the CSV database that is loaded
# and the directory that receives the generated figures.
figures_path = "./figures"
data_path = "../Databases/ShohamDB.csv"

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import time

#Visualizers
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
import matplotlib.pyplot as plt

#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import log_loss
from sklearn.metrics import zero_one_loss
from sklearn.metrics import matthews_corrcoef

#Classifier
from sklearn.ensemble import ExtraTreesClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Make sure the figures directory exists before any plot is saved.
# exist_ok=True replaces the separate exists() check, so there is no window
# between "check" and "create" in which another process could create it.
os.makedirs(figures_path, exist_ok=True)

In [None]:
# Confusion matrix
def CM_viz():
    """Plot and save a confusion matrix for every configured classifier.

    Reads the notebook-level globals ``names``, ``classifiers``, ``classes``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``figures_path``.
    Each classifier is fitted on the training split and scored on the test
    split, and the figure is written to ``<figures_path>/<name>_CM.pdf``.

    Returns:
        None.
    """
    my_title = "Confusion Matrix"

    def CM(model, classes, ax):
        # Bind the visualizer to the axes created for this classifier instead
        # of depending on matplotlib's implicit "current axes".
        visualizer = ConfusionMatrix(
            model, classes=classes, percent=True, title=my_title, ax=ax
        )
        visualizer.fit(X_train, y_train)  # Fit the visualizer and the model
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        # Enlarge the text annotations drawn on the axes.
        for label in visualizer.ax.texts:
            label.set_size(16)
        return visualizer.poof()

    for name, classifier in zip(names, classifiers):
        fig, ax = plt.subplots(nrows=1, ncols=1)
        try:
            CM(classifier, classes, ax)
            fig.savefig(os.path.join(figures_path, name + "_CM.pdf"))
        finally:
            # Release the figure so figures do not pile up when many
            # classifiers are evaluated.
            plt.close(fig)

# Classification report
def CR_viz():
    """Plot and save a classification report for every configured classifier.

    Reads the notebook-level globals ``names``, ``classifiers``, ``classes``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``figures_path``.
    Each classifier is fitted on the training split and scored on the test
    split; the elapsed time of both steps is printed, and the figure is
    written to ``<figures_path>/<name>_CR.pdf``.

    Returns:
        None.
    """
    my_title = "Classification Report"

    def Class_report(model, classes, ax):
        # Bind the visualizer to the axes created for this classifier instead
        # of depending on matplotlib's implicit "current axes".
        visualizer = ClassificationReport(
            model, classes=classes, support=True, title=my_title, ax=ax
        )
        # perf_counter() is a monotonic clock intended for measuring
        # intervals, unlike the wall-clock time.time().
        train_start_time = time.perf_counter()
        visualizer.fit(X_train, y_train)  # Fit the visualizer and the model
        print(f'Train runtime: {time.perf_counter()-train_start_time}')
        test_start_time = time.perf_counter()
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        print(f'Test runtime: {time.perf_counter()-test_start_time}')
        # Enlarge the text annotations drawn on the axes.
        for label in visualizer.ax.texts:
            label.set_size(16)
        return visualizer.poof()

    for name, classifier in zip(names, classifiers):
        fig, ax = plt.subplots(nrows=1, ncols=1)
        try:
            Class_report(classifier, classes, ax)
            fig.savefig(os.path.join(figures_path, name + "_CR.pdf"))
        finally:
            # Release the figure so figures do not pile up when many
            # classifiers are evaluated.
            plt.close(fig)

# Class Prediction Error
def CPE_viz():
    """Plot and save a class prediction error chart for every classifier.

    Reads the notebook-level globals ``names``, ``classifiers``, ``classes``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``figures_path``.
    Each classifier is fitted on the training split and scored on the test
    split, and the figure is written to ``<figures_path>/<name>_CPE.pdf``.

    Returns:
        None.
    """
    my_title = "Class Prediction Error"

    def CPE(model, classes, ax):
        # Bind the visualizer to the axes created for this classifier instead
        # of depending on matplotlib's implicit "current axes".
        visualizer = ClassPredictionError(
            model, classes=classes, title=my_title, ax=ax
        )
        visualizer.fit(X_train, y_train)  # Fit the visualizer and the model
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        # Enlarge the text annotations drawn on the axes.
        for label in visualizer.ax.texts:
            label.set_size(16)
        return visualizer.poof()

    for name, classifier in zip(names, classifiers):
        fig, ax = plt.subplots(nrows=1, ncols=1)
        try:
            CPE(classifier, classes, ax)
            fig.savefig(os.path.join(figures_path, name + "_CPE.pdf"))
        finally:
            # Release the figure so figures do not pile up when many
            # classifiers are evaluated.
            plt.close(fig)
                
# ROC-AUC
def ROC_viz():
    """Plot and save ROC curves for every configured classifier.

    Reads the notebook-level globals ``names``, ``classifiers``, ``classes``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``figures_path``.
    Each classifier is fitted on the training split and scored on the test
    split, and the figure is written to ``<figures_path>/<name>_ROC.pdf``.

    Returns:
        None.
    """
    my_title = "ROC Curves"

    def ROC(model, classes, ax):
        # Bind the visualizer to the axes created for this classifier instead
        # of depending on matplotlib's implicit "current axes".
        visualizer = ROCAUC(model, classes=classes, title=my_title, ax=ax)
        visualizer.fit(X_train, y_train)  # Fit the visualizer and the model
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        # Enlarge the text annotations drawn on the axes.
        for label in visualizer.ax.texts:
            label.set_size(16)
        return visualizer.poof()

    for name, classifier in zip(names, classifiers):
        fig, ax = plt.subplots(nrows=1, ncols=1)
        try:
            ROC(classifier, classes, ax)
            fig.savefig(os.path.join(figures_path, name + "_ROC.pdf"))
        finally:
            # Release the figure so figures do not pile up when many
            # classifiers are evaluated.
            plt.close(fig)


In [None]:
def classifier_metrics():
    """Fit every configured classifier and print timing and quality metrics.

    Reads the notebook-level globals ``classifiers``, ``X_train``,
    ``y_train``, ``X_test`` and ``y_test``. For each classifier it prints the
    estimator itself, the training and prediction times, and the accuracy,
    Cohen's kappa, log loss, zero-one loss, Hamming loss and Matthews
    correlation coefficient computed on the test split.

    Returns:
        None.
    """

    def metrics(model):
        # perf_counter() is a monotonic clock intended for measuring
        # intervals, unlike the wall-clock time.time().
        start_time = time.perf_counter()
        model.fit(X_train, y_train)  # Fit the model
        elapsed = time.perf_counter() - start_time
        print("Training Time: {0:.4f} [seconds]".format(elapsed))

        start_time = time.perf_counter()
        y_pred = model.predict(X_test)
        elapsed = time.perf_counter() - start_time
        print("Prediction Time: {0:.4f} [seconds]".format(elapsed))

        # Log loss needs class probabilities. When the model cannot provide
        # them (no predict_proba) or they cannot be matched to the test
        # labels, report NaN rather than 0: a log loss of 0 would read as a
        # perfect score.
        try:
            y_prob = model.predict_proba(X_test)
            # Pass the fitted label set so the probability columns line up
            # even when the test split does not contain every class.
            log_metric = log_loss(
                y_test, y_prob, labels=getattr(model, "classes_", None)
            )
        except (AttributeError, ValueError) as err:
            print("log_loss unavailable: {0}".format(err))
            log_metric = float("nan")

        acc_score = accuracy_score(y_test, y_pred)
        c_k_s = cohen_kappa_score(y_test, y_pred)
        zero_met = zero_one_loss(y_test, y_pred)
        hl = hamming_loss(y_test, y_pred)
        mc = matthews_corrcoef(y_test, y_pred)
        print('accuracy_score: {0:.4f}'.format(acc_score))
        print('cohen_kappa_score: {0:.4f}'.format(c_k_s))
        print('log_loss: {0:.4f}'.format(log_metric))
        print('zero_one_loss: {0:.4f}'.format(zero_met))
        print('hamming_loss: {0:.4f}'.format(hl))
        print('matthews_corrcoef: {0:.4f}'.format(mc))

    for classifier in classifiers:
        print(str(classifier))
        metrics(classifier)
        print()
        print("---------------------------------------------------------------------------------\n")

In [None]:
## Loading Data

# Velocity, Viscosity, Density, Surface Tension, Angle and Diameter
# Selecting the columns by label raises a KeyError when one is missing from
# the CSV, instead of silently creating an all-NaN column.
dataset = pd.read_csv(data_path)[
    ['Vsl', 'Vsg', 'VisL', 'VisG', 'DenL', 'DenG', 'ST', 'Ang', 'ID', 'Flow Pattern']
]

# Summarize the Dataset
print("shape of initial data =",dataset.shape)
# Class Distribution
print(dataset.groupby('Flow Pattern').size())
# Leaving only the best training variables
dataset = dataset.drop(['VisG', 'VisL','DenG', 'ST', 'DenL'], axis=1)  # Delete these variables
print("shape of selected data =",dataset.shape)

print(dataset.head())

# Split-out validation dataset
array = dataset.values  # kept because it is a notebook-level name
# Pick features and labels by column name rather than by position in the
# mixed-type array, so the split does not depend on column order.
X = dataset[['Vsl', 'Vsg', 'Ang', 'ID']].values  # Data or features
Y = dataset['Flow Pattern'].values               # Label or classes
validation_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=validation_size, random_state=510)

print("\ntrain data shape =",X_train.shape)
print("train labels shape =",y_train.shape)
print("test data shape =",X_test.shape)
print("test labels shape =",y_test.shape)


In [None]:
# Short names used as file-name prefixes for the saved figures; paired
# one-to-one with `classifiers` below.
names = ['ET']

# Estimators to evaluate.
classifiers = [
    ExtraTreesClassifier(
        n_estimators=112,
        max_depth=31,
        min_samples_split=5,
        random_state=28000001,
    ),
]

# Class labels handed to the visualizers.
# NOTE(review): the order of this list presumably has to match the order in
# which the visualizers arrange the fitted labels -- confirm against the
# values found in the 'Flow Pattern' column.
classes = ['A', 'SS', 'DB', 'I', 'B', 'SW']

In [None]:
# Run each visualisation in order and keep their return values, then print
# the numeric metrics.
visualization = [viz() for viz in (CM_viz, CR_viz, CPE_viz, ROC_viz)]
classifier_metrics()