# MACHINE LEARNING MODEL FOR FLOW PATTERN CLASSIFICATION


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
## Database address, enter your own Drive address.
PATH_DATA = './BDOShohamIML.csv'
path = "./" #Path for metric images

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import time as tm

#Visualizers
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
import matplotlib.pyplot as plt

#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import log_loss
from sklearn.metrics import zero_one_loss
from sklearn.metrics import matthews_corrcoef
from sklearn import model_selection

#Classifiers
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import warnings
warnings.filterwarnings('ignore')



In [4]:
def classifier_metrics():    
    def metrics(model):
        start_time = tm.time()
        model.fit(X_train, y_train)  # Fit the visualizer and the model
        TIME = tm.time() - start_time 
        print("Training Time: {0:.4f} [seconds]".format(TIME))

        start_time = tm.time()
        y_pred = model.predict(X_test)
        TIME = tm.time() - start_time 
        print("Prediction Time: {0:.4f} [seconds]".format(TIME))

        try: 
            y_prob = model.predict_proba(X_test)
            log_metric = log_loss(y_test,y_prob)
        except:
            y_prob = "Not probablistic"
            log_metric = 0 
        else:
            y_pred = model.predict(X_test)

        acc_score=accuracy_score(y_test,y_pred) 
        c_k_s=cohen_kappa_score(y_test,y_pred)
        zero_met=zero_one_loss(y_test,y_pred)
        hl=hamming_loss(y_test,y_pred)
        mc=matthews_corrcoef(y_test,y_pred)
        print('accuracy_score: {0:.4f}'.format(acc_score))
        print('cohen_kappa_score: {0:.4f}'.format(c_k_s))
        print('log_loss: {0:.4f}'.format(log_metric))
        print('zero_one_loss: {0:.4f}'.format(zero_met))
        print('hemming_loss: {0:.4f}'.format(hl))
        print('matthews_corrcoef: {0:.4f}'.format(mc))
 
    for name in classifiers:
        print (str(name))
        metrics(name)
        print()
        print ("---------------------------------------------------------------------------------\n") 

In [5]:
## Loading Data

# Velocity, Viscosity, Density, Surface Tension, Angle and Diameter
dataset = pd.DataFrame(pd.read_csv(PATH_DATA), columns=['Vsl', 'Vsg', 'VisL', 'VisG', 'DenL', 'DenG', 'ST', 'Ang', 'ID', 'Flow Pattern']) 

# Summarize the Dataset 
print("shape of initial data =",dataset.shape) 
# Class Distribution 
print(dataset.groupby('Flow Pattern').size()) 
# Leaving only the best training variables
dataset = dataset.drop(['VisG', 'VisL','DenG', 'ST', 'DenL'], axis=1) #Los quitamos
print("shape of selected data =",dataset.shape) 

print(dataset.head()) 

# Split-out validation dataset 
array = dataset.values 
X = array[:,0:4] #Data or features 
Y = array[:,4]   #Label or classes 
validation_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=validation_size, random_state=510) 

print("\ntrain data shape =",X_train.shape) 
print("train labels shape =",y_train.shape) 
print("test data shape =",X_test.shape) 
print("test labels shape =",y_test.shape) 


shape of initial data = (5675, 10)
Flow Pattern
A     1033
B      125
DB     594
I     2905
SS     140
SW     878
dtype: int64
shape of selected data = (5675, 5)
   Vsl    Vsg  Ang     ID Flow Pattern
0  6.3  0.025  0.0  0.051           DB
1  4.0  0.040  0.0  0.051           DB
2  6.3  0.040  0.0  0.051           DB
3  4.0  0.063  0.0  0.051           DB
4  6.3  0.063  0.0  0.051           DB

train data shape = (4540, 4)
train labels shape = (4540,)
test data shape = (1135, 4)
test labels shape = (1135,)


In [6]:
#classes 
classes = ['A', 'SS', 'DB', 'I', 'B', 'SW']


In [7]:
classifiers=[
ExtraTreesClassifier(n_estimators=112, max_depth=62, min_samples_split=5, random_state=28000001),
RandomForestClassifier(n_estimators=95,ccp_alpha=0.00046,criterion='entropy',random_state=277000000),
GradientBoostingClassifier(learning_rate=0.1,n_estimators=126,subsample=0.67,max_depth=4,max_features='log2',tol=0.013999999999999981),
svm.SVC(C = 59000, gamma = 0.0302),
DecisionTreeClassifier(random_state=1, min_samples_split=8),
KNeighborsClassifier(n_neighbors=1,weights= 'uniform',leaf_size=2,p=1, algorithm= 'auto', metric= 'minkowski'),
LogisticRegression(C=474, tol = 0.0),
QuadraticDiscriminantAnalysis(priors=None, reg_param=0.000000786, store_covariance=False, tol=0.01),
GaussianNB(priors=None, var_smoothing=5e-9),
AdaBoostClassifier(n_estimators=6,learning_rate=0.96,algorithm='SAMME.R'),
LinearDiscriminantAnalysis(tol = 0.0, solver = 'lsqr', n_components=1, shrinkage='auto')
]

In [8]:
#Deploy aggregate metrics 
classifier_metrics() 

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=62, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=5,
                     min_weight_fraction_leaf=0.0, n_estimators=112,
                     n_jobs=None, oob_score=False, random_state=28000001,
                     verbose=0, warm_start=False)
Training Time: 0.3628 [seconds]
Prediction Time: 0.0359 [seconds]
accuracy_score: 0.9700
cohen_kappa_score: 0.9555
log_loss: 0.1661
zero_one_loss: 0.0300
hemming_loss: 0.0300
matthews_corrcoef: 0.9555

---------------------------------------------------------------------------------

RandomForestClassifier(bootstrap=True, ccp_alpha=0.00046, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       

## Cross Validation

In [9]:
X_train_ = np.concatenate([X_train,X_test],axis=0)
y_train_ = np.concatenate([y_train,y_test],axis=0)
print('Data shape:', X_train_.shape)
print('labels shape:', y_train_.shape) 

Data shape: (5675, 4)
labels shape: (5675,)


In [10]:
models = []
models.append(('ET', ExtraTreesClassifier(n_estimators=112, max_depth=62, min_samples_split=5, random_state=28000001)))
models.append(('RF', RandomForestClassifier(n_estimators=95,ccp_alpha=0.00046,criterion='entropy',random_state=277000000)))
models.append(('GB', GradientBoostingClassifier(learning_rate=0.1,n_estimators=126,subsample=0.67,max_depth=4,max_features='log2',tol=0.013999999999999981)))
models.append(('SVM', svm.SVC(C = 59000, gamma = 0.0302)))
models.append(('DT', DecisionTreeClassifier(random_state=1, min_samples_split=8)))
models.append(('KNN',KNeighborsClassifier(n_neighbors=1,weights= 'uniform',leaf_size=2,p=1, algorithm= 'auto', metric= 'minkowski')))
models.append(('LR', LogisticRegression(C=474, tol = 0.0)))
models.append(('QDA', QuadraticDiscriminantAnalysis(priors=None, reg_param=0.000000786, store_covariance=False, tol=0.01)))
models.append(('GNB', GaussianNB(priors=None, var_smoothing=5e-9)))
models.append(('AB', AdaBoostClassifier(n_estimators=6,learning_rate=0.96,algorithm='SAMME.R')))
models.append(('LDA', LinearDiscriminantAnalysis(tol = 0.0, solver = 'lsqr', n_components=1, shrinkage='auto')))

results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=42)
    start_time = tm.time() 
    cv_results = model_selection.cross_val_score(model, X_train_, y_train_, cv=kfold, scoring='accuracy')
    TIME = tm.time() - start_time 
    print("Time: {0:.4f} [seconds]".format(TIME))
    results.append(cv_results)
    names.append(name) 
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg) 

Time: 3.9204 [seconds]
ET: 0.957892 (0.012789)
Time: 8.4658 [seconds]
RF: 0.948904 (0.006740)
Time: 33.0847 [seconds]
GB: 0.949783 (0.008025)
Time: 29.6094 [seconds]
SVM: 0.934456 (0.011644)
Time: 0.1748 [seconds]
DT: 0.919299 (0.008334)
Time: 0.2774 [seconds]
KNN: 0.887399 (0.011169)
Time: 2.9405 [seconds]
LR: 0.660615 (0.015551)
Time: 0.0830 [seconds]
QDA: 0.684233 (0.027851)
Time: 0.0864 [seconds]
GNB: 0.673658 (0.021277)
Time: 0.6637 [seconds]
AB: 0.663610 (0.016622)
Time: 0.2233 [seconds]
LDA: 0.650044 (0.008242)
