# MACHINE LEARNING MODELS FOR MPF CLASSIFICATION



In [None]:
# Location of the input CSV database; replace it with the path to your own copy.
data_path = '../Databases/12DB_6FP.csv' 
# Directory in which the figures produced by this notebook are saved.
figures_path = './figures_unbalanced' 

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import time as tm
from sklearn.utils import resample

#Visualizers
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
import matplotlib.pyplot as plt

#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import log_loss
from sklearn.metrics import zero_one_loss
from sklearn.metrics import matthews_corrcoef
from sklearn import model_selection

#Classifiers
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Make sure the figures directory exists; exist_ok=True replaces the separate
# existence check, so there is no window between checking and creating.
os.makedirs(figures_path, exist_ok=True)

# Metrics

In [None]:
def classifier_metrics():
    """Fit each model in the global ``classifiers`` list and print its metrics.

    The function takes no arguments: it reads the notebook-level globals
    ``classifiers``, ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    For every model it prints the training and prediction times, the hold-out
    metrics, the 10-fold cross-validated accuracy and the accumulated run time.
    """

    def cross_validation(model, X_train, X_test, y_train, y_test, n_splits=10):
        """Print the k-fold accuracy on train+test and return the seconds spent."""
        global TIME3
        X_all = np.concatenate([X_train, X_test], axis=0)
        y_all = np.concatenate([y_train, y_test], axis=0)
        # The folds are not shuffled, so no random_state is passed: recent
        # scikit-learn releases raise ValueError when random_state is given
        # without shuffle=True.
        kfold = model_selection.KFold(n_splits=n_splits)
        start_time = tm.time()
        cv_results = model_selection.cross_val_score(
            model, X_all, y_all, cv=kfold, scoring='accuracy')
        elapsed = tm.time() - start_time
        # Still published as a global in case other cells read it.
        TIME3 = elapsed
        print('cross_validation_score: {:.3f} ± {:.3f}'.format(cv_results.mean(), cv_results.std()))
        return elapsed

    def metrics(model):
        """Fit *model*, evaluate it on the hold-out split and print every metric."""
        start_time = tm.time()
        model.fit(X_train, y_train)
        fit_time = tm.time() - start_time
        print("Training Time: {0:.3f} [seconds]".format(fit_time))

        start_time = tm.time()
        y_pred = model.predict(X_test)
        predict_time = tm.time() - start_time
        print("Prediction Time: {0:.3f} [seconds]\n".format(predict_time))

        # Log loss needs class probabilities. When the model cannot provide
        # them (or they do not match the labels) report NaN rather than 0,
        # because 0 would read as a perfect score.
        try:
            log_metric = log_loss(y_test, model.predict_proba(X_test))
        except (AttributeError, NotImplementedError, ValueError):
            log_metric = float("nan")

        # Hold-out metrics computed from the predicted labels.
        acc_score = accuracy_score(y_test, y_pred)
        c_k_s = cohen_kappa_score(y_test, y_pred)
        zero_met = zero_one_loss(y_test, y_pred)
        hl = hamming_loss(y_test, y_pred)
        mc = matthews_corrcoef(y_test, y_pred)
        print('accuracy_score: {0:.3f}'.format(acc_score))
        print('cohen_kappa_score: {0:.3f}'.format(c_k_s))
        print('log_loss: {0:.3f}'.format(log_metric))
        print('zero_one_loss: {0:.3f}'.format(zero_met))
        print('hemming_loss: {0:.3f}'.format(hl))
        print('matthews_corrcoef: {0:.3f}'.format(mc))

        # Cross-validation over the concatenated train and test data.
        cv_time = cross_validation(model, X_train, X_test, y_train, y_test, n_splits=10)

        # Total time: fit + predict + cross-validation.
        print("run_time: {0:.3f} [seconds]".format(fit_time + predict_time + cv_time))

    for model in classifiers:
        print(str(model))
        metrics(model)
        print()
        print("---------------------------------------------------------------------------------\n")

In [None]:
#Confusion matrix
def CM_viz():
    """Draw and save a percentage confusion matrix for every classifier.

    Reads the notebook-level globals ``names``, ``classifiers``, ``classes``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``figures_path``.
    Each figure is written to ``<figures_path>/<name>_CM.pdf``.
    """
    my_title = "Confusion Matrix"

    def CM(model, classes, ax):
        """Fit *model* inside the visualizer, score it on the test split and draw it."""
        visualizer = ConfusionMatrix(model, ax=ax, classes=classes, percent=True, title=my_title)
        visualizer.fit(X_train, y_train)  # Fit the visualizer and the model
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        return visualizer.poof()

    for name, classifier in zip(names, classifiers):
        fig, ax = plt.subplots(nrows=1, ncols=1)
        CM(classifier, classes, ax)
        # os.path.join puts the file inside figures_path; plain concatenation
        # glued the name onto the directory name and saved next to it.
        fig.savefig(os.path.join(figures_path, name + "_CM.pdf"), bbox_inches='tight')

# Classification report
def CR_viz():
    """Draw and save a classification report (with support) for every classifier.

    Reads the notebook-level globals ``names``, ``classifiers``, ``classes``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``figures_path``.
    The fit and score times are printed, and each figure is written to
    ``<figures_path>/<name>_CR.pdf``.
    """
    my_title = "Classification Report"

    def Class_report(model, classes, ax):
        """Fit and score *model* inside the visualizer, printing both runtimes."""
        visualizer = ClassificationReport(model, ax=ax, classes=classes, support=True, title=my_title)
        train_start_time = tm.time()
        visualizer.fit(X_train, y_train)  # Fit the visualizer and the model
        print(f'Train runtime: {tm.time()-train_start_time}')
        test_start_time = tm.time()
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        print(f'Test runtime: {tm.time()-test_start_time}')
        return visualizer.poof()

    for name, classifier in zip(names, classifiers):
        fig, ax = plt.subplots(nrows=1, ncols=1)
        Class_report(classifier, classes, ax)
        # os.path.join puts the file inside figures_path; plain concatenation
        # glued the name onto the directory name and saved next to it.
        fig.savefig(os.path.join(figures_path, name + "_CR.pdf"), bbox_inches='tight')

#Class Prediction Error
def CPE_viz():
    """Draw and save a class-prediction-error chart for every classifier.

    Reads the notebook-level globals ``names``, ``classifiers``, ``classes``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``figures_path``.
    Each figure is written to ``<figures_path>/<name>_CPE.pdf``.
    """
    my_title = "Class Prediction Error"

    def CPE(model, classes, ax):
        """Fit *model* inside the visualizer, score it on the test split and draw it."""
        visualizer = ClassPredictionError(model, ax=ax, classes=classes, title=my_title)
        visualizer.fit(X_train, y_train)  # Fit the visualizer and the model
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        return visualizer.poof()

    for name, classifier in zip(names, classifiers):
        fig, ax = plt.subplots(nrows=1, ncols=1)
        CPE(classifier, classes, ax)
        # os.path.join puts the file inside figures_path; plain concatenation
        # glued the name onto the directory name and saved next to it.
        fig.savefig(os.path.join(figures_path, name + "_CPE.pdf"), bbox_inches='tight')
                
#ROC-AUC
def ROC_viz():
    """Draw and save the ROC curves for every classifier.

    Reads the notebook-level globals ``names``, ``classifiers``, ``classes``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``figures_path``.
    Each figure is written to ``<figures_path>/<name>_ROC.pdf``.
    """
    my_title = "ROC Curves"

    def ROC(model, classes, ax):
        """Fit *model* inside the visualizer, score it on the test split and draw it."""
        visualizer = ROCAUC(model, ax=ax, classes=classes, title=my_title)
        visualizer.fit(X_train, y_train)  # Fit the visualizer and the model
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        return visualizer.poof()

    for name, classifier in zip(names, classifiers):
        fig, ax = plt.subplots(nrows=1, ncols=1)
        ROC(classifier, classes, ax)
        # os.path.join puts the file inside figures_path; plain concatenation
        # glued the name onto the directory name and saved next to it.
        fig.savefig(os.path.join(figures_path, name + "_ROC.pdf"), bbox_inches='tight')

# Loading the database

In [None]:
# Read the whole CSV database into a DataFrame.
filename = data_path
data = pd.read_csv(filename)

In [None]:
data
#np.set_printoptions(suppress=True) 

In [None]:
# Number of rows per FlowPattern class, followed by the shape of the table.
print(data.groupby('FlowPattern').size()) 
print("data shape =",data.shape) 

In [None]:
# Pie chart of the share of each flow-pattern class, saved as a PDF.
fig, ax = plt.subplots()
# Global matplotlib style: Arial sans-serif font, grey text/labels/ticks, size 12.
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['text.color'] = '#909090'
plt.rcParams['axes.labelcolor']= '#909090'
plt.rcParams['xtick.color'] = '#909090'
plt.rcParams['ytick.color'] = '#909090'
plt.rcParams['font.size']=12
# NOTE(review): the class counts are hard-coded instead of being computed from
# `data`, and the label order here ends with "B","I" whereas patterns_labels in
# the later cells ends with "I","B" -- confirm which order matches the data.
cantidades = [816,582,1093,1664,153,4721]
nombres = ["DB","SS","SW","A","B","I"]
# Six shades of blue, one per class; reused by the bar charts further down.
color_palette_list = ['#009ACD', '#ADD8E6', '#63D1F4', '#0EBFE9',   
                      '#C1F0F6', '#0099CC']
# Percentages are drawn outside the wedges (pctdistance > 1), labels further out.
ax.pie(cantidades, labels=nombres,  
       colors=color_palette_list[0:], autopct='%1.0f%%', 
       shadow=False, startangle=0,   
       pctdistance=1.2,labeldistance=1.4)
ax.axis('equal')
ax.set_ylabel("Flow Pattern Dataset")
ax.legend(frameon=False, bbox_to_anchor=(1.3,0.8))
# Leave room on the right of the figure for the legend.
ax.figure.subplots_adjust(right=0.8)
ax.figure.savefig(figures_path+"/data_distribution_12DB.pdf")

In [None]:
# Bar chart of the number of rows per class in the original data.
# NOTE(review): patterns_labels is assigned positionally to the sorted class
# values, so it assumes exactly six classes in this order -- confirm against
# the label encoding (the pie-chart cell lists the last two as "B","I").
patterns_labels = ["DB","SS","SW","A","I","B"]
# Raw label array; not used again in the visible part of the notebook.
correct_Pattern_labels = data['FlowPattern'].values
patterns_counts = data['FlowPattern'].value_counts().sort_index()
patterns_counts.index = patterns_labels
patterns_counts.plot(kind='bar',color=color_palette_list,
                         title='Original data: distribution of data by classes')
patterns_counts
plt.savefig(figures_path+"/original_data.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Keep the per-class counts of the original data for the comparison plots below.
original_data = patterns_counts

In [None]:
original_data

In [None]:
data
#np.set_printoptions(suppress=True) 

# How to handle imbalanced data
1. Up-sample Minority Class
Up-sampling is the process of randomly duplicating observations from the minority class in order to reinforce its signal.

There are several heuristics for doing so, but the most common way is to simply resample with replacement.

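First, we'll import the resampling module from Scikit-Learn (already done in the imports cell at the top of this notebook: `from sklearn.utils import resample`).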

Next, we'll create a new DataFrame with an up-sampled minority class. Here are the steps:

First, we'll separate observations from each class into different DataFrames.
Next, we'll resample the minority class with replacement, setting the number of samples to match that of the majority class.
Finally, we'll combine the up-sampled minority class DataFrame with the original majority class DataFrame.

In [None]:
# Class sizes and table shape of the original data, before up-sampling.
print(data.groupby('FlowPattern').size()) 
print("data shape =",data.shape) 

In [None]:
def up_sample_minority_class(data,name_column_labels = "FlowPattern"):
    """Return *data* with every class up-sampled to the size of the largest class.

    Args:
        data: DataFrame holding the samples and a class-label column.
        name_column_labels: name of the class-label column.

    Returns:
        A new DataFrame with the classes concatenated in sorted label order.
        Classes that already have the largest size are kept unchanged; every
        smaller class is resampled with replacement (fixed seed 64) up to
        that size.
    """
    if data.empty:
        # Nothing to balance; avoids concatenating an empty list below.
        return data.copy()

    # Group by the column named by the argument, so any label values work
    # (not only 0..k-1) and no class is emitted twice.
    target_size = data.groupby(name_column_labels).size().max()
    balanced_parts = []
    for _, class_rows in data.groupby(name_column_labels):
        if len(class_rows) < target_size:
            class_rows = resample(class_rows, replace=True, n_samples=target_size, random_state=64)
        balanced_parts.append(class_rows)
    return pd.concat(balanced_parts)


# Build the up-sampled dataset from the original data.
data_upsampled = up_sample_minority_class(data,name_column_labels = "FlowPattern")
# Display new class counts
data_upsampled.FlowPattern.value_counts()

In [None]:
# Bar chart of the number of rows per class after up-sampling.
# NOTE(review): patterns_labels is assigned positionally to the sorted class
# values, so it assumes exactly six classes in this order -- confirm against
# the label encoding.
patterns_labels = ["DB","SS","SW","A","I","B"]
# Raw label array; not used again in the visible part of the notebook.
correct_Pattern_labels = data_upsampled['FlowPattern'].values
patterns_counts = data_upsampled['FlowPattern'].value_counts().sort_index()
patterns_counts.index = patterns_labels
patterns_counts.plot(kind='bar',color=color_palette_list,
                         title='Upsampled data: distribution of data by classes')
patterns_counts
plt.savefig(figures_path+"/upsampled_data.pdf", bbox_inches='tight')
plt.show() 

In [None]:
# Keep the per-class counts of the up-sampled data for the comparison plots below.
data_up = patterns_counts
data_up

In [None]:
data_upsampled 

In [None]:
# Separate the target column (y) from the input features (X) of the
# up-sampled data, then show summary statistics of the features.
y = data_upsampled.FlowPattern
X = data_upsampled.drop(columns=['FlowPattern']) 
X.describe() 

In [None]:
# Quick visual check of the up-sampled target and feature tables.
print(y)
print(X)

In [None]:
# Hold out 20% of the up-sampled data for testing (fixed seed 64).
# NOTE(review): data_upsampled was built by sampling rows with replacement
# before this split, so copies of the same row can land in both the training
# and the test set -- consider splitting first and up-sampling only the
# training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=64) 

## Machine Learning Algorithms

In [None]:
######################### BEST MODELS SELECTED FOR UP-SAMPLING
# classifier_metrics() reads this global list together with the current
# X_train / X_test / y_train / y_test split.
classifiers=[
ExtraTreesClassifier(random_state=121,n_estimators=100, max_depth=42),
RandomForestClassifier(random_state=16, max_depth = 35),
GradientBoostingClassifier(max_depth=10,n_estimators=111,random_state=58)
]

# Fit every model and print its metrics.
classifier_metrics() 

# How to handle imbalanced data
2. Down-sample Majority Class
Down-sampling involves randomly removing observations from the majority class to prevent its signal from dominating the learning algorithm.

The most common heuristic for doing so is resampling without replacement.

The process is similar to that of up-sampling. Here are the steps:

First, we'll separate observations from each class into different DataFrames.
Next, we'll resample the majority class without replacement, setting the number of samples to match that of the minority class.
Finally, we'll combine the down-sampled majority class DataFrame with the original minority class DataFrame.

In [None]:
# Class sizes and table shape of the original data, before down-sampling.
print(data.groupby('FlowPattern').size()) 
print("data shape =",data.shape) 

In [None]:
def dowm_sample_majority_class(data,name_column_labels = "FlowPattern"):
    """Return *data* with every class down-sampled to the size of the smallest class.

    Args:
        data: DataFrame holding the samples and a class-label column.
        name_column_labels: name of the class-label column.

    Returns:
        A new DataFrame with the classes concatenated in sorted label order.
        Classes that already have the smallest size are kept unchanged; every
        larger class is resampled without replacement (fixed seed 64) down to
        that size.
    """
    if data.empty:
        # Nothing to balance; avoids concatenating an empty list below.
        return data.copy()

    # Group by the column named by the argument, so any label values work
    # (not only 0..k-1) and no class is emitted twice.
    target_size = data.groupby(name_column_labels).size().min()
    balanced_parts = []
    for _, class_rows in data.groupby(name_column_labels):
        if len(class_rows) > target_size:
            class_rows = resample(class_rows, replace=False, n_samples=target_size, random_state=64)
        balanced_parts.append(class_rows)
    return pd.concat(balanced_parts)


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
down_sample_majority_class = dowm_sample_majority_class


# Build the down-sampled dataset from the original data.
data_downsampled = dowm_sample_majority_class(data,name_column_labels = "FlowPattern")
# Display new class counts
data_downsampled.FlowPattern.value_counts()

In [None]:
# Bar chart of the number of rows per class after down-sampling.
# NOTE(review): patterns_labels is assigned positionally to the sorted class
# values, so it assumes exactly six classes in this order -- confirm against
# the label encoding.
patterns_labels = ["DB","SS","SW","A","I","B"]
# Raw label array; not used again in the visible part of the notebook.
correct_Pattern_labels = data_downsampled['FlowPattern'].values
patterns_counts = data_downsampled['FlowPattern'].value_counts().sort_index()
patterns_counts.index = patterns_labels
patterns_counts.plot(kind='bar',color=color_palette_list,
                         title='Downsampled data: distribution of data by classes')
patterns_counts
plt.savefig(figures_path+"/downsampled_data.pdf", bbox_inches='tight')
plt.show() 

In [None]:
# Keep the per-class counts of the down-sampled data for the comparison plots below.
data_down = patterns_counts
data_down

In [None]:
data_downsampled

In [None]:
# Separate the target column (y) from the input features (X) of the
# down-sampled data, then show summary statistics of the features.
y = data_downsampled.FlowPattern
X = data_downsampled.drop(columns=['FlowPattern']) 
X.describe() 

In [None]:
# Quick visual check of the down-sampled target and feature tables.
print(y)
print(X)

In [None]:
# Hold out 20% of the down-sampled data for testing (fixed seed 64).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=64) 

## Machine Learning Algorithms

In [None]:
######################### BEST MODELS SELECTED FOR DOWNSAMPLING
# classifier_metrics() reads this global list together with the current
# X_train / X_test / y_train / y_test split.
classifiers=[
ExtraTreesClassifier(random_state=67, n_estimators=62, max_depth=32),
RandomForestClassifier(random_state=37, max_depth=22),
GradientBoostingClassifier(max_depth=4,n_estimators=148,random_state=10),
]

# Fit every model and print its metrics.
classifier_metrics() 

In [None]:
# Re-declares the palette with the same six colours as the earlier definition.
color_palette_list = ['#009ACD', '#ADD8E6', '#63D1F4', '#0EBFE9',   
                      '#C1F0F6', '#0099CC']

In [None]:
# Plot of the three class distributions (up-sampled, original, down-sampled).
plt.figure()

patterns_labels = ["DB","SS","SW","A","I","B"]

# The three bar series are drawn on the same axes in this order, so each
# later series is painted over the previous one.
data_up.index = patterns_labels

data_up.plot(kind='bar',color='#C1F0F6',label='Up-sampled')

original_data.index = patterns_labels
original_data.plot(kind='bar',color='#0EBFE9', label='Original')

data_down.index = patterns_labels
data_down.plot(kind='bar',color='#009ACD', label='Down-sampled')

# Legend placed above the axes, in a single row of three entries.
plt.legend(loc='center left', bbox_to_anchor=(0.14, 1.05), shadow=True, ncol=3)

plt.savefig(figures_path+"/Fig3.pdf", bbox_inches='tight')
plt.savefig(figures_path+"/Fig3.svg", bbox_inches='tight')
plt.show()

In [None]:
# Plot of the three class distributions (up-sampled, original, down-sampled),
# drawn with yellow / blue / red bars.
# NOTE(review): this cell saves to the same Fig3.pdf / Fig3.svg paths as the
# previous cell, so it overwrites that cell's output -- confirm which colour
# scheme is the intended Fig3.
plt.figure()

patterns_labels = ["DB","SS","SW","A","I","B"]

# The three bar series are drawn on the same axes in this order, so each
# later series is painted over the previous one.
data_up.index = patterns_labels

data_up.plot(kind='bar',color="yellow",label='Up-sampled')

original_data.index = patterns_labels
original_data.plot(kind='bar',color="blue", label='Original')

data_down.index = patterns_labels
data_down.plot(kind='bar',color="red", label='Down-sampled')

# Legend placed above the axes, in a single row of three entries.
plt.legend(loc='center left', bbox_to_anchor=(0.14, 1.05), shadow=True, ncol=3)

plt.savefig(figures_path+"/Fig3.pdf", bbox_inches='tight')
plt.savefig(figures_path+"/Fig3.svg", bbox_inches='tight')
plt.show()

# Imbalanced data using SMOTE
## Synthetic Minority Over-sampling Technique
---



In [None]:
#https://towardsdatascience.com/how-to-effortlessly-handle-class-imbalance-with-python-and-smote-9b715ca8e5a7

In [None]:
# Class sizes and table shape of the original data, before SMOTE.
print(data.groupby('FlowPattern').size()) 
print("data shape =",data.shape) 

In [None]:
# Separate the target column (y) from the input features (X) of the original
# (unbalanced) data, then show summary statistics of the features.
y = data.FlowPattern
X = data.drop(columns=['FlowPattern']) 
X.describe() 

In [None]:
from imblearn.over_sampling import SMOTE 

# Seeded SMOTE sampler, so the resampled data is reproducible.
sm = SMOTE(random_state=45)

# Resample the full original dataset (X, y).
X_sm, y_sm = sm.fit_resample(X, y)

# Report how many rows the resampling added.
print(f'''Shape of X before SMOTE: {X.shape}
Shape of X after SMOTE: {X_sm.shape}''')


In [None]:
y_sm.shape

In [None]:
# Histogram of the class labels after SMOTE resampling.
_ = plt.hist(y_sm, bins='auto')  # arguments are passed to np.histogram
plt.title("Histogram for data using SMOTE")
plt.show() 

In [None]:
# Hold out 20% of the SMOTE-resampled data for testing (fixed seed 64).
# NOTE(review): SMOTE was fitted on the full dataset before this split, so
# the test rows are not independent of the training rows -- consider
# splitting first and resampling only the training part.
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=64) 

In [None]:
#BEST CLASSIFIERS SELECTED FOR SMOTE
# classifier_metrics() reads this global list together with the current
# X_train / X_test / y_train / y_test split.
classifiers=[
ExtraTreesClassifier(random_state=24,n_estimators=42),
RandomForestClassifier(random_state=83,max_depth=32),
GradientBoostingClassifier(max_depth=10, n_estimators=140, random_state=4)
]

# Fit every model and print its metrics.
classifier_metrics() 


# Imbalanced data using ADASYN

In [None]:
from imblearn.over_sampling import ADASYN

# fit_resample is the resampling entry point also used for SMOTE in this
# notebook; the old fit_sample name is no longer available in recent
# imbalanced-learn releases. The seed makes the resampled data reproducible.
X_adasyn, y_adasyn = ADASYN(random_state=45).fit_resample(X, y)

# Report how many rows the resampling added.
print(f'''Shape of X before ADASYN: {X.shape}
Shape of X after ADASYN: {X_adasyn.shape}''')


In [None]:
# Shape of the ADASYN-resampled target (previously showed the SMOTE target y_sm).
y_adasyn.shape

In [None]:
# Histogram of the class labels after ADASYN resampling; the SMOTE labels
# (y_sm) were plotted here before, under the ADASYN title.
_ = plt.hist(y_adasyn, bins='auto')  # arguments are passed to np.histogram
plt.title("Histogram for data using ADASYN")
plt.show()

In [None]:
# Hold out 20% of the ADASYN-resampled data for testing (fixed seed 64).
# The split previously used X_sm / y_sm, so the ADASYN models were trained
# and evaluated on the SMOTE data instead.
X_train, X_test, y_train, y_test = train_test_split(X_adasyn, y_adasyn, test_size=0.2, random_state=64)

In [None]:
# Models evaluated in the ADASYN section. classifier_metrics() reads this
# global list together with the current X_train / X_test / y_train / y_test split.
classifiers=[
ExtraTreesClassifier(random_state=97,n_estimators=94),
RandomForestClassifier(random_state=18,max_depth = 35),
GradientBoostingClassifier(max_depth=9,n_estimators=120,random_state=74)
]

# Fit every model and print its metrics.
classifier_metrics() 
