In [197]:
import os
from os import path
import pandas as pd
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
from itertools import product
import numpy as np
warnings.filterwarnings('ignore')
sns.set_theme(style="white")


from sklearn.preprocessing import  OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score


In [5]:
path = 'PAN/Data/'
paths = [path +csv for csv in os.listdir(path)]
CLEAN = [path for path in paths if 'CLN' in path]

In [29]:
data = pd.read_csv(CLEAN[0])
FEATS = list(data.columns[:9])
FEATS.remove('i_qnt')
LABELS = ['0.5','0.75']

In [15]:
HALF = [path for path in CLEAN if "_50_" in path]
SEVE = [path for path in CLEAN if "_75_" in path]
NINE = [path for path in CLEAN if "_90_" in path]

The next line will verify whether all the columns are in the same order across all DataFrames

In [26]:
FEATS_TABLE = pd.read_csv(HALF[0])[FEATS]

flag = True
for path in CLEAN:
    data = pd.read_csv(path)[FEATS]
    for i in range(data.shape[0]):
        if not data.iloc[i].equals(FEATS_TABLE.iloc[i]): 
            flag = False
flag

True

In [74]:
path_50 = 'PAN/ALL/CLN_HLT_ALL_50_qnt.csv'

if not path.exists(path_50):
    
    LABELS_HALF = [label for label in LABELS]*3
    LABELS_HALF = [label+'_'+ds[-14:-11] for label,ds in zip(LABELS_HALF,HALF*3)]
    
    CLN_HLT_ALL_50_qnt = pd.DataFrame(columns=FEATS+LABELS_HALF)
    TTI_50 = pd.read_csv(HALF[0])
    TTO_50 = pd.read_csv(HALF[1])
    WOP_50 = pd.read_csv(HALF[2])
    
    for i in range(TTI_50.shape[0]):
        FEATS_ = {feat:value for feat,value in zip(FEATS, TTI_50[FEATS].iloc[i])}
        LABELS_ = {}
        for label in LABELS_HALF:
            if "TTI" in label:
                LABELS_[label] = TTI_50[label[:-4]].iloc[i]
            elif "TTO" in label:
                LABELS_[label] = TTO_50[label[:-4]].iloc[i]
            elif "WOP" in label:
                LABELS_[label] = WOP_50[label[:-4]].iloc[i]

        FEATS_.update(LABELS_)
        CLN_HLT_ALL_50_qnt.loc[i] = FEATS_


    CLN_HLT_ALL_50_qnt.to_csv(path)

In [77]:
ohe = OneHotEncoder()
ohe_vector = ohe.fit_transform(CLN_HLT_ALL_50_qnt[LABELS_HALF]).toarray()

X_train, X_test, y_train, y_test = train_test_split(CLN_HLT_ALL_50_qnt[FEATS], ohe_vector)

In [86]:
#########################################
########## KNN CLASSIFIER ###############
#########################################

clf = KNeighborsClassifier(n_neighbors=7, algorithm='ball_tree')
m_clf = MultiOutputClassifier(clf,n_jobs=-1).fit(X_train, y_train)

y_pred = m_clf.predict(X_test)
print('*'*10,'\nKNN  SCORES\n','*'*10)
ac = accuracy_score(y_pred,y_test)
f1 = f1_score(y_pred,y_test,average='micro')
rec = recall_score(y_pred,y_test,average='micro')
prec = precision_score(y_pred,y_test,average='micro')

print(f'ACCURACY  : {ac}\nF1 SCORE  : {f1}\nRECALL    : {rec}\nPRECISION : {prec}\n')

********** 
KNN  SCORES
 **********
ACCURACY  : 0.6245695592286501
F1 SCORE  : 0.8763495975745413
RECALL    : 0.888304222860933
PRECISION : 0.8647124655647382



In [183]:
class Pipeline_multioutput():
    """
    Class which implements the usage of a single pipeline with 
    one estimator on scikit-learn's MultioutputClassifier.
    """
    
    def __init__(self, estimator):
        """
        Params:
            estimator(sklearn estimator): Estimator on which the pipeline will be
                                done
        """
        self.estimator = estimator
        self.fit_done = False
    
    def set_params(self,params,njobs=-1):
        
        """
        Function which will set the class' estimator with the multioutput classifier.
        ----------------------------------------------------------
        Params:
            params (dict): Parameters for the class' estimator.
            njobs (int): The number of jobs to run in parallel.
        """
        self.params = params
        clf = self.estimator(**self.params)
        m_clf = MultiOutputClassifier(clf,njobs)
        self.classifier = m_clf
    
    def fit(self, X,y):
        """
        Function which will fit the estimator with the multioutput classifier
        in which X is the input and y is the desired output.
        If no parameters had been set before (by using self.set_params)
        the default parameters for the estimator will be set.
        ----------------------------------------------------------
        Params:
            X (array): Input Data.
            Y (array): Target value.
            
        """
        try:
            self.classifier
        except Exception:
            try:
                clf = self.estimator()
                m_clf = MultiOutputClassifier(clf)
                self.classifier = m_clf
            except Exception:
                raise Exception('Parameters for {} must be set before'.format(self.estimator))
        
        self.classifier.fit(X_train,y_train)
        self.fit_done = True
        
    def predict(self,X):
        """
        Predict multi-output variable using a model trained.
        ------------------------------------------------------
        Params:
            X(array): Input Data.
        Returns:
            (array): Predicted values.
        """
        if not self.fit_done:
            raise Exception('The model has not been fitted.')
        return self.classifier.predict(X)
    
    def score(self,X,y,verbose=False):
        """
        Scoring for the model.
        -------------------------------
        Params:
            X (array): Input Data.
            Y (array): Target value.
            verbose(bool): If set True, each score will be shown.
        Returns:
            scores(tuple): Accuracy, F1-score, Recall and Precision
        """
        y_pred = self.predict(X)
        ac = accuracy_score(y_pred,y)
        f1 = f1_score(y_pred,y,average='micro')
        rec = recall_score(y_pred,y,average='micro')
        prec = precision_score(y_pred,y,average='micro')
        if verbose:
            print(f'ACCURACY  : {ac}\nF1 SCORE  : {f1}\nRECALL    : {rec}\nPRECISION : {prec}\n')
        
        scores = {'accuracy':ac,'f1':f1,'recall':rec,'precision':prec,'average':sum([ac,f1,rec,prec])/4}
        return scores
    
    def get_params(self):
        return self.classifier.get_params()
    
    def GridSearch(self,params,X_train,y_train,X_test,y_test,scoring='accuracy',njobs=-1,verbose=False):
        """
        Gridsearch on specified parameter space.
        -----------------------------------------
        Params:
            params(dict): Dictionary ({param:values}) where vaules is a list containing the parameters on
                         which the gridsearch will be done.
            X_train(array): Input Data for training.
            y_train(array): Target values for training.
            X_test(array): Input Data for testing.
            y_test(array): Target values for testing.
            scoring(str): Scoring method for evaluating the model. Can be choosen between: 'accuracy',
                         'f1', 'recall', 'precision' and 'average'. (Default = accuracy)
            njobs (int): The number of jobs to run in parallel.(Default=-1)
        """
        best_score = 0
        params_comb = product(*params.values())
        for values in params_comb:
            try:
                param_dict = {param:value for param,value in zip(params.keys(),values)}

                self.set_params(param_dict,njobs)
                self.fit(X_train,y_train)
                score = self.score(X_test,y_test)[scoring]
                if verbose:
                    print(param_dict)
                    print(f'{scoring} Score:{score}')
                if score > best_score:
                    best_params = self.get_params()
                    best_score = score
            except Exception:
                continue
            
        return best_params, best_score
                

Best params for KNN:

In [184]:
params = {'n_neighbors':[2,3,4,5,6,7,8,9,10], 'algorithm':['ball_tree','kd_tree'], 'metric':['hamming','minkowski']}
KNN = Pipeline_multioutput(KNeighborsClassifier)
KNN_params = KNN.GridSearch(params,X_train,y_train,X_test,y_test,scoring='average')

In [186]:
KNN_params

({'estimator__algorithm': 'ball_tree',
  'estimator__leaf_size': 30,
  'estimator__metric': 'minkowski',
  'estimator__metric_params': None,
  'estimator__n_jobs': None,
  'estimator__n_neighbors': 2,
  'estimator__p': 2,
  'estimator__weights': 'uniform',
  'estimator': KNeighborsClassifier(algorithm='ball_tree', n_neighbors=2),
  'n_jobs': -1},
 0.8929219830977282)

Best params for SVM:

In [187]:
params = {'criterion':["gini", "entropy"],"splitter":["best","random"],'max_features':['log2','auto']}
from sklearn.tree import DecisionTreeClassifier
TREE = Pipeline_multioutput(DecisionTreeClassifier)
TREE_params = TREE.GridSearch(params,X_train,y_train,X_test,y_test,scoring='average')

In [188]:
TREE_params

({'estimator__ccp_alpha': 0.0,
  'estimator__class_weight': None,
  'estimator__criterion': 'gini',
  'estimator__max_depth': None,
  'estimator__max_features': 'log2',
  'estimator__max_leaf_nodes': None,
  'estimator__min_impurity_decrease': 0.0,
  'estimator__min_impurity_split': None,
  'estimator__min_samples_leaf': 1,
  'estimator__min_samples_split': 2,
  'estimator__min_weight_fraction_leaf': 0.0,
  'estimator__random_state': None,
  'estimator__splitter': 'best',
  'estimator': DecisionTreeClassifier(max_features='log2'),
  'n_jobs': -1},
 0.946743914066129)

In [129]:
from sklearn import svm
params = {'penalty':['l1', 'l2'],}
SVM = Pipeline_multioutput(LinearSVC)


ACCURACY  : 0.6862086776859504
F1 SCORE  : 0.889886564739331
RECALL    : 0.8997799941331769
PRECISION : 0.8802083333333334



(0.6862086776859504, 0.889886564739331, 0.8997799941331769, 0.8802083333333334)

In [114]:
KNN.classifier.get_params()

{'estimator__algorithm': 'ball_tree',
 'estimator__leaf_size': 30,
 'estimator__metric': 'minkowski',
 'estimator__metric_params': None,
 'estimator__n_jobs': None,
 'estimator__n_neighbors': 7,
 'estimator__p': 2,
 'estimator__weights': 'uniform',
 'estimator': KNeighborsClassifier(algorithm='ball_tree', n_neighbors=7),
 'n_jobs': -1}

In [202]:
params = dict(penalty=['l1','l2','elasticnet'],
             C = np.linspace(6,60,num=30),
             solver=['newton-cg','lbfgs','sag'])

logistic = Pipeline_multioutput(LogisticRegression)
logistic_params = logistic.GridSearch(params,X_train,y_train,X_test,y_test,scoring='average')

In [204]:
logistic_params

({'estimator__C': 35.79310344827586,
  'estimator__class_weight': None,
  'estimator__dual': False,
  'estimator__fit_intercept': True,
  'estimator__intercept_scaling': 1,
  'estimator__l1_ratio': None,
  'estimator__max_iter': 100,
  'estimator__multi_class': 'auto',
  'estimator__n_jobs': None,
  'estimator__penalty': 'l2',
  'estimator__random_state': None,
  'estimator__solver': 'lbfgs',
  'estimator__tol': 0.0001,
  'estimator__verbose': 0,
  'estimator__warm_start': False,
  'estimator': LogisticRegression(C=35.79310344827586),
  'n_jobs': -1},
 0.5252080050459023)

In [200]:
logistic_params

({'estimator__C': 6.0,
  'estimator__class_weight': None,
  'estimator__dual': False,
  'estimator__fit_intercept': True,
  'estimator__intercept_scaling': 1,
  'estimator__l1_ratio': None,
  'estimator__max_iter': 100,
  'estimator__multi_class': 'auto',
  'estimator__n_jobs': None,
  'estimator__penalty': 'l2',
  'estimator__random_state': None,
  'estimator__solver': 'lbfgs',
  'estimator__tol': 0.0001,
  'estimator__verbose': 0,
  'estimator__warm_start': False,
  'estimator': LogisticRegression(C=6.0),
  'n_jobs': -1},
 0.5226206777048386)

In [207]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

FOREST = RandomForestClassifier()
params = {"n_estimators": scipy.stats.randint(50,200),
          'criterion':["gini", "entropy"],
          "max_depth":scipy.stats.randint(1,1000),
          'max_features':['log2','auto']}

clf = RandomizedSearchCV(FOREST, params)

search = clf.fit(X_train,y_train)

In [208]:
#######################################
########### NEURAL NETWORK ############
#######################################

NN = MLPClassifier(activation='logistic',solver='lbfgs',max_iter=10000)
NN.fit(X_train,y_train)

y_pred = NN.predict(X_test)
print('*'*10,'\nNEURAL NETWORK SCORES\n','*'*10)
ac = accuracy_score(y_pred,y_test)
f1 = f1_score(y_pred,y_test,average='micro')
rec = recall_score(y_pred,y_test,average='micro')
prec = precision_score(y_pred,y_test,average='micro')

print(f'ACCURACY  : {ac}\nF1 SCORE  : {f1}\nRECALL    : {rec}\nPRECISION : {prec}\n')

********** 
NEURAL NETWORK SCORES
 **********
ACCURACY  : 0.6893939393939394
F1 SCORE  : 0.9219001551652547
RECALL    : 0.9363540849697383
PRECISION : 0.9078856749311295



In [214]:
search.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 100,
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestClassifier(),
 'n_iter': 10,
 'n_jobs': None,
 'param_distributions': {'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen at 0x7efc5e96ba60>,
  'criterion': ['gini', 'entropy'],
  'max_depth': <scipy.stats._distn_infrastructure.rv_frozen at 0x7efc5d259400>,
  'max_features': ['log2

In [213]:

y_pred = search.predict(X_test)
print('*'*10,'\nNEURAL NETWORK SCORES\n','*'*10)
ac = accuracy_score(y_pred,y_test)
f1 = f1_score(y_pred,y_test,average='micro')
rec = recall_score(y_pred,y_test,average='micro')
prec = precision_score(y_pred,y_test,average='micro')

print(f'ACCURACY  : {ac}\nF1 SCORE  : {f1}\nRECALL    : {rec}\nPRECISION : {prec}\n')

********** 
NEURAL NETWORK SCORES
 **********
ACCURACY  : 0.9566976584022039
F1 SCORE  : 0.9909480499903092
RECALL    : 0.9915530591429516
PRECISION : 0.9903437786960514

