In [57]:
import os
from os import path
import pandas as pd
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
from itertools import product
import numpy as np
warnings.filterwarnings('ignore')


import scipy.stats
from sklearn.preprocessing import  OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [36]:
class Pipeline_multioutput():
    """
    Class which implements the usage of a single pipeline with 
    one estimator on scikit-learn's MultioutputClassifier.
    """
    
    def __init__(self, estimator):
        """
        Params:
            estimator(sklearn estimator): Estimator on which the pipeline will be
                                done
        """
        self.estimator = estimator
        self.fit_done = False
    
    def set_params(self,params,njobs=-1):
        
        """
        Function which will set the class' estimator with the multioutput classifier.
        ----------------------------------------------------------
        Params:
            params (dict): Parameters for the class' estimator.
            njobs (int): The number of jobs to run in parallel.
        """
        self.params = params
        clf = self.estimator(**self.params)
        m_clf = MultiOutputClassifier(clf)
        self.classifier = m_clf
    
    def fit(self, X,y):
        """
        Function which will fit the estimator with the multioutput classifier
        in which X is the input and y is the desired output.
        If no parameters had been set before (by using self.set_params)
        the default parameters for the estimator will be set.
        ----------------------------------------------------------
        Params:
            X (array): Input Data.
            Y (array): Target value.
            
        """
        try:
            self.classifier
        except Exception:
            try:
                clf = self.estimator()
                m_clf = MultiOutputClassifier(clf)
                self.classifier = m_clf
            except Exception:
                raise Exception('Parameters for {} must be set before'.format(self.estimator))
        
        self.classifier.fit(X_train,y_train)
        self.fit_done = True
        
    def predict(self,X):
        """
        Predict multi-output variable using a model trained.
        ------------------------------------------------------
        Params:
            X(array): Input Data.
        Returns:
            (array): Predicted values.
        """
        if not self.fit_done:
            raise Exception('The model has not been fitted.')
        return self.classifier.predict(X)
    
    def score(self,X,y,verbose=False):
        """
        Scoring for the model.
        -------------------------------
        Params:
            X (array): Input Data.
            Y (array): Target value.
            verbose(bool): If set True, each score will be shown.
        Returns:
            scores(tuple): Accuracy, F1-score, Recall and Precision
        """
        y_pred = self.predict(X)
        ac = accuracy_score(y_pred,y)
        f1 = f1_score(y_pred,y,average='micro')
        rec = recall_score(y_pred,y,average='micro')
        prec = precision_score(y_pred,y,average='micro')
        if verbose:
            print(f'ACCURACY  : {ac}\nF1 SCORE  : {f1}\nRECALL    : {rec}\nPRECISION : {prec}\n')
        
        scores = {'accuracy':ac,'f1':f1,'recall':rec,'precision':prec,'average':sum([ac,f1,rec,prec])/4}
        return scores
    
    def get_params(self):
        return self.classifier.get_params()
    
    def GridSearch(self,params,X_train,y_train,X_test,y_test,scoring='accuracy',njobs=-1,verbose=False):
        """
        Gridsearch on specified parameter space.
        -----------------------------------------
        Params:
            params(dict): Dictionary ({param:values}) where vaules is a list containing the parameters on
                         which the gridsearch will be done.
            X_train(array): Input Data for training.
            y_train(array): Target values for training.
            X_test(array): Input Data for testing.
            y_test(array): Target values for testing.
            scoring(str): Scoring method for evaluating the model. Can be choosen between: 'accuracy',
                         'f1', 'recall', 'precision' and 'average'. (Default = accuracy)
            njobs (int): The number of jobs to run in parallel.(Default=-1)
        """
        best_score = 0
        params_comb = product(*params.values())
        for values in params_comb:
            try:
                param_dict = {param:value for param,value in zip(params.keys(),values)}

                self.set_params(param_dict,njobs)
                self.fit(X_train,y_train)
                score = self.score(X_test,y_test)[scoring]
                if verbose:
                    print(param_dict)
                    print(f'{scoring} Score:{score}')
                if score > best_score:
                    best_params = self.get_params()
                    best_score = score
                    clf = self.classifier
                    
            except Exception:
                continue
            
        return clf, best_params, best_score

In [58]:
def model_selection(X_train,y_train,X_test,y_test):
    """
    
    """
 
    #########################################
    ########### KNN CLASSIFIER ##############
    #########################################
    params = {'n_neighbors':[2,3,4,5,6,7,8,9,10], 'algorithm':['ball_tree','kd_tree'], 
              'metric':['hamming','minkowski']}
    KNN = Pipeline_multioutput(KNeighborsClassifier)
    KNN_params = KNN.GridSearch(params,X_train,y_train,X_test,y_test,scoring='average',verbose=0)
    KNN_clf = KNN_params[0].fit(X_train,y_train)
    
    y_pred = KNN_clf.predict(X_test)
    print('*'*10,'\nKNN SCORES\n','*'*10)
    ac = accuracy_score(y_pred,y_test)
    f1 = f1_score(y_pred,y_test,average='micro')
    rec = recall_score(y_pred,y_test,average='micro')
    prec = precision_score(y_pred,y_test,average='micro')
    
    print(f'ACCURACY  : {ac}\nF1 SCORE  : {f1}\nRECALL    : {rec}\nPRECISION : {prec}\n')
    KNN_score = (ac+f1+rec+prec)/4
    
    #########################################
    ############# DECISION TREE #############
    #########################################
    params = {'criterion':["gini", "entropy"],"splitter":["best","random"],
              'max_features':['log2','auto']}
    TREE = Pipeline_multioutput(DecisionTreeClassifier)
    TREE_params = TREE.GridSearch(params,X_train,y_train,X_test,y_test,scoring='average')
    TREE_clf = TREE_params[0].fit(X_train,y_train)
    
    y_pred = TREE_clf.predict(X_test)
    print('*'*10,'\nDECISIOM TREE SCORES\n','*'*10)
    ac = accuracy_score(y_pred,y_test)
    f1 = f1_score(y_pred,y_test,average='micro')
    rec = recall_score(y_pred,y_test,average='micro')
    prec = precision_score(y_pred,y_test,average='micro')

    print(f'ACCURACY  : {ac}\nF1 SCORE  : {f1}\nRECALL    : {rec}\nPRECISION : {prec}\n')
    TREE_score = (ac+f1+rec+prec)/4
    
    #########################################
    ########## LOGISTIC REGRESSION ##########
    #########################################
    params = {"penalty":['l1','l2','elasticnet'],"C":np.linspace(6,60,num=30),
                 "solver":['newton-cg','lbfgs','sag']}
    LOGISTIC = Pipeline_multioutput(LogisticRegression)
    LOGISTIC_params = LOGISTIC.GridSearch(params,X_train,y_train,X_test,y_test,scoring='average')
    LOGISTIC_clf = LOGISTIC_params[0].fit(X_train,y_train)
    
    y_pred = LOGISTIC_clf.predict(X_test)
    print('*'*10,'\nLOGISTIC REGRESSION SCORES\n','*'*10)
    ac = accuracy_score(y_pred,y_test)
    f1 = f1_score(y_pred,y_test,average='micro')
    rec = recall_score(y_pred,y_test,average='micro')
    prec = precision_score(y_pred,y_test,average='micro')

    print(f'ACCURACY  : {ac}\nF1 SCORE  : {f1}\nRECALL    : {rec}\nPRECISION : {prec}\n')
    LOGISTIC_score = (ac+f1+rec+prec)/4
    
    #///USING RANDOMIZED GRIDSEARCH
    
    #########################################
    ############# RANDOM FOREST #############
    #########################################   
    params = {"n_estimators": scipy.stats.randint(50,200),
          'criterion':["gini", "entropy"],
          "max_depth":scipy.stats.randint(1,1000),
          'max_features':['log2','auto']}
    FOREST = RandomForestClassifier()
    FOREST_params = RandomizedSearchCV(FOREST, params)
    search = FOREST_params.fit(X_train,y_train)
    
    y_pred = search.predict(X_test)
    print('*'*10,'\nRANDOM FOREST SCORES\n','*'*10)
    ac = accuracy_score(y_pred,y_test)
    f1 = f1_score(y_pred,y_test,average='micro')
    rec = recall_score(y_pred,y_test,average='micro')
    prec = precision_score(y_pred,y_test,average='micro')

    print(f'ACCURACY  : {ac}\nF1 SCORE  : {f1}\nRECALL    : {rec}\nPRECISION : {prec}\n')
    FOREST_score = (ac+f1+rec+prec)/4
    
    
    #######################################
    ########### NEURAL NETWORK ############
    #######################################
    NN = MLPClassifier(activation='logistic',solver='lbfgs',max_iter=10000)
    NN.fit(X_train,y_train)

    y_pred = NN.predict(X_test)
    print('*'*10,'\nNEURAL NETWORK SCORES\n','*'*10)
    ac = accuracy_score(y_pred,y_test)
    f1 = f1_score(y_pred,y_test,average='micro')
    rec = recall_score(y_pred,y_test,average='micro')
    prec = precision_score(y_pred,y_test,average='micro')

    print(f'ACCURACY  : {ac}\nF1 SCORE  : {f1}\nRECALL    : {rec}\nPRECISION : {prec}\n')
    NN_score = (ac+f1+rec+prec)/4
    
    best_score = max([KNN_score,TREE_score,LOGISTIC_score,FOREST_score,NN_score])
    
    if best_score == KNN_score:
        return KNN_clf
    if best_score == TREE_score:
        return TREE_clf
    if best_score == LOGISTIC_score:
        return LOGISTIC_clf
    if best_score == FOREST_score:
        return FOREST_clf
    if best_score == NN_score:
        return NN_clf

In [59]:
#from sklearn import svm
#params = {'penalty':['l1', 'l2'],}
#SVM = Pipeline_multioutput(LinearSVC)

In [60]:
####################
#### HALF ##########
####################

CLN_HLT_ALL_50_qnt = pd.read_csv("PAN/ALL/CLN_HLT_ALL_50_qnt.csv",index_col=0)
FEATS = list(CLN_HLT_ALL_50_qnt.columns[:8])
LABELS = list(CLN_HLT_ALL_50_qnt.columns[8:])

In [61]:
ohe = OneHotEncoder()
ohe_vector = ohe.fit_transform(CLN_HLT_ALL_50_qnt[LABELS]).toarray()

X_train, X_test, y_train, y_test = train_test_split(CLN_HLT_ALL_50_qnt[FEATS], ohe_vector)

In [None]:
model_selection(X_train,y_train,X_test,y_test)

********** 
KNN SCORES
 **********
ACCURACY  : 0.7727272727272727
F1 SCORE  : 0.9318665507628667
RECALL    : 0.9753404028361674
PRECISION : 0.8921028466483012

********** 
DECISIOM TREE SCORES
 **********
ACCURACY  : 0.8398760330578512
F1 SCORE  : 0.9812885636389725
RECALL    : 0.9813730555077206
PRECISION : 0.9812040863177227

********** 
LOGISTIC REGRESSION SCORES
 **********
ACCURACY  : 0.16606404958677687
F1 SCORE  : 0.6411724334187047
RECALL    : 0.76941464786469
PRECISION : 0.5495724288337924

********** 
RANDOM FOREST SCORES
 **********
ACCURACY  : 0.9568698347107438
F1 SCORE  : 0.991234933921021
RECALL    : 0.9918686339017627
PRECISION : 0.9906020431588614

