In [27]:
"""
    Nome - RGA:
    Fábio Holanda Saraiva Júnior - 2015.1905.006-2
    Felipe Salles Lopez - 2016.1907.032-4
    Lucas Avanzi - 2016.1907.024-3
    Lucas Antonio dos Santos - 2016.1907.013-8
"""

import pandas as pd
import numpy as np


from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn import model_selection

from sklearn import metrics

from sklearn.model_selection import GridSearchCV

import warnings

# Silence every library warning so the notebook output stays readable.
warnings.simplefilter('ignore')

# The four lists below are parallel: position k in each one describes the
# same dataset, in this order:
#   0 hayes-roth, 1 balance-scale, 2 blood-transfusion, 3 wine, 4 glass,
#   5 haberman, 6 zoo, 7 iris, 8 lymphography, 9 tae

# Location of each dataset file, relative to the notebook.
dataset_path = [
    "./datasets/" + relative_path
    for relative_path in (
        "hayes-roth/hayes-roth.data",
        "balance-scale/balance-scale.data",
        "blood-transfusion/transfusion.data",
        "wine/wine.data",
        "glass/glass.data",
        "haberman/haberman.data",
        "zoo/zoo.CSV",
        "iris/iris.data",
        "lymphography/lymphography.data",
        "tae/tae.data",
    )
]

# Index of the line where the examples start; used when reading the dataset.
first_class_line = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

# Column holding the class in each dataset; used to split the class from
# the attributes.
class_index = [5, 0, 4, 0, 10, 3, 17, 4, 0, 5]

# Kind of class label per dataset (numeric / string). The code that consumes
# this list treats 0 as numeric and anything else as non-numeric.
type_of_class = [0, 1, 0, 0, 0, 0, 0, 1, 0, 0]










In [28]:
def separate_class_of_parameters(np_array, index_of_class):
    """Split a 2-D dataset array into its attribute columns and its class column.

    Parameters
    ----------
    np_array : numpy.ndarray
        Two-dimensional array with one instance per row.
    index_of_class : int
        Position of the class column. It may be the first column, the last
        column, or any column in between.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(attributes, classes)``: ``attributes`` holds every column except
        ``index_of_class`` (in their original order) and ``classes`` is the
        1-D content of the class column.
    """
    # np.delete removes exactly the class column, so attribute columns on
    # BOTH sides of it are kept. Slicing with `[:, 0:index_of_class]` would
    # silently drop every attribute to the right of the class column.
    # Note: np.delete returns a new array (a copy), not a view of np_array.
    attributes = np.delete(np_array, index_of_class, axis=1)
    classes = np_array[:, index_of_class]
    return attributes, classes




In [33]:
def _build_estimator(factory, params, random_state):
    """Instantiate ``factory(**params)`` and seed it when a seed was requested.

    The seed is applied only when ``random_state`` is not None, ``params`` does
    not already set ``random_state``, and the estimator exposes that parameter.
    """
    estimator = factory(**params)
    if (random_state is not None
            and 'random_state' not in params
            and 'random_state' in estimator.get_params()):
        estimator.set_params(random_state=random_state)
    return estimator


def classification(i, random_state=None):
    """Tune, cross-validate and report five classifiers on dataset number ``i``.

    For the dataset described by position ``i`` of the module-level lists
    ``dataset_path``, ``first_class_line``, ``class_index`` and
    ``type_of_class``, this function:

    1. reads the file and splits it into attributes and class labels;
    2. runs a grid search (3-fold stratified CV, accuracy) for a decision
       tree, KNN, Gaussian naive Bayes, logistic regression and an MLP, and
       prints the best parameter set of each;
    3. re-evaluates each classifier, built with its best parameters, with a
       10-fold stratified CV, collecting the out-of-fold predictions;
    4. prints a classification report and a confusion matrix per classifier.

    Parameters
    ----------
    i : int
        Index of the dataset in the module-level configuration lists.
    random_state : int or None, optional
        Seed forwarded to both cross-validation splitters and to every
        estimator that accepts ``random_state`` and does not get one from its
        parameter grid. The default ``None`` keeps the run unseeded, so results
        vary between calls.

    Returns
    -------
    None
        All results are printed.

    Notes
    -----
    The grid search is fitted on the full dataset and the 10-fold evaluation
    then reuses that same data, so the reported scores are not computed on data
    unseen during tuning.
    """
    # `first_class_line[i]` is the index of the first line holding examples,
    # so skipping exactly that many lines and reading without a header works
    # for any number of leading non-data lines.
    pd_dataset = pd.read_csv(dataset_path[i], header=None, skiprows=first_class_line[i])
    array = np.array(pd_dataset)

    data, target = separate_class_of_parameters(array, class_index[i])

    # 10 folds: final train/predict evaluation of the tuned classifiers.
    evaluation_cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
    # 3 folds: hyper-parameter search.
    tuning_cv = model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

    # One row per classifier:
    # (result key, label in the "Started" message, label in the result
    #  message, estimator factory, parameter grid to search)
    candidates = [
        ('tree', "Decision Tree", "Decision Tree", tree.DecisionTreeClassifier,
         {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]}),
        ('knn', "KNN", "KNN", KNeighborsClassifier,
         {'n_neighbors': list(np.arange(1, 15)), 'metric': ['euclidean'],
          'weights': ['uniform', 'distance']}),
        ('naive', "Naive Bayes", "Decision Naive Bayes", GaussianNB,
         {'var_smoothing': [1e-6, 1e-7, 1e-8, 1e-9, 1e-10]}),
        # The tolerance list has no repeated value: a duplicated grid point
        # only repeats identical fits and can never change the selection.
        ('reg_log', "Logistic Regression", "Decision Logistic Regression", LogisticRegression,
         {'random_state': [0], 'tol': [1e-4, 1e-5, 1e-6]}),
        ('mlp', "MLP", "Decision MLP", MLPClassifier,
         {'hidden_layer_sizes': [(12,), (20,), (30,)], 'activation': ["logistic"],
          'max_iter': [100], 'alpha': [0.01], 'solver': ["sgd"], 'tol': [1e-9],
          'learning_rate_init': [.01], 'verbose': [False]}),
    ]

    # Tune every classifier and keep a fresh estimator configured with the
    # winning parameters (dict order fixes the order of the final reports).
    estimators = {}
    for key, tuning_label, result_label, factory, param_grid in candidates:
        print("====== Started %s parameters tuning" % tuning_label)

        grid_search = GridSearchCV(_build_estimator(factory, {}, random_state),
                                   param_grid=param_grid, cv=tuning_cv,
                                   scoring='accuracy', refit=False)
        grid_search.fit(data, target)
        best_params = grid_search.best_params_
        print("%s: %s \n\n" % (result_label, best_params))

        estimators[key] = _build_estimator(factory, best_params, random_state)

    # TODO (from the assignment description): the performance of each
    # algorithm with each parameter combination must also be reported.

    # Buffers for the out-of-fold predictions: float arrays when the class is
    # numeric (type 0), object arrays otherwise.
    n_samples = len(data)
    if type_of_class[i] == 0:
        predicted_classes = {key: np.zeros(n_samples) for key in estimators}
    else:
        predicted_classes = {key: np.empty(n_samples, dtype=object) for key in estimators}

    # Every classifier is trained and tested on the same folds.
    for train, test in evaluation_cv.split(data, target):
        data_train, target_train = data[train], target[train]
        data_test = data[test]

        for key, estimator in estimators.items():
            estimator.fit(data_train, target_train)
            predicted_classes[key][test] = estimator.predict(data_test)

    for classifier in predicted_classes.keys():
        print("======================================================================")
        print("Resultados do classificador %s\n%s\n"
              % (classifier, metrics.classification_report(target, predicted_classes[classifier])))
        print("Matriz de confusão: \n%s\n\n\n" % metrics.confusion_matrix(target, predicted_classes[classifier]))

In [43]:
classification(9)


Decision Tree: {'max_depth': 8} 


KNN: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'} 


Decision Naive Bayes: {'var_smoothing': 1e-06} 


Decision Logistic Regression: {'random_state': 0, 'tol': 0.0001} 


Decision MLP: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (30,), 'learning_rate_init': 0.01, 'max_iter': 100, 'solver': 'sgd', 'tol': 1e-09, 'verbose': False} 


Resultados do classificador tree
              precision    recall  f1-score   support

           1       0.50      0.55      0.52        49
           2       0.58      0.58      0.58        50
           3       0.70      0.63      0.67        52

    accuracy                           0.59       151
   macro avg       0.59      0.59      0.59       151
weighted avg       0.60      0.59      0.59       151


Matriz de confusão: 
[[27 13  9]
 [16 29  5]
 [11  8 33]]



Resultados do classificador knn
              precision    recall  f1-score   support

           1       0.69     