In [26]:
"""
    Nome - RGA:
    Fábio Holanda Saraiva Júnior - 2015.1905.006-2
    Felipe Salles Lopez - 2016.1907.032-4
    Lucas Avanzi - 2016.1907.024-3
    Lucas Antonio dos Santos - 2016.1907.013-8
"""

import pandas as pd
import numpy as np


from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn import model_selection

from sklearn import metrics

from sklearn.model_selection import GridSearchCV

#import pprint

import warnings

#evitar poluição visual
warnings.simplefilter('ignore')

# Per-dataset configuration, one record per dataset, transposed into four
# parallel lists that are all indexed by the same dataset number (0..9).
#
# Record layout: (path, first_class_line, class_index, type_of_class)
#   path             - location of the CSV file.
#   first_class_line - index of the line where the examples start; 0 means the
#                      first line is already an example, any other value means
#                      the first line is skipped when the file is read.
#   class_index      - column holding the class label; used to separate the
#                      class from the attributes.
#   type_of_class    - 0 when the class label is numeric, 1 when it is a string.
dataset_path, first_class_line, class_index, type_of_class = map(list, zip(
    ("./datasets/hayes-roth/hayes-roth.data",         0,  5, 0),
    ("./datasets/balance-scale/balance-scale.data",   0,  0, 1),
    ("./datasets/blood-transfusion/transfusion.data", 1,  4, 0),
    ("./datasets/wine/wine.data",                     0,  0, 0),
    ("./datasets/glass/glass.data",                   0, 10, 0),
    ("./datasets/haberman/haberman.data",             0,  3, 0),
    ("./datasets/zoo/zoo.CSV",                        0, 17, 0),
    ("./datasets/iris/iris.data",                     0,  4, 1),
    ("./datasets/lymphography/lymphography.data",     0,  0, 0),
    ("./datasets/tae/tae.data",                       0,  5, 0),
))





In [27]:
# Recebe um array contendo todo o dataset
# Retorna dois arrays, um contendo somente os atributos das instancias, o outro contendo as classes das instancias
def separate_class_of_parameters(np_array, index_of_class):
    """Split a 2-D dataset array into attribute columns and the class column.

    The class column may sit anywhere in the array (first, last or in the
    middle); every other column is kept as an attribute, in its original order.

    Args:
        np_array: 2-D NumPy array with one row per instance, holding both the
            attributes and the class label.
        index_of_class: Column index of the class label.

    Returns:
        A tuple ``(attributes, classes)`` where ``attributes`` is ``np_array``
        without the class column (a new array, not a view) and ``classes`` is
        the 1-D class column.

    Raises:
        IndexError: If ``index_of_class`` is out of bounds for ``np_array``.
    """
    # np.delete removes only the class column, so attributes located after the
    # class are preserved instead of being silently dropped.
    attributes = np.delete(np_array, index_of_class, axis=1)
    classes = np_array[:, index_of_class]
    return attributes, classes




In [36]:
def _classifier_specs(random_state=None):
    """Return the classifiers to evaluate together with their parameter grids.

    Each entry is ``(key, title, best_label, factory, param_grid)``:
    ``key`` names the classifier in the final report, ``title`` and
    ``best_label`` are the texts printed while tuning, ``factory`` builds the
    estimator from keyword parameters and ``param_grid`` is the grid searched.
    ``random_state`` is forwarded to the decision tree and the MLP.
    """
    return [
        ('tree', "Decision Tree", "Decision Tree",
         lambda **params: tree.DecisionTreeClassifier(random_state=random_state, **params),
         {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]}),
        ('knn', "KNN", "Decision KNN",
         KNeighborsClassifier,
         {'n_neighbors': list(np.arange(1, 15)), 'metric': ['euclidean'],
          'weights': ['uniform', 'distance']}),
        ('naive', "Naive Bayes", "Decision Naive Bayes",
         GaussianNB,
         {'var_smoothing': [1e-3, 1e-4, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10]}),
        ('reg_log', "Logistic Regression", "Decision Logistic Regression",
         LogisticRegression,
         {'random_state': [0], 'tol': [1e-3, 1e-4, 1e-5, 1e-6]}),
        ('mlp', "MLP", "Decision MLP",
         lambda **params: MLPClassifier(random_state=random_state, **params),
         {'hidden_layer_sizes': [(5,), (10,), (20,), (30,), (40,)], 'activation': ["logistic"],
          'max_iter': [200], 'alpha': [0.01, 0.001], 'solver': ["sgd"],
          'tol': [1e-7, 1e-8, 1e-9], 'learning_rate_init': [0.01, 0.001], 'verbose': [False]}),
    ]


def _tune_hyperparameters(estimator, param_grid, cv, data, target):
    """Grid-search ``param_grid`` for ``estimator`` by accuracy; return the fitted search."""
    # refit=False: only best_params_ is needed, the tuned model is rebuilt later.
    grid_search = GridSearchCV(estimator, param_grid=param_grid, cv=cv,
                               scoring='accuracy', refit=False)
    grid_search.fit(data, target)
    return grid_search


def _out_of_fold_predictions(classifiers, data, target, cv, numeric_class):
    """Predict every instance with a model trained on the folds that exclude it.

    All classifiers are fitted on exactly the same train/test splits so their
    reports are comparable. Returns a dict mapping classifier key to an array
    of predictions aligned with ``target``.
    """
    predictions = {}
    for key in classifiers:
        if numeric_class:
            predictions[key] = np.zeros(len(data))
        else:
            # String labels need an object array to be stored without truncation.
            predictions[key] = np.empty(len(data), dtype=object)

    for train, test in cv.split(data, target):
        data_train, target_train = data[train], target[train]
        data_test = data[test]
        for key, classifier in classifiers.items():
            classifier.fit(data_train, target_train)
            predictions[key][test] = classifier.predict(data_test)
    return predictions


def classification(i, random_state=None):
    """Tune, cross-validate and report five classifiers on dataset number ``i``.

    The dataset is read from ``dataset_path[i]``. For each classifier
    (decision tree, KNN, Gaussian naive Bayes, logistic regression, MLP) the
    hyper-parameters are chosen by a 3-fold stratified grid search on accuracy;
    the tuned classifiers are then evaluated with 10-fold stratified
    cross-validation, and a classification report plus a confusion matrix is
    printed for each of them.

    NOTE: the grid search runs on the whole dataset, i.e. on the same
    instances later used as evaluation folds, so the reported scores are
    optimistic estimates rather than a nested cross-validation result.

    Args:
        i: Index of the dataset in the module-level configuration lists
            (``dataset_path``, ``first_class_line``, ``class_index``,
            ``type_of_class``).
        random_state: Optional seed forwarded to both fold splitters, the
            decision tree and the MLP. The default ``None`` keeps the runs
            unseeded.

    Returns:
        None. All results are printed.

    Raises:
        IndexError: If ``i`` is not a valid dataset index.
    """
    # A 0 in first_class_line means the first line already holds an example;
    # otherwise the first line is consumed as a header row.
    header = None if first_class_line[i] == 0 else 'infer'
    pd_dataset = pd.read_csv(dataset_path[i], header=header)
    array = np.array(pd_dataset)

    # Separate attributes from classes.
    data, target = separate_class_of_parameters(array, class_index[i])

    # 3 folds for hyper-parameter tuning, 10 folds for the final evaluation.
    tuning_cv = model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
    evaluation_cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

    specs = _classifier_specs(random_state)

    best_params = {}
    for key, title, best_label, factory, param_grid in specs:
        print("====== Started %s parameters tuning" % title)
        grid_search = _tune_hyperparameters(factory(), param_grid, tuning_cv, data, target)
        best_params[key] = grid_search.best_params_
        print("Performance:\n %s \n\n" % (grid_search.cv_results_,))
        print("%s: %s \n\n" % (best_label, best_params[key]))

    # Fresh estimators configured with the parameters selected above.
    classifiers = {key: factory(**best_params[key]) for key, _, _, factory, _ in specs}

    predicted_classes = _out_of_fold_predictions(
        classifiers, data, target, evaluation_cv, numeric_class=(type_of_class[i] == 0))

    for classifier in predicted_classes:
        print("======================================================================")
        print("Resultados do classificador %s\n%s\n"
              % (classifier, metrics.classification_report(target, predicted_classes[classifier])))
        print("Matriz de confusão: \n%s\n\n\n" % metrics.confusion_matrix(target, predicted_classes[classifier]))

    # TODO: compute the standard deviation of the scores.
    
    

In [42]:
# Select a number >= 0 and <= 9 identifying which dataset will be classified.
#
# 0 - Hayes-Roth & Hayes-Roth (1977) Database
#
# 1 - Balance Scale Weight & Distance Database
#
# 2 - Blood Transfusion Service Center Data Set
#     Citation Request:
#     NOTE: Reuse of this database is unlimited with retention of copyright
#     notice for Prof. I-Cheng Yeh and the following published paper:
#     Yeh, I-Cheng, Yang, King-Jang, and Ting, Tao-Ming, "Knowledge discovery
#     on RFM model using Bernoulli sequence, "Expert Systems with Applications,
#     2008 (doi:10.1016/j.eswa.2008.07.018).
#
# 3 - Wine recognition data
#
# 4 - Glass Identification Database
#
# 5 - Haberman's Survival Data
#
# 6 - Zoo database
#
# 7 - Iris Plants Database
#
# 8 - Lymphography Domain
#     Citation Request:
#     This lymphography domain was obtained from the University Medical Centre,
#     Institute of Oncology, Ljubljana, Yugoslavia.  Thanks go to M. Zwitter and
#     M. Soklic for providing the data.  Please include this citation if you
#     plan to use this database.
#
# 9 - Teaching Assistant Evaluation

classification(9)

# All datasets can also be classified by uncommenting the lines below.
# (Kept as comments rather than a bare string so the cell does not echo it
# as its output value.)
#for dataset_index in range(10):
#    classification(dataset_index)


Performance:
 {'mean_fit_time': array([0.00087849, 0.00061226, 0.00070469, 0.00058619, 0.00069825,
       0.00108933, 0.00077184, 0.00100032]), 'std_fit_time': array([6.93521721e-05, 3.81206382e-05, 1.93114274e-04, 5.16535684e-05,
       9.58878174e-05, 1.56845845e-04, 1.30942287e-04, 3.17468123e-04]), 'mean_score_time': array([0.00059374, 0.00040428, 0.00044258, 0.00035079, 0.00049877,
       0.00064786, 0.00052826, 0.00053692]), 'std_score_time': array([3.95575311e-05, 2.99670646e-05, 1.26519395e-04, 1.16562602e-05,
       1.18094477e-04, 1.28952693e-04, 1.59774883e-04, 1.56919398e-04]), 'param_max_depth': masked_array(data=[3, 4, 5, 6, 7, 8, 9, 10],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 3}, {'max_depth': 4}, {'max_depth': 5}, {'max_depth': 6}, {'max_depth': 7}, {'max_depth': 8}, {'max_depth': 9}, {'max_depth': 10}], 'split0_test_score': array([0.52941176, 0.41176471, 0.47

Performance:
 {'mean_fit_time': array([0.03907379, 0.04323117, 0.0441637 , 0.04565485]), 'std_fit_time': array([0.00046674, 0.00417591, 0.0010828 , 0.002063  ]), 'mean_score_time': array([0.00038632, 0.00066312, 0.0005881 , 0.00042176]), 'std_score_time': array([5.39947756e-06, 2.17955908e-04, 1.53890200e-04, 5.38386247e-05]), 'param_random_state': masked_array(data=[0, 0, 0, 0],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_tol': masked_array(data=[0.001, 0.0001, 1e-05, 1e-06],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'random_state': 0, 'tol': 0.001}, {'random_state': 0, 'tol': 0.0001}, {'random_state': 0, 'tol': 1e-05}, {'random_state': 0, 'tol': 1e-06}], 'split0_test_score': array([0.52941176, 0.52941176, 0.52941176, 0.52941176]), 'split1_test_score': array([0.52, 0.52, 0.52, 0.52]), 'split2_test_score': array([0.44, 0.44, 0.44, 0.44]), 'mean_test_score': a

Resultados do classificador tree
              precision    recall  f1-score   support

           1       0.65      0.69      0.67        49
           2       0.59      0.58      0.59        50
           3       0.62      0.60      0.61        52

    accuracy                           0.62       151
   macro avg       0.62      0.62      0.62       151
weighted avg       0.62      0.62      0.62       151


Matriz de confusão: 
[[34  7  8]
 [10 29 11]
 [ 8 13 31]]



Resultados do classificador knn
              precision    recall  f1-score   support

           1       0.66      0.59      0.62        49
           2       0.59      0.54      0.56        50
           3       0.56      0.65      0.60        52

    accuracy                           0.60       151
   macro avg       0.60      0.60      0.60       151
weighted avg       0.60      0.60      0.60       151


Matriz de confusão: 
[[29  8 12]
 [ 8 27 15]
 [ 7 11 34]]



Resultados do classificador naive
              p

'\nPode-se tambem classificar todos os datasets descomentando as linhas a seguir\n'