# Trabalho 1 - Machine Learning II 
Prof: Carlos Padilha

#### Alunos:  

Roberto A. Coutinho  
Thais Galho


## Sistemas com Multi-classificadores ou Ensembles

#### Este trabalho visa avaliar o entendimento em relaçãao á construção de sistemas com multi-classificadores ou ensembles. Para tal, os alunos deverão fazer o seguinte:


* Implementar o algoritmo AdaBoost (nos mesmos moldes que fizemos com o algoritmo Bagging).
    – Podem escolher qualquer tipo de classificador (MLP, SVM, etc).
* Processar os dados presente no arquivo sonar.all-data.
* Realizar treinamento e teste usando validação cruzada com 10 folds.
* Avaliar os resultados em termos de acurácia, recall e precisão.

Obs: O trabalho pode ser feito em dupla e deve ser enviado por email (carlos.engcomp@gmail.com).

In [1]:
import numpy as np
import pandas as pd

# Modelos
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# K-fold CrossValidation
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [2]:
imported_data = pd.read_csv('sonar.all-data.csv', header=None)
imported_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [3]:
# Separação entre dados e labels

labels = imported_data.iloc[:,-1]
data = imported_data.iloc[:,:-1]
len(data), len(labels)

(208, 208)

In [4]:
from sklearn.metrics import confusion_matrix

def getAccuracy(testset, predictions):
    correct = 0
    for id_test, test in enumerate(testset):
        if test == predictions[id_test]:
            correct += 1
    return (correct / float(len(testset))) * 100.0

def printCM(Y_test, predictions):
    cm = confusion_matrix(Y_test, predictions)
    print ('Confusion Matrix : ')
    print (cm)
    print
    tn = float(cm[0][0])
    fp = float(cm[0][1])
    fn = float(cm[1][0])
    tp = float(cm[1][1])

    actual_yes = fn+tp
    actual_no = tn+fp
    predicted_yes = fp+tp
    predicted_no = tn+fn

    total = float(len(imported_data))
    print ('Total : '+ str(total))

    acc = getAccuracy(Y_test, predictions) /100
    print ('Acurácia : ' + str(acc))

    misclassification_rate = round((fp+fn)/total,3) # Overall, how often is it wrong?
    print ('Misclassification rate : ' +str(misclassification_rate))

    true_positive = round(tp/actual_yes,3) # When it's actually yes, how often does it predict yes?
    print ('True positives : ' +str(true_positive))

    false_positive = round(fp/actual_no,3) # When it's actually no, how often does it predict yes?
    print ('False positives : ' +str(false_positive))

    specificity = round(tn/actual_no,3) # When it's actually no, how often does it predict no?
    print ('Specificity : ' +str(specificity))

    precision = round(tp/predicted_yes,3) # When it predicts yes, how often is it correct?
    print ('Precision : ' +str(precision))

    prevalence = round(actual_yes/total,3) # How often does the yes condition actually occur in our sample?
    print ('Prevalence : ' +str(prevalence))
    
    recall = round(tp / (tp + fn), 3)
    print ('Recall : ' +str(recall))

    #f1 = round(2 * ((precision * true_positive) / (precision + true_positive)),3)
    #print ('F1 Score : ' +str(f1))
    
    return acc, precision, recall

<h2>Separação entre treino e teste</h2>

In [5]:

# utiliza 25% do dataset para teste
trainData, validationData, trainLabels, validationLabels = train_test_split(data, labels, 
                                                    train_size=0.90, 
                                                    test_size=0.10, 
                                                    stratify=labels)

print(len(trainData), len(trainLabels))
print(len(validationData), len(validationLabels))

187 187
21 21


In [25]:
scores = []



# Define quantos folds
kf = KFold(n_splits=10)

alpha = 0.01

print()
print("....Iniciando treinamento com 10 K-folds...." )
print()

kfold = 0    
for train_index, test_index in kf.split(trainData):
    
    print("################################################")
    print("K-fold : "+str(kfold+1))    
    print("################################################")
    #print(train_index, test_index)
    print()

    # Obten os subdados de treinamento e teste no n fold
    #---------------------------------------------------------------------
    X_train, X_test = trainData.iloc[train_index,:], trainData.iloc[test_index,:]
    #print(len(X_train), len(X_test))

    y_train, y_test = trainLabels.iloc[train_index], trainLabels.iloc[test_index]
    #print(len(y_train), len(y_test))
    
    print("....Inicializando vetor de pesos....")
    print()


    n_train, n_test = len(X_train), len(y_test)
    #pred_train, pred_test = [np.zeros(n_train), np.zeros(n_test)]

    # Initialize weights
    w = np.ones(n_train) / n_train
    print(w)
        
    # Fit um classificador
    clf_tree = DecisionTreeClassifier(max_depth = 1, random_state = 1)
    # Fit um classificador
    clf_tree2 = DecisionTreeClassifier(max_depth = 1, random_state = 1)

    models = [clf_tree, clf_tree2]

    for model in models:
        print()
        # Treina o modelo de classificação
        #---------------------------------------------------------------------
        print("Treinando o modelo....")

        # Treina o classificador com os pesos de treinamento
        model.fit(X_train, y_train, sample_weight=w)
        print(clf_tree)

        # Classifica o treino
        pred_train_i = clf_tree.predict(X_train)
        #print(pred_train_i)

        # Classifica o teste
        pred_test_i = clf_tree.predict(X_test)
        #print(pred_test_i)        

        print()
        print("...:::: Avaliação ::::....  ")
        print()

        # Obtem o index dos erros da classificação de treino e teste
        #---------------------------------------------------------------------
        miss = [int(x) for x in (pred_train_i != y_train)]
        #print("Training Miss : "+str(miss))
        missTest = [int(x) for x in (pred_test_i != y_test)]
        #print("Testing Miss : "+str(missTest))

        # Equivale os valores entre 1/-1 para atualização dos pesos
        #---------------------------------------------------------------------
        miss2 = [x if x==1 else -1 for x in miss]
        #print("Training Miss2 : "+str(miss2))
        miss2Test = [x if x==1 else -1 for x in missTest]
        #print("Testing Miss2 : "+str(miss2Test))


        # Calcula o erro
        #---------------------------------------------------------------------
        err_m = np.dot(w,miss) / sum(w)
        print("Error : "+str(err_m))

        # Calcula o Alpha 
        #---------------------------------------------------------------------
        alpha_m = alpha * np.log( (1 - err_m) / float(err_m))
        print("Alpha : "+str(alpha_m))


        # Mostra a Matriz de Confusão para treino e teste
        #---------------------------------------------------------------------
        print()
        print(":: Treinamento :: ")
        print("")
        train_acc_score, train_precision_score, train_recall_score = printCM(y_train, pred_train_i)

        print()
        print(":: Teste ::")
        print()
        test_acc_score, test_precision_score, test_recall_score = printCM(y_test, pred_test_i)
        print


        # Atualiza os valores dos pesos
        #---------------------------------------------------------------------
        w = np.multiply(w, np.exp([float(x) * alpha_m for x in miss2]))
        print()
        print("Novos pesos atualizados : ")
        print(w)
        print()
        print("---------------------------------------------------------------------------")
        print()


        scores.append([kfold, train_acc_score, train_precision_score, train_recall_score, err_m, alpha_m, clf_tree])

    kfold += 1 
    print
    print


....Iniciando treinamento com 10 K-folds....

################################################
K-fold : 1
################################################

....Inicializando vetor de pesos....

[0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238 0.00595238
 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

...:::: Avaliação ::::....  

Error : 0.23076923076923042
Alpha : 0.012039728043259382

:: Treinamento :: 

Confusion Matrix : 
[[79 12]
 [27 51]]
Total : 208.0
Acurácia : 0.7692307692307694
Misclassification rate : 0.188
True positives : 0.654
False positives : 0.132
Specificity : 0.868
Precision : 0.81
Prevalence : 0.375
Recall : 0.654

:: Teste ::

Confusion Matrix : 
[[9 0]
 [5 4]]
Total : 208.0
Acurácia : 0.7222222222222221
Misclassification rate : 0.024
True positives : 0.444
False positives : 0.0
Specificity : 1.0
Precision : 1.0
Prevalence : 0.043
Recall : 0.444

Novos pesos atualizados : 
[0.00584635 0.00584635 0.00584635 0.00584635 0.005

In [23]:
display(scores)

[[0,
  0.7559523809523809,
  0.793,
  0.613,
  0.24404761904761962,
  0.011306150197542804,
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, presort=False, random_state=1,
              splitter='best')],
 [0,
  0.7559523809523809,
  0.793,
  0.613,
  0.2482434308884095,
  0.011080027193591958,
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, presort=False, random_state=1,
              splitter='best')],
 [1,
  0.7619047619047619,
  0.761,
  0.68,
  0.2380952380952387,
  0.011631508098056775,
  DecisionT

<h2>Apresentação dos Resultados</h2>

In [27]:
scoresDF = pd.DataFrame(scores, columns=["n-fold", 
                                         "Acc", 
                                         "Precision", 
                                         "Recall",
                                         "Error", 
                                         "Alpha", 
                                         "Model"])

scoresDF.sort_values(["Acc","Error"], ascending=False)

Unnamed: 0,n-fold,Acc,Precision,Recall,Error,Alpha,Model
5,2,0.797619,0.833,0.732,0.206845,0.01344,"DecisionTreeClassifier(class_weight=None, crit..."
4,2,0.797619,0.833,0.732,0.202381,0.013715,"DecisionTreeClassifier(class_weight=None, crit..."
13,6,0.791667,0.808,0.738,0.212771,0.013083,"DecisionTreeClassifier(class_weight=None, crit..."
12,6,0.791667,0.808,0.738,0.208333,0.01335,"DecisionTreeClassifier(class_weight=None, crit..."
11,5,0.785714,0.85,0.654,0.218693,0.012733,"DecisionTreeClassifier(class_weight=None, crit..."
10,5,0.785714,0.85,0.654,0.214286,0.012993,"DecisionTreeClassifier(class_weight=None, crit..."
15,7,0.775148,0.806,0.658,0.229196,0.012129,"DecisionTreeClassifier(class_weight=None, crit..."
14,7,0.775148,0.806,0.658,0.224852,0.012376,"DecisionTreeClassifier(class_weight=None, crit..."
17,8,0.769231,0.81,0.654,0.235071,0.011799,"DecisionTreeClassifier(class_weight=None, crit..."
16,8,0.769231,0.81,0.654,0.230769,0.01204,"DecisionTreeClassifier(class_weight=None, crit..."


In [57]:
bestModel = scoresDF.iloc[0, 6]

In [58]:
pred_validation = bestModel.predict(validationData)

In [59]:
printCM(validationLabels, pred_validation)

Confusion Matrix : 
[[19  9]
 [13 11]]

Total : 208.0
Acurácia : 0.576923076923
Misclassification rate : 0.106
True positives : 0.458
False positives : 0.321
Specificity : 0.679
Precision : 0.55
Prevalence : 0.115
Recall : 0.458


(0.5769230769230769, 0.55, 0.458)