####Módulos

In [3]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

###Processo KDD - _Knowledge Discovery in Databases_

Composto pelas etapas de:

*   Seleção
*   Pré-Processamento
*   Transformação
*   Mineração de dados
*   Interpretação dos resultados



In [61]:
class GetData():
    def __init__(self):
        self.__df = None
        
    def select_data(self):
        path_1 = "https://archive.ics.uci.edu/ml/machine-learning-databases/" \
               "wine-quality/winequality-red.csv"

        
        path_2 = "https://archive.ics.uci.edu/ml/machine-learning-databases/" \
               "wine-quality/winequality-white.csv"
        
        df1 = pd.read_csv(path_1, delimiter=";")
        df1["Tipo"] = "Tinto"
        
        df2 = pd.read_csv(path_2, delimiter=";")
        df2["Tipo"] = "Branco"

        df = pd.merge(df1,df2,how="outer")

        dict = {'fixed acidity':'Acidez fixa',
                'volatile acidity':'Acidez volátil',
                'citric acid':'Ácido cítrico',
                'residual sugar':'Açúcar residual',
                'chlorides':'Cloretos',
                'free sulfur dioxide':'Dióxido de enxofre livre',
                'total sulfur dioxide':'Dióxido de enxofre total',
                'density':'Densidade',
                'sulphates':'Sulfatos',
                'alcohol':'Álcool',
                'quality':'Qualidade'}

        df.rename(columns=dict, inplace=True)

        self.__df = df
           


    def get_df(self):
        return self.__df

In [62]:
def apply_stratified_holdout(df: pd.DataFrame) -> tuple:
    
    features = ['Acidez fixa',
                'Acidez volátil',
                'Ácido cítrico',
                'Açúcar residual',
                'Cloretos',
                'Dióxido de enxofre livre',
                'Dióxido de enxofre total',
                'Densidade',
                'pH',
                'Sulfatos',
                'Álcool',
                'Qualidade']

    target = 'Tipo'
    
    X = df.drop(target, axis=1)
    Y = df[target]

    x_train, x_test, y_train, y_test = train_test_split(X, Y, 
                                                        test_size=0.3,
                                                        stratify=Y) 
                                                        
    
    train_test_set = x_train, x_test, y_train, y_test
        
    return train_test_set

data = GetData()
df = data.select_data()
df = data.get_df()

nn_architectures = [
                    
        {"max_iter" : 100,
         "hidden_layer_sizes" : (10),
         "solver" : "lbfgs",
         "activation" : "relu"},

        {"max_iter" : 300,
         "hidden_layer_sizes" : (10,10,10),
         "solver" : "adam",
         "activation" : "logistic"}

    ]   


def scaling(train, test):
       
    scaler = StandardScaler().fit(train)
    x_train_scaled = scaler.transform(train)
    x_test_scaled = scaler.transform(test)
    
    return x_train_scaled, x_test_scaled
    

def model_training(arch, x, y):
    
    mlp_clf = MLPClassifier(max_iter = arch["max_iter"], 
                            hidden_layer_sizes = arch["hidden_layer_sizes"],
                            solver = arch["solver"],
                            activation = arch["activation"])
    
    model = mlp_clf.fit(x, y)
    
    return model



def show_results():

    num_executions = 10
    bases = ["Base de treino", "Base de teste"]
    iterations = list(range(num_executions))

    architectures = ["Tipo 1", "Tipo 2"]
    measures_labels = ["Matriz de confusão",
                        "Sensibilidade",
                        "Especificidade",
                        "Confiabilidade positiva",
                        "Confiabilidade negativa",
                        "Acurácia total"
        ]

    rows_multi_index = pd.MultiIndex.from_product((bases, iterations), names=["Base", "Execução"])
    columns_multi_index = pd.MultiIndex.from_product((architectures, measures_labels), names=["Arquitetura", "Medida"])

    df_measures = pd.DataFrame(index=rows_multi_index, columns=columns_multi_index)
    
    for i in range(num_executions):
        for arch in nn_architectures:

            x_train, x_test, y_train, y_test = apply_stratified_holdout(df)
            x_train, x_test = scaling(x_train, x_test) 

            trained_model = model_training(arch, x_train, y_train)

            features_dict = {"Base de treino": x_train, 
                             "Base de teste": x_test}
            
            target_dict = {"Base de treino": y_train,
                           "Base de teste": y_test}

            for base in bases:
                y_pred = trained_model.predict(features_dict[base])
                
                confusion_matrix = metrics.confusion_matrix(target_dict[base], y_pred)
                true_negatives, false_positives, false_negatives, true_positives = confusion_matrix.ravel()
                
                sensitivity = true_positives / (true_positives + false_negatives)
                specificity = true_negatives / (true_negatives + false_positives)
                positive_predictive_value = true_positives / (true_positives + false_positives)
                negative_predictive_value = true_negatives / (true_negatives + false_negatives)
                accuracy = metrics.accuracy_score(target_dict[base], y_pred)
                
                measures = np.array([sensitivity, 
                                     specificity, 
                                     positive_predictive_value, 
                                     negative_predictive_value, 
                                     accuracy])
                
                df_measures.loc[(base, i), 
                               (f"Tipo {nn_architectures.index(arch) + 1}", 
                                "Matriz de confusão")] = str(np.matrix(confusion_matrix)).replace("\n", ",").replace("[ ", "[")
                
                df_measures.loc[(base, i), (f"Tipo {nn_architectures.index(arch) + 1}", measures_labels[1:])] = measures
                
            for label in measures_labels[1:]:
                df_measures.loc[:, (f"Tipo {nn_architectures.index(arch) + 1}", label)] = pd.to_numeric(
                    df_measures.loc[:, (f"Tipo {nn_architectures.index(arch) + 1}", label)]
                )
        
    return df_measures


results = show_results()      
results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0_level_0,Arquitetura,Tipo 1,Tipo 1,Tipo 1,Tipo 1,Tipo 1,Tipo 1,Tipo 2,Tipo 2,Tipo 2,Tipo 2,Tipo 2,Tipo 2
Unnamed: 0_level_1,Medida,Matriz de confusão,Sensibilidade,Especificidade,Confiabilidade positiva,Confiabilidade negativa,Acurácia total,Matriz de confusão,Sensibilidade,Especificidade,Confiabilidade positiva,Confiabilidade negativa,Acurácia total
Base,Execução,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Base de treino,0,"[[3427 1], [ 0 1119]]",1.0,0.999708,0.999107,1.0,0.99978,"[[3423 5], [ 11 1108]]",0.99017,0.998541,0.995508,0.996797,0.996481
Base de treino,1,"[[3426 2], [ 0 1119]]",1.0,0.999417,0.998216,1.0,0.99956,"[[3425 3], [ 12 1107]]",0.989276,0.999125,0.997297,0.996509,0.996701
Base de treino,2,"[[3426 2], [ 0 1119]]",1.0,0.999417,0.998216,1.0,0.99956,"[[3424 4], [ 13 1106]]",0.988382,0.998833,0.996396,0.996218,0.996261
Base de treino,3,"[[3427 1], [ 0 1119]]",1.0,0.999708,0.999107,1.0,0.99978,"[[3423 5], [ 13 1106]]",0.988382,0.998541,0.9955,0.996217,0.996041
Base de treino,4,"[[3426 2], [ 2 1117]]",0.998213,0.999417,0.998213,0.999417,0.99912,"[[3423 5], [ 14 1105]]",0.987489,0.998541,0.995495,0.995927,0.995821
Base de treino,5,"[[3427 1], [ 1 1118]]",0.999106,0.999708,0.999106,0.999708,0.99956,"[[3425 3], [ 11 1108]]",0.99017,0.999125,0.9973,0.996799,0.996921
Base de treino,6,"[[3428 0], [ 2 1117]]",0.998213,1.0,1.0,0.999417,0.99956,"[[3424 4], [ 12 1107]]",0.989276,0.998833,0.9964,0.996508,0.996481
Base de treino,7,"[[3428 0], [ 0 1119]]",1.0,1.0,1.0,1.0,1.0,"[[3424 4], [ 15 1104]]",0.986595,0.998833,0.99639,0.995638,0.995821
Base de treino,8,"[[3427 1], [ 0 1119]]",1.0,0.999708,0.999107,1.0,0.99978,"[[3426 2], [ 10 1109]]",0.991063,0.999417,0.9982,0.99709,0.997361
Base de treino,9,"[[3428 0], [ 1 1118]]",0.999106,1.0,1.0,0.999708,0.99978,"[[3425 3], [ 6 1113]]",0.994638,0.999125,0.997312,0.998251,0.998021


In [51]:
def get_average_std():
    rows = ["Base de treino", "Base de teste"]
    columns = ["Tipo 1", "Tipo 2"]

    accuracy_mean_1_train = results.loc[(rows[0]), ("Tipo 1", ["Acurácia total"])].mean().to_numpy()
    accuracy_std_1_train = results.loc[(rows[0]), ("Tipo 1", ["Acurácia total"])].std().to_numpy()

    accuracy_mean_1_test = results.loc[(rows[1]), ("Tipo 1", ["Acurácia total"])].mean().to_numpy()
    accuracy_std_1_test = results.loc[(rows[1]), ("Tipo 1", ["Acurácia total"])].std().to_numpy()

    accuracy_mean_2_train = results.loc[(rows[0]), ("Tipo 2", ["Acurácia total"])].mean().to_numpy()
    accuracy_std_2_train = results.loc[(rows[0]), ("Tipo 2", ["Acurácia total"])].std().to_numpy()

    accuracy_mean_2_test = results.loc[(rows[1]), ("Tipo 2", ["Acurácia total"])].mean().to_numpy()
    accuracy_std_2_test = results.loc[(rows[1]), ("Tipo 2", ["Acurácia total"])].std().to_numpy()

    cell_11 = f"{accuracy_mean_1_train * 100}% ± {accuracy_std_1_train * 100}%"
    cell_12 = f"{accuracy_mean_1_test * 100}% ± {accuracy_std_1_test * 100}%"
    cell_21 = f"{accuracy_mean_2_train* 100}% ± {accuracy_std_2_train * 100}%"
    cell_22 = f"{accuracy_mean_2_test* 100}% ± {accuracy_std_2_test * 100}%"

    acc_data = [[cell_11, cell_12],
               [cell_21, cell_22]]

    df_average_std = pd.DataFrame(acc_data,index=rows, columns=columns, dtype=np.object_)

    df_average_std.index.name = "Base"
    df_average_std.columns.name = "Arquitetura"

               
    return df_average_std

get_average_std()

Arquitetura,Tipo 1,Tipo 2
Base,Unnamed: 1_level_1,Unnamed: 2_level_1
Base de treino,[99.94721795]% ± [0.02781859]%,[99.47179487]% ± [0.16404914]%
Base de teste,[99.63932263]% ± [0.06150921]%,[99.56923077]% ± [0.11130818]%
