# Prep

## Imports

In [1]:
# Imports
import pandas as pd;
import numpy as np; 
import sklearn.model_selection;
import sklearn.metrics;

# OBS: Scikit-Learn
# scikit-learn uses an optimised version of the CART algorithm; however, scikit-learn implementation does not support categorical variables for now.
# https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier; # Tree



# WARNING: This api is different from the SKLEARN 
import statsmodels.api as sm; # GLM


# From Author
from RBF import RBFClassifier
from sklearn.linear_model import LogisticRegression
# Auxiliar
from collections import namedtuple


## Parameters

In [58]:
SEED = 42
TRAIN_SIZE = 0.7
data_under_analysis = "data/car_insurance_claim.csv"

independent_variables = ["KIDSDRIV", "BIRTH", 
                            "AGE", "HOMEKIDS", 
                            "YOJ", "INCOME", 
                            "PARENT1", "HOME_VAL", 
                            "MSTATUS", "GENDER", 
                            "EDUCATION", "OCCUPATION", 
                            "TRAVTIME", "CAR_USE", 
                            "BLUEBOOK", "TIF", 
                            "CAR_TYPE", "RED_CAR", 
                            "OLDCLAIM",  "REVOKED", 
                            "MVR_PTS", "CAR_AGE"]

dependent_variable_flag = ["CLAIM_FLAG"]
dependent_variable_frequency = ["CLM_FREQ"]
dependent_variable_value = ["CLM_AMT"]

CV_SPLITS = 5

GLM_TRESHOLD = 0.5

## Auxiliar Functions

In [220]:
def score_results(y_real, y_predito, label, verbose = False):
    """Função personalizada que mostra os resultados de um classificador.

    Arguments:
        y_real -- O vetor com os dados esperados
        y_predito -- O vetor com as predições
        label -- Um título para essa análise, pode ser usado para facilitar a identificação do output."""

    matriz_de_confusao = sklearn.metrics.confusion_matrix(y_true = y_real, y_pred = y_predito)
    try:
        tn, fp, fn, tp = matriz_de_confusao.ravel()
    except ValueError:
        print("ValueError: Imprimindo matriz de confusão.")
        print(matriz_de_confusao)
    
    Resultados = namedtuple("Resultados", ["balanced_accuracy","precision","recall", "f1", "confusion_matrix"])
    
    balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_true = y_real, y_pred = y_predito)
    precision = tp/(tp+fp)
    recall    = tp/(tp+fn)
    f1 = 2*(precision * recall) / (precision + recall)
    output = Resultados(balanced_accuracy = balanced_accuracy,
                       precision = precision,
                       recall    = recall,
                       f1 =  f1,
                        
                       confusion_matrix = matriz_de_confusao)

    if verbose:
        print(f"--- {label} ---")
        print("Matrix de Confusão")
        print(matriz_de_confusao)
        print("Acurácia Balanceada: ", end=" ")
        print(f"{100*output.balanced_accuracy :.2f} % ")
        print(f"Falsos Positivos: {fp}, Falsos Negativos: {fn}\n"+
            f"Verdadeiros Positivos: {tp}, Verdadeiros Negativos: {tn}")
        print(f"Precisao (tp/(tp+fp)): {100* output.precision :.2f}%")
        print(f"Recall (tp/(tp+fn)): {100* output.recall :.2f}  %")

        print("-"*80)
    return output


In [243]:
def imprime_matriz_de_confusao_media(test_results):
    tns, tps, fps, fns = list(), list(), list(), list()
    for index, result in enumerate(test_results):
        confusion_matrix = result.confusion_matrix
        #print(f"Fit {index}")
        #print(confusion_matrix)
        tnl, fpl, fnl, tpl = confusion_matrix.ravel()
        tns.append(tnl)
        fps.append(fpl)
        fns.append(fnl)
        tps.append(tpl)

    tn = np.sum(tns)
    tp = np.sum(tps)
    fp = np.sum(fps)
    fn = np.sum(fns)
    
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2*(precision*recall) / (precision+recall)
    print("Médias")
    print(f"Total Real Positives: {tp+fn :.2f}\nTotal Real Negatives: {tn+fp:.2f}\n" +
          f"Total Predicted Positives: {tp+fp:.2f}\nTotal Predicted Negatives: {tn+fn:.2f}\n" +
          "\n"
          f"True Positives: {tp:.2f}\nTrue Negatives: {tn:.2f}\nFalse Positives: {fp:.2f}\nFalse Negatives: {fn:.2f}")
    print("")
    print(f"Precision: { precision :.2f}\nRecall: { recall :.2f}")
    print(f"Accuracy Positive(Recall): {tp/(tp+fn):.2f}\nAccuracy Negative: {tn/(tn+fp):.2f}")
    print(f"Balanced Accuracy: {((tp/(tp+fn)) + (tn/(tn+fp)))/2 :.2f}")
    print(f"F1 Score: {f1 : .2f}")
    print("")
 



## Loading Dataset

In [195]:
data = pd.read_csv(data_under_analysis)

In [196]:
# Linhas / Colunas
print(f"{data.shape[0]} linhas e {data.shape[1]} colunas")
print(pd.DataFrame(data.columns))
print(data.describe())
print(data.describe().shape[1]) # Colunas Númericas


10302 linhas e 27 colunas
             0
0           ID
1     KIDSDRIV
2        BIRTH
3          AGE
4     HOMEKIDS
5          YOJ
6       INCOME
7      PARENT1
8     HOME_VAL
9      MSTATUS
10      GENDER
11   EDUCATION
12  OCCUPATION
13    TRAVTIME
14     CAR_USE
15    BLUEBOOK
16         TIF
17    CAR_TYPE
18     RED_CAR
19    OLDCLAIM
20    CLM_FREQ
21     REVOKED
22     MVR_PTS
23     CLM_AMT
24     CAR_AGE
25  CLAIM_FLAG
26  URBANICITY
                 ID      KIDSDRIV           AGE      HOMEKIDS          YOJ  \
count  1.030200e+04  10302.000000  10295.000000  10302.000000  9754.000000   
mean   4.956631e+08      0.169288     44.837397      0.720443    10.474062   
std    2.864675e+08      0.506512      8.606445      1.116323     4.108943   
min    6.317500e+04      0.000000     16.000000      0.000000     0.000000   
25%    2.442869e+08      0.000000     39.000000      0.000000     9.000000   
50%    4.970043e+08      0.000000     45.000000      0.000000    11.000000   
75%    7

## Separating Data

### X e y

In [197]:
X_all = data.loc[ : , independent_variables]
y_all = data.loc[ : , dependent_variable_flag]

### Test and Validation

In [198]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_all, y_all, train_size = TRAIN_SIZE, random_state = SEED, stratify = y_all )

In [199]:
# Creating k_folder
stratified_k_fold = sklearn.model_selection.StratifiedKFold(n_splits = CV_SPLITS, random_state = SEED )

# Data Editing

## Variáveis Numéricas

A implementação de árvore do Sklearn só aceita variáveis numéricas. 

Nessa primeira etapa vamos considerar apenas elas, mas podemos tratar as variáveis categóricas no futuro.

In [200]:
numerical_X = X_train._get_numeric_data()
print(f"Selecionando apenas variáveis numéricas.\nFormato de X de {X_train.shape} para {numerical_X.shape}")
print(f"Redução de {X_train.shape[1] - numerical_X.shape[1]} variáveis.")

Selecionando apenas variáveis numéricas.
Formato de X de (7211, 22) para (7211, 8)
Redução de 14 variáveis.


In [203]:
numerical_X.columns

Index(['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS',
       'CAR_AGE'],
      dtype='object')

In [204]:
for col in numerical_X.columns: print (f'{col}\n{numerical_X[col].unique()}') 

KIDSDRIV
[0 2 1 3 4]
AGE
[54.         48.         28.         43.         53.         40.
 41.         50.         42.         35.         51.         52.
 38.         34.         60.         49.         45.         59.
 55.         47.         37.         58.         56.         39.
 46.         22.         44.         67.         62.         36.
 30.         32.         33.         31.         26.         29.
 66.         61.         24.         73.         27.         57.
 65.         69.         25.         63.         21.         20.
 64.         23.         70.         17.         44.88717735 68.
 16.         18.         19.         72.         80.         71.        ]
HOMEKIDS
[0 2 3 1 4 5]
YOJ
[ 6.         12.         10.         14.         11.         10.44948755
 17.          5.         16.          9.          0.         13.
  8.         15.          7.          3.          4.         18.
 19.          1.          2.         23.        ]
TRAVTIME
[ 14  42  50  44  72  37  3

### Colunas Categóricas
Nenhuma colunas numéricas parecem ser uma codificação de categoria.

Vamos analisar as colunas que foram deixadas de lado

In [252]:
categorical_columns = list(set(X_train.columns) - set(numerical_X.columns))
categorical_X = X_train[categorical_columns]
categorical_X.describe()

Unnamed: 0,CAR_TYPE,OCCUPATION,BIRTH,REVOKED,GENDER,INCOME,BLUEBOOK,RED_CAR,CAR_USE,EDUCATION,HOME_VAL,MSTATUS,OLDCLAIM,PARENT1
count,7211,6745,7211,7211,7211,6823,7211,7211,7211,7211,6802,7211,7211,7211
unique,6,8,5141,2,2,5862,2671,2,2,5,4497,2,2577,2
top,z_SUV,z_Blue Collar,23AUG60,No,z_F,$0,"$1,500",no,Private,z_High School,$0,Yes,$0,No
freq,2030,1603,6,6312,3903,560,138,5130,4535,2069,2050,4312,4406,6260


As variáveis BIRTH, INCOME, BLUEBOOK, HOME_VAL, OLDCLAIM possuem muitos valores únicos. Dessas, 
INCOME, BLUEBOOK, HOMEVAL e OLDCLAIM são, na verdade, variáveis numéricas(dinheiro)

In [254]:
money_variables = ["INCOME", "BLUEBOOK", "HOME_VAL", "OLDCLAIM"]
money_X =  categorical_X[possible_numerical]
money_X

Unnamed: 0,INCOME,BLUEBOOK,HOME_VAL,OLDCLAIM
878,"$73,663","$41,310","$268,990","$36,700"
5358,"$34,669","$8,630","$146,134","$2,499"
1433,"$23,427","$6,640","$98,357","$17,308"
2769,"$54,593","$11,260","$221,267","$1,449"
3326,"$55,770","$12,380","$163,735",$0
...,...,...,...,...
152,"$104,990","$16,080","$306,540",$0
6870,"$152,283","$16,650","$441,904",$0
9663,"$8,585","$13,540",$0,"$4,451"
1431,,"$19,600","$240,520","$3,220"


In [266]:
money_X = money_X.replace('[\$,]', '', regex=True).astype(float)
print(money_X.shape)
money_X.describe()

(7211, 4)


Unnamed: 0,INCOME,BLUEBOOK,HOME_VAL,OLDCLAIM
count,6823.0,7211.0,6802.0,7211.0
mean,61485.221603,15708.502288,154265.121729,4045.439606
std,47127.897336,8447.775453,128946.594306,8749.645497
min,0.0,1500.0,0.0,0.0
25%,28116.0,9180.0,0.0,0.0
50%,53643.0,14430.0,161881.5,0.0
75%,85598.0,21015.0,237715.75,4637.0
max,367030.0,69740.0,885282.0,57037.0


In [270]:
numerical_X = pd.concat([numerical_X, money_X], sort = True)

INCOME      388
BLUEBOOK      0
HOME_VAL    409
OLDCLAIM      0
dtype: int64

In [297]:
categorical_columns = list(set(X_train.columns) - set(numerical_X.columns))
categorical_X = X_train[categorical_columns]
categorical_X.describe()

Unnamed: 0,CAR_TYPE,OCCUPATION,BIRTH,REVOKED,GENDER,INCOME,BLUEBOOK,RED_CAR,CAR_USE,EDUCATION,HOME_VAL,MSTATUS,OLDCLAIM,PARENT1
count,7211,6745,7211,7211,7211,6823,7211,7211,7211,7211,6802,7211,7211,7211
unique,6,8,5141,2,2,5862,2671,2,2,5,4497,2,2577,2
top,z_SUV,z_Blue Collar,23AUG60,No,z_F,$0,"$1,500",no,Private,z_High School,$0,Yes,$0,No
freq,2030,1603,6,6312,3903,560,138,5130,4535,2069,2050,4312,4406,6260


## Features Dataset

Esse será o dataset usado em nossas análises.

In [272]:
# Replacing NULLS with the mean
numerical_X.fillna(numerical_X.mean(), inplace=True)    
features_X = numerical_X

# Analysis Code

## Decision Tree

In [273]:
cartTree = DecisionTreeClassifier()

In [283]:
pastas = stratified_k_fold.split(features_X, y_train)

In [284]:
tree_train_results = list()
tree_test_results = list()
for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting Tree")
    cartTree.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = cartTree.predict(X_train_cv)
    tree_train_results.append(score_results(y_train_cv, predicted_train, "CART Tree - Treinamento", verbose=False))
        
    predicted_test = cartTree.predict(X_test_cv)
    tree_test_results.append(score_results(y_test_cv, predicted_test, "CART Tree - Teste", verbose=False))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 2

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 3

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

Fitting Tree
--------------------------------------------------


In [285]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_test_results])))
print("")
print("Train F1: {:.2f}% ".format(100*np.mean([result.f1 for result in tree_train_results])))
print("Test F1: {:.2f}% ".format(100*np.mean([result.f1  for result in tree_test_results])))

Train Balanced Accuraty: 99.92% 
Test Balanced Accuraty: 54.50% 

Train Precision: 100.00% 
Test Precision: 32.94% 

Train Recall: 99.84% 
Test Recall: 34.60% 

Train F1: 99.92% 
Test F1: 33.75% 


## GLM


In [286]:
pastas = stratified_k_fold.split(features_X, y_train)

In [287]:
glm_train_results = list()
glm_test_results = list()
for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Creating GLM")
    glm = sm.GLM(exog = X_train_cv, endog = y_train_cv, family = sm.families.Binomial())
    
    print("Fitting GLM")
    predictor_glm = glm.fit()
    print(predictor_glm.summary())
    print("-"*50)
    
    
    predicted_train_probs = predictor_glm.predict(X_train_cv)
    
    predicted_train = (predicted_train_probs > GLM_TRESHOLD)
    glm_train_results.append(score_results(y_train_cv, predicted_train, "GLM - Treinamento"))
        
    predicted_test_probs = predictor_glm.predict(X_test_cv)
    predicted_test =  (predicted_test_probs > GLM_TRESHOLD )
    glm_test_results.append(score_results(y_test_cv, predicted_test, "GLM - Teste", verbose = False))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Creating GLM
Fitting GLM
                 Generalized Linear Model Regression Results                  
Dep. Variable:             CLAIM_FLAG   No. Observations:                 5768
Model:                            GLM   Df Residuals:                     5760
Model Family:                Binomial   Df Model:                            7
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -3121.4
Date:                Tue, 07 Jan 2020   Deviance:                       6242.9
Time:                        09:25:23   Pearson chi2:                 5.77e+03
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------

--- GLM - Teste ---
Matrix de Confusão
[[1007   50]
 [ 335   49]]
Acurácia Balanceada:  54.02 % 
Falsos Positivos: 50, Falsos Negativos: 335
Verdadeiros Positivos: 49, Verdadeiros Negativos: 1007
Precisao (tp/(tp+fp)): 49.49%
Recall (tp/(tp+fn)): 12.76  %
--------------------------------------------------------------------------------


In [288]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_test_results])))
print("")
print("Train F1: {:.2f}% ".format(100*np.mean([result.f1 for result in glm_train_results])))
print("Test F1: {:.2f}% ".format(100*np.mean([result.f1  for result in glm_test_results])))

Train Balanced Accuraty: 55.27% 
Test Balanced Accuraty: 55.26% 

Train Precision: 58.90% 
Test Precision: 59.53% 

Train Recall: 14.11% 
Test Recall: 14.10% 

Train F1: 22.77% 
Test F1: 22.75% 


## RBF

In [289]:
import importlib
import RBF

In [229]:
# importlib.reload(RBF)

<module 'RBF' from 'C:\\Users\\barban01\\Desktop\\Projetos\\UFABC\\TCC\\projeto_pdg\\RBF.py'>

In [316]:
rbf_classifier = RBF.RBFClassifier(number_of_centers = 50, 
                               random_state = SEED, 
                               algorithm = LogisticRegression(class_weight="balanced", solver = "lbfgs"))

In [317]:
pastas = stratified_k_fold.split(features_X, y_train)

In [318]:
rbf_train_results = list()
rbf_test_results = list()
for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting RBF Network")
    rbf_classifier.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = rbf_classifier.predict(X_train_cv)
    rbf_train_results.append(score_results(y_train_cv, predicted_train, "RBF Network - Treinamento"))
        
    predicted_test = rbf_classifier.predict(X_test_cv)
    rbf_test_results.append(score_results(y_test_cv, predicted_test, "RBF Network - Teste", verbose = True))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[137 921]
 [ 39 346]]
Acurácia Balanceada:  51.41 % 
Falsos Positivos: 921, Falsos Negativos: 39
Verdadeiros Positivos: 346, Verdadeiros Negativos: 137
Precisao (tp/(tp+fp)): 27.31%
Recall (tp/(tp+fn)): 89.87  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting RBF Network


  normalization_constant = 1/(2*np.pi*variance)
  return normalization_constant * np.exp(-(dist/gamma))
  return normalization_constant * np.exp(-(dist/gamma))
  y = column_or_1d(y, warn=True)
  normalization_constant = 1/(2*np.pi*variance)
  return normalization_constant * np.exp(-(dist/gamma))
  return normalization_constant * np.exp(-(dist/gamma))
  normalization_constant = 1/(2*np.pi*variance)
  return normalization_constant * np.exp(-(dist/gamma))
  return normalization_constant * np.exp(-(dist/gamma))


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[202 856]
 [ 40 345]]
Acurácia Balanceada:  54.35 % 
Falsos Positivos: 856, Falsos Negativos: 40
Verdadeiros Positivos: 345, Verdadeiros Negativos: 202
Precisao (tp/(tp+fp)): 28.73%
Recall (tp/(tp+fn)): 89.61  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 2

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[278 780]
 [ 57 327]]
Acurácia Balanceada:  55.72 % 
Falsos Positivos: 780, Falsos Negativos: 57
Verdadeiros Positivos: 327, Verdadeiros Negativos: 278
Precisao (tp/(tp+fp)): 29.54%
Recall (tp/(tp+fn)): 85.16  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 3

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[224 834]
 [ 45 339]]
Acurácia Balanceada:  54.73 % 
Falsos Positivos: 834, Falsos Negativos: 45
Verdadeiros Positivos: 339, Verdadeiros Negativos: 224
Precisao (tp/(tp+fp)): 28.90%
Recall (tp/(tp+fn)): 88.28  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

Fitting RBF Network
--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[222 835]
 [ 41 343]]
Acurácia Balanceada:  55.16 % 
Falsos Positivos: 835, Falsos Negativos: 41
Verdadeiros Positivos: 343, Verdadeiros Negativos: 222
Precisao (tp/(tp+fp)): 29.12%
Recall (tp/(tp+fn)): 89.32  %
--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


In [319]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_test_results])))
print("")
print("Train F1: {:.2f}% ".format(100*np.mean([result.f1 for result in rbf_train_results])))
print("Test F1: {:.2f}% ".format(100*np.mean([result.f1  for result in rbf_test_results])))

Train Balanced Accuraty: 55.16% 
Test Balanced Accuraty: 54.27% 

Train Precision: 29.17% 
Test Precision: 28.72% 

Train Recall: 88.18% 
Test Recall: 88.45% 

Train F1: 43.82% 
Test F1: 43.34% 


# Results

In [320]:
print("<< TREES >>")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_test_results])))
print("F1: {:.2f} ".format(np.mean([result.f1 for result in tree_test_results])))

print("<< GLM >>")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_test_results])))
print("F1: {:.2f} ".format(np.mean([result.f1 for result in glm_test_results])))

print("<< RBF >>")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_test_results])))
print("F1: {:.2f} ".format(np.mean([result.f1 for result in rbf_test_results])))

<< TREES >>
Balanced Accuraty: 54.50% 
Precision: 32.94% 
Recall: 34.60% 
F1: 0.34 
<< GLM >>
Balanced Accuraty: 55.26% 
Precision: 59.53% 
Recall: 14.10% 
F1: 0.23 
<< RBF >>
Balanced Accuraty: 54.27% 
Precision: 28.72% 
Recall: 88.45% 
F1: 0.43 


# Análise Detalhada

In [321]:
print("<< TREE >>")
imprime_matriz_de_confusao_media(tree_test_results)
print("<< GLM >>")
imprime_matriz_de_confusao_media(glm_test_results)
print("<< RBF >>")
imprime_matriz_de_confusao_media(rbf_test_results)

<< TREE >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 2019.00
Total Predicted Negatives: 5192.00

True Positives: 665.00
True Negatives: 3935.00
False Positives: 1354.00
False Negatives: 1257.00

Precision: 0.33
Recall: 0.35
Accuracy Positive(Recall): 0.35
Accuracy Negative: 0.74
Balanced Accuracy: 0.54
F1 Score:  0.34

<< GLM >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 460.00
Total Predicted Negatives: 6751.00

True Positives: 271.00
True Negatives: 5100.00
False Positives: 189.00
False Negatives: 1651.00

Precision: 0.59
Recall: 0.14
Accuracy Positive(Recall): 0.14
Accuracy Negative: 0.96
Balanced Accuracy: 0.55
F1 Score:  0.23

<< RBF >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 5926.00
Total Predicted Negatives: 1285.00

True Positives: 1700.00
True Negatives: 1063.00
False Positives: 4226.00
False Negatives: 222.00

Precision: 