# Prep

## Imports

In [1]:
# Imports
import pandas as pd;
import numpy as np; 
import sklearn.model_selection;
import sklearn.metrics;

# OBS: Scikit-Learn
# scikit-learn uses an optimised version of the CART algorithm; however, scikit-learn implementation does not support categorical variables for now.
# https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier; # Tree



# WARNING: This api is different from the SKLEARN 
import statsmodels.api as sm; # GLM


# From Author
from RBF import RBFClassifier
from sklearn.linear_model import LogisticRegression
# Auxiliar
from collections import namedtuple


## Parameters

In [2]:
SEED = 42
TRAIN_SIZE = 0.7
data_under_analysis = "data/car_insurance_claim.csv"

independent_variables = ["KIDSDRIV", "BIRTH", 
                            "AGE", "HOMEKIDS", 
                            "YOJ", "INCOME", 
                            "PARENT1", "HOME_VAL", 
                            "MSTATUS", "GENDER", 
                            "EDUCATION", "OCCUPATION", 
                            "TRAVTIME", "CAR_USE", 
                            "BLUEBOOK", "TIF", 
                            "CAR_TYPE", "RED_CAR", 
                            "OLDCLAIM",  "REVOKED", 
                            "MVR_PTS", "CAR_AGE"]

dependent_variable_flag = ["CLAIM_FLAG"]
dependent_variable_frequency = ["CLM_FREQ"]
dependent_variable_value = ["CLM_AMT"]

CV_SPLITS = 5

GLM_TRESHOLD = 0.5

## Auxiliar Functions

In [3]:
def score_results(y_real, y_predito, label, verbose = False):
    """Função personalizada que mostra os resultados de um classificador.

    Arguments:
        y_real -- O vetor com os dados esperados
        y_predito -- O vetor com as predições
        label -- Um título para essa análise, pode ser usado para facilitar a identificação do output."""

    matriz_de_confusao = sklearn.metrics.confusion_matrix(y_true = y_real, y_pred = y_predito)
    try:
        tn, fp, fn, tp = matriz_de_confusao.ravel()
    except ValueError:
        print("ValueError: Imprimindo matriz de confusão.")
        print(matriz_de_confusao)
    
    Resultados = namedtuple("Resultados", ["balanced_accuracy","precision","recall", "f1", "confusion_matrix"])
    
    balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_true = y_real, y_pred = y_predito)
    precision = tp/(tp+fp)
    recall    = tp/(tp+fn)
    f1 = 2*(precision * recall) / (precision + recall)
    output = Resultados(balanced_accuracy = balanced_accuracy,
                       precision = precision,
                       recall    = recall,
                       f1 =  f1,
                        
                       confusion_matrix = matriz_de_confusao)

    if verbose:
        print(f"--- {label} ---")
        print("Matrix de Confusão")
        print(matriz_de_confusao)
        print("Acurácia Balanceada: ", end=" ")
        print(f"{100*output.balanced_accuracy :.2f} % ")
        print(f"Falsos Positivos: {fp}, Falsos Negativos: {fn}\n"+
            f"Verdadeiros Positivos: {tp}, Verdadeiros Negativos: {tn}")
        print(f"Precisao (tp/(tp+fp)): {100* output.precision :.2f}%")
        print(f"Recall (tp/(tp+fn)): {100* output.recall :.2f}  %")

        print("-"*80)
    return output


In [4]:
def imprime_matriz_de_confusao_media(test_results):
    tns, tps, fps, fns = list(), list(), list(), list()
    for index, result in enumerate(test_results):
        confusion_matrix = result.confusion_matrix
        #print(f"Fit {index}")
        #print(confusion_matrix)
        tnl, fpl, fnl, tpl = confusion_matrix.ravel()
        tns.append(tnl)
        fps.append(fpl)
        fns.append(fnl)
        tps.append(tpl)

    tn = np.sum(tns)
    tp = np.sum(tps)
    fp = np.sum(fps)
    fn = np.sum(fns)
    
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2*(precision*recall) / (precision+recall)
    print("Médias")
    print(f"Total Real Positives: {tp+fn :.2f}\nTotal Real Negatives: {tn+fp:.2f}\n" +
          f"Total Predicted Positives: {tp+fp:.2f}\nTotal Predicted Negatives: {tn+fn:.2f}\n" +
          "\n"
          f"True Positives: {tp:.2f}\nTrue Negatives: {tn:.2f}\nFalse Positives: {fp:.2f}\nFalse Negatives: {fn:.2f}")
    print("")
    print(f"Precision: { precision :.2f}\nRecall: { recall :.2f}")
    print(f"Accuracy Positive(Recall): {tp/(tp+fn):.2f}\nAccuracy Negative: {tn/(tn+fp):.2f}")
    print(f"Balanced Accuracy: {((tp/(tp+fn)) + (tn/(tn+fp)))/2 :.2f}")
    print(f"F1 Score: {f1 : .2f}")
    print("")
 



## Loading Dataset

In [5]:
data = pd.read_csv(data_under_analysis)

In [6]:
# Linhas / Colunas
print(f"{data.shape[0]} linhas e {data.shape[1]} colunas")
print(pd.DataFrame(data.columns))
print(data.describe())
print(data.describe().shape[1]) # Colunas Númericas


10302 linhas e 27 colunas
             0
0           ID
1     KIDSDRIV
2        BIRTH
3          AGE
4     HOMEKIDS
5          YOJ
6       INCOME
7      PARENT1
8     HOME_VAL
9      MSTATUS
10      GENDER
11   EDUCATION
12  OCCUPATION
13    TRAVTIME
14     CAR_USE
15    BLUEBOOK
16         TIF
17    CAR_TYPE
18     RED_CAR
19    OLDCLAIM
20    CLM_FREQ
21     REVOKED
22     MVR_PTS
23     CLM_AMT
24     CAR_AGE
25  CLAIM_FLAG
26  URBANICITY
                 ID      KIDSDRIV           AGE      HOMEKIDS          YOJ  \
count  1.030200e+04  10302.000000  10295.000000  10302.000000  9754.000000   
mean   4.956631e+08      0.169288     44.837397      0.720443    10.474062   
std    2.864675e+08      0.506512      8.606445      1.116323     4.108943   
min    6.317500e+04      0.000000     16.000000      0.000000     0.000000   
25%    2.442869e+08      0.000000     39.000000      0.000000     9.000000   
50%    4.970043e+08      0.000000     45.000000      0.000000    11.000000   
75%    7

## Separating Data

### X e y

In [7]:
X_all = data.loc[ : , independent_variables]
y_all = data.loc[ : , dependent_variable_flag]

### Test and Validation

In [8]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_all, y_all, train_size = TRAIN_SIZE, random_state = SEED, stratify = y_all )

In [9]:
# Creating k_folder
stratified_k_fold = sklearn.model_selection.StratifiedKFold(n_splits = CV_SPLITS, random_state = SEED )

# Data Editing

## Variáveis Numéricas

A implementação de árvore do Sklearn só aceita variáveis numéricas. 

Nessa primeira etapa vamos considerar apenas elas, mas podemos tratar as variáveis categóricas no futuro.

In [10]:
numerical_X = X_train._get_numeric_data()
print(f"Selecionando apenas variáveis numéricas.\nFormato de X de {X_train.shape} para {numerical_X.shape}")
print(f"Redução de {X_train.shape[1] - numerical_X.shape[1]} variáveis.")

Selecionando apenas variáveis numéricas.
Formato de X de (7211, 22) para (7211, 8)
Redução de 14 variáveis.


In [11]:
numerical_X.columns

Index(['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS',
       'CAR_AGE'],
      dtype='object')

In [12]:
for col in numerical_X.columns: print (f'{col}\n{numerical_X[col].unique()}') 

KIDSDRIV
[0 2 1 3 4]
AGE
[54. 48. 28. 43. 53. 40. 41. 50. 42. 35. 51. 52. 38. 34. 60. 49. 45. 59.
 55. 47. 37. 58. 56. 39. 46. 22. 44. 67. 62. 36. 30. 32. 33. 31. 26. 29.
 66. 61. 24. 73. 27. 57. 65. 69. 25. 63. 21. 20. 64. 23. 70. 17. nan 68.
 16. 18. 19. 72. 80. 71.]
HOMEKIDS
[0 2 3 1 4 5]
YOJ
[ 6. 12. 10. 14. 11. nan 17.  5. 16.  9.  0. 13.  8. 15.  7.  3.  4. 18.
 19.  1.  2. 23.]
TRAVTIME
[ 14  42  50  44  72  37  35  52  30  33  41  29  36  46  39  19  38  21
  22  48  76  10  27  16  45  34  59  56  13  49   5  23  31  28  18  32
  15  88  53  63  55  40  64  58  25  82  62  69   6  43  60  57  70  26
  65  20  17  11  51   9  12  47  71  87  24   8  66   7  67  61  54  86
  78  85 134  77  74  68  75  81  89  79  80  97  84  90 105 103  73 124
  91  92  95]
TIF
[ 7  1  4  6  3  9 13 17  8 11 10 14 12  5 15 19  2 21 16 18 20 25 22]
MVR_PTS
[ 3  1  6  4  0  5  2  9  8  7 11 10 12 13]
CAR_AGE
[ 5.  1. 11. 12.  6. 10. 15.  7.  8. nan 14.  9. 17. 16. 19. -3. 13. 18.
  4. 22.  0. 21.

### Colunas Categóricas
Nenhuma colunas numéricas parecem ser uma codificação de categoria.

Vamos analisar as colunas que foram deixadas de lado

In [13]:
categorical_columns = list(set(X_train.columns) - set(numerical_X.columns))
categorical_X = X_train[categorical_columns]
categorical_X.describe()

Unnamed: 0,CAR_USE,PARENT1,EDUCATION,OCCUPATION,HOME_VAL,MSTATUS,RED_CAR,BIRTH,OLDCLAIM,BLUEBOOK,GENDER,REVOKED,INCOME,CAR_TYPE
count,7211,7211,7211,6745,6802,7211,7211,7211,7211,7211,7211,7211,6823,7211
unique,2,2,5,8,4497,2,2,5141,2577,2671,2,2,5862,6
top,Private,No,z_High School,z_Blue Collar,$0,Yes,no,31JUL54,$0,"$1,500",z_F,No,$0,z_SUV
freq,4535,6260,2069,1603,2050,4312,5130,6,4406,138,3903,6312,560,2030


As variáveis BIRTH, INCOME, BLUEBOOK, HOME_VAL, OLDCLAIM possuem muitos valores únicos. Dessas, 
INCOME, BLUEBOOK, HOMEVAL e OLDCLAIM são, na verdade, variáveis numéricas(dinheiro)

In [34]:
money_variables = ["INCOME", "BLUEBOOK", "HOME_VAL", "OLDCLAIM"]
money_X =  categorical_X[money_variables]
money_X

Unnamed: 0,INCOME,BLUEBOOK,HOME_VAL,OLDCLAIM
878,"$73,663","$41,310","$268,990","$36,700"
5358,"$34,669","$8,630","$146,134","$2,499"
1433,"$23,427","$6,640","$98,357","$17,308"
2769,"$54,593","$11,260","$221,267","$1,449"
3326,"$55,770","$12,380","$163,735",$0
...,...,...,...,...
152,"$104,990","$16,080","$306,540",$0
6870,"$152,283","$16,650","$441,904",$0
9663,"$8,585","$13,540",$0,"$4,451"
1431,,"$19,600","$240,520","$3,220"


In [35]:
money_X = money_X.replace('[\$,]', '', regex=True).astype(float)
print(money_X.shape)
money_X.describe()

(7211, 4)


Unnamed: 0,INCOME,BLUEBOOK,HOME_VAL,OLDCLAIM
count,6823.0,7211.0,6802.0,7211.0
mean,61485.221603,15708.502288,154265.121729,4045.439606
std,47127.897336,8447.775453,128946.594306,8749.645497
min,0.0,1500.0,0.0,0.0
25%,28116.0,9180.0,0.0,0.0
50%,53643.0,14430.0,161881.5,0.0
75%,85598.0,21015.0,237715.75,4637.0
max,367030.0,69740.0,885282.0,57037.0


In [36]:
numerical_X = numerical_X.join(money_X, on = None)

In [37]:
categorical_columns = list(set(X_train.columns) - set(numerical_X.columns))
categorical_X = X_train[categorical_columns]
categorical_X.describe()

Unnamed: 0,CAR_USE,PARENT1,EDUCATION,OCCUPATION,MSTATUS,RED_CAR,BIRTH,GENDER,REVOKED,CAR_TYPE
count,7211,7211,7211,6745,7211,7211,7211,7211,7211,7211
unique,2,2,5,8,2,2,5141,2,2,6
top,Private,No,z_High School,z_Blue Collar,Yes,no,31JUL54,z_F,No,z_SUV
freq,4535,6260,2069,1603,4312,5130,6,3903,6312,2030


## Features Dataset

Esse será o dataset usado em nossas análises.

In [40]:
# Replacing NULLS with the mean
numerical_X.fillna(numerical_X.mean(), inplace=True)    
features_X = numerical_X

In [41]:
features_X

Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,TRAVTIME,TIF,MVR_PTS,CAR_AGE,INCOME,BLUEBOOK,HOME_VAL,OLDCLAIM
878,0,54.0,0,6.000000,14,7,3,5.0,73663.000000,41310.0,268990.0,36700.0
5358,0,48.0,0,12.000000,42,1,1,1.0,34669.000000,8630.0,146134.0,2499.0
1433,0,28.0,2,10.000000,50,4,6,1.0,23427.000000,6640.0,98357.0,17308.0
2769,0,43.0,2,14.000000,14,1,4,11.0,54593.000000,11260.0,221267.0,1449.0
3326,0,53.0,0,14.000000,44,6,0,12.0,55770.000000,12380.0,163735.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
152,0,46.0,0,10.000000,40,13,0,7.0,104990.000000,16080.0,306540.0,0.0
6870,0,50.0,0,10.449488,12,1,0,18.0,152283.000000,16650.0,441904.0,0.0
9663,0,40.0,0,12.000000,34,7,0,1.0,8585.000000,13540.0,0.0,4451.0
1431,0,41.0,2,13.000000,25,4,0,7.0,61485.221603,19600.0,240520.0,3220.0


# Analysis Code

## Decision Tree

In [42]:
cartTree = DecisionTreeClassifier()

In [43]:
pastas = stratified_k_fold.split(features_X, y_train)

In [44]:
tree_train_results = list()
tree_test_results = list()
for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting Tree")
    cartTree.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = cartTree.predict(X_train_cv)
    tree_train_results.append(score_results(y_train_cv, predicted_train, "CART Tree - Treinamento", verbose=False))
        
    predicted_test = cartTree.predict(X_test_cv)
    tree_test_results.append(score_results(y_test_cv, predicted_test, "CART Tree - Teste", verbose=False))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 2

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 3

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

Fitting Tree
--------------------------------------------------


In [45]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_test_results])))
print("")
print("Train F1: {:.2f}% ".format(100*np.mean([result.f1 for result in tree_train_results])))
print("Test F1: {:.2f}% ".format(100*np.mean([result.f1  for result in tree_test_results])))

Train Balanced Accuraty: 100.00% 
Test Balanced Accuraty: 58.98% 

Train Precision: 100.00% 
Test Precision: 39.43% 

Train Recall: 100.00% 
Test Recall: 40.84% 

Train F1: 100.00% 
Test F1: 40.08% 


## GLM


In [46]:
pastas = stratified_k_fold.split(features_X, y_train)

In [47]:
glm_train_results = list()
glm_test_results = list()
for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Creating GLM")
    glm = sm.GLM(exog = X_train_cv, endog = y_train_cv, family = sm.families.Binomial())
    
    print("Fitting GLM")
    predictor_glm = glm.fit()
    print(predictor_glm.summary())
    print("-"*50)
    
    
    predicted_train_probs = predictor_glm.predict(X_train_cv)
    
    predicted_train = (predicted_train_probs > GLM_TRESHOLD)
    glm_train_results.append(score_results(y_train_cv, predicted_train, "GLM - Treinamento"))
        
    predicted_test_probs = predictor_glm.predict(X_test_cv)
    predicted_test =  (predicted_test_probs > GLM_TRESHOLD )
    glm_test_results.append(score_results(y_test_cv, predicted_test, "GLM - Teste", verbose = False))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Creating GLM
Fitting GLM
                 Generalized Linear Model Regression Results                  
Dep. Variable:             CLAIM_FLAG   No. Observations:                 5768
Model:                            GLM   Df Residuals:                     5756
Model Family:                Binomial   Df Model:                           11
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -3037.3
Date:                Wed, 08 Jan 2020   Deviance:                       6074.6
Time:                        08:57:10   Pearson chi2:                 5.79e+03
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------

--------------------------------------------------------------------------------
K Fold - Rodada 4

Creating GLM
Fitting GLM
                 Generalized Linear Model Regression Results                  
Dep. Variable:             CLAIM_FLAG   No. Observations:                 5770
Model:                            GLM   Df Residuals:                     5758
Model Family:                Binomial   Df Model:                           11
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -3043.0
Date:                Wed, 08 Jan 2020   Deviance:                       6086.0
Time:                        08:57:11   Pearson chi2:                 5.80e+03
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------

In [48]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_test_results])))
print("")
print("Train F1: {:.2f}% ".format(100*np.mean([result.f1 for result in glm_train_results])))
print("Test F1: {:.2f}% ".format(100*np.mean([result.f1  for result in glm_test_results])))

Train Balanced Accuraty: 56.52% 
Test Balanced Accuraty: 56.40% 

Train Precision: 56.78% 
Test Precision: 56.93% 

Train Recall: 18.03% 
Test Recall: 17.85% 

Train F1: 27.36% 
Test F1: 27.10% 


## RBF

In [59]:
import importlib
import RBF

In [60]:
importlib.reload(RBF)

<module 'RBF' from 'C:\\Users\\barban01\\Desktop\\Projetos\\UFABC\\TCC\\projeto_pdg\\RBF.py'>

In [61]:
rbf_classifier = RBF.RBFClassifier(number_of_centers = 50, 
                               random_state = SEED, 
                               algorithm = LogisticRegression(class_weight="balanced", solver = "lbfgs"))

In [62]:
pastas = stratified_k_fold.split(features_X, y_train)

In [63]:
rbf_train_results = list()
rbf_test_results = list()
for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = numerical_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = numerical_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting RBF Network")
    rbf_classifier.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = rbf_classifier.predict(X_train_cv)
    rbf_train_results.append(score_results(y_train_cv, predicted_train, "RBF Network - Treinamento"))
        
    predicted_test = rbf_classifier.predict(X_test_cv)
    rbf_test_results.append(score_results(y_test_cv, predicted_test, "RBF Network - Teste", verbose = True))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 385    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 385
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 385    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 385
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 2

Fitting RBF Network


  normalization_constant = 1/(2*np.pi*variance)
  return normalization_constant * np.exp(-(dist/gamma))
  return normalization_constant * np.exp(-(dist/gamma))
  y = column_or_1d(y, warn=True)
  normalization_constant = 1/(2*np.pi*variance)
  return normalization_constant * np.exp(-(dist/gamma))
  return normalization_constant * np.exp(-(dist/gamma))
  normalization_constant = 1/(2*np.pi*variance)
  return normalization_constant * np.exp(-(dist/gamma))
  return normalization_constant * np.exp(-(dist/gamma))


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 384    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 384
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 3

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 384    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 384
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

Fitting RBF Network
--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1057    0]
 [ 384    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 384
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1057
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------


  normalization_constant = 1/(2*np.pi*variance)
  return normalization_constant * np.exp(-(dist/gamma))
  return normalization_constant * np.exp(-(dist/gamma))
  y = column_or_1d(y, warn=True)
  normalization_constant = 1/(2*np.pi*variance)
  return normalization_constant * np.exp(-(dist/gamma))
  return normalization_constant * np.exp(-(dist/gamma))
  normalization_constant = 1/(2*np.pi*variance)
  return normalization_constant * np.exp(-(dist/gamma))
  return normalization_constant * np.exp(-(dist/gamma))


In [53]:
rbf_train_results = list()
rbf_test_results = list()
for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting RBF Network")
    rbf_classifier.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = rbf_classifier.predict(X_train_cv)
    rbf_train_results.append(score_results(y_train_cv, predicted_train, "RBF Network - Treinamento"))
        
    predicted_test = rbf_classifier.predict(X_test_cv)
    rbf_test_results.append(score_results(y_test_cv, predicted_test, "RBF Network - Teste", verbose = True))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 385    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 385
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 385    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 385
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 2

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 384    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 384
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 3

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 384    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 384
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

Fitting RBF Network
--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1057    0]
 [ 384    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 384
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1057
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


In [54]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_test_results])))
print("")
print("Train F1: {:.2f}% ".format(100*np.mean([result.f1 for result in rbf_train_results])))
print("Test F1: {:.2f}% ".format(100*np.mean([result.f1  for result in rbf_test_results])))

Train Balanced Accuraty: 50.00% 
Test Balanced Accuraty: 50.00% 

Train Precision: nan% 
Test Precision: nan% 

Train Recall: 0.00% 
Test Recall: 0.00% 

Train F1: nan% 
Test F1: nan% 


# Results

In [32]:
print("<< TREES >>")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_test_results])))
print("F1: {:.2f} ".format(np.mean([result.f1 for result in tree_test_results])))

print("<< GLM >>")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_test_results])))
print("F1: {:.2f} ".format(np.mean([result.f1 for result in glm_test_results])))

print("<< RBF >>")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_test_results])))
print("F1: {:.2f} ".format(np.mean([result.f1 for result in rbf_test_results])))

<< TREES >>
Balanced Accuraty: 54.97% 
Precision: 33.61% 
Recall: 35.27% 
F1: 0.34 
<< GLM >>
Balanced Accuraty: 55.26% 
Precision: 59.53% 
Recall: 14.10% 
F1: 0.23 
<< RBF >>
Balanced Accuraty: 54.27% 
Precision: 28.72% 
Recall: 88.45% 
F1: 0.43 


# Análise Detalhada

In [33]:
print("<< TREE >>")
imprime_matriz_de_confusao_media(tree_test_results)
print("<< GLM >>")
imprime_matriz_de_confusao_media(glm_test_results)
print("<< RBF >>")
imprime_matriz_de_confusao_media(rbf_test_results)

<< TREE >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 2018.00
Total Predicted Negatives: 5193.00

True Positives: 678.00
True Negatives: 3949.00
False Positives: 1340.00
False Negatives: 1244.00

Precision: 0.34
Recall: 0.35
Accuracy Positive(Recall): 0.35
Accuracy Negative: 0.75
Balanced Accuracy: 0.55
F1 Score:  0.34

<< GLM >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 460.00
Total Predicted Negatives: 6751.00

True Positives: 271.00
True Negatives: 5100.00
False Positives: 189.00
False Negatives: 1651.00

Precision: 0.59
Recall: 0.14
Accuracy Positive(Recall): 0.14
Accuracy Negative: 0.96
Balanced Accuracy: 0.55
F1 Score:  0.23

<< RBF >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 5926.00
Total Predicted Negatives: 1285.00

True Positives: 1700.00
True Negatives: 1063.00
False Positives: 4226.00
False Negatives: 222.00

Precision: 