# Prep

## Imports

In [1]:
# Imports
import pandas as pd;
import numpy as np; 

import sklearn.preprocessing
import sklearn.model_selection;
import sklearn.metrics;
from sklearn.linear_model import LogisticRegression

# OBS: Scikit-Learn
# scikit-learn uses an optimised version of the CART algorithm; however, scikit-learn implementation does not support categorical variables for now.
# https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier; # Tree



# WARNING: This api is different from the SKLEARN 
import statsmodels.api as sm; # GLM


# Auxiliar
from collections import namedtuple
import importlib

# From Author
import RBF


## Parameters

In [2]:
SEED = 42
TRAIN_SIZE = 0.7
data_under_analysis = "data/car_insurance_claim.csv"

independent_variables = ["KIDSDRIV", "BIRTH", 
                            "AGE", "HOMEKIDS", 
                            "YOJ", "INCOME", 
                            "PARENT1", "HOME_VAL", 
                            "MSTATUS", "GENDER", 
                            "EDUCATION", "OCCUPATION", 
                            "TRAVTIME", "CAR_USE", 
                            "BLUEBOOK", "TIF", 
                            "CAR_TYPE", "RED_CAR", 
                            "OLDCLAIM",  "REVOKED", 
                            "MVR_PTS", "CAR_AGE"]

dependent_variable_flag = ["CLAIM_FLAG"]
dependent_variable_frequency = ["CLM_FREQ"]
dependent_variable_value = ["CLM_AMT"]

CV_SPLITS = 5

GLM_TRESHOLD = 0.5

## Auxiliar Functions

In [3]:
def score_results(y_real, y_predito, label, verbose = False):
    """Função personalizada que mostra os resultados de um classificador.

    Arguments:
        y_real -- O vetor com os dados esperados
        y_predito -- O vetor com as predições
        label -- Um título para essa análise, pode ser usado para facilitar a identificação do output."""

    matriz_de_confusao = sklearn.metrics.confusion_matrix(y_true = y_real, y_pred = y_predito)
    try:
        tn, fp, fn, tp = matriz_de_confusao.ravel()
    except ValueError:
        print("ValueError: Imprimindo matriz de confusão.")
        print(matriz_de_confusao)
    
    Resultados = namedtuple("Resultados", ["balanced_accuracy","precision","recall", "f1", "confusion_matrix"])
    
    balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_true = y_real, y_pred = y_predito)
    precision = tp/(tp+fp)
    recall    = tp/(tp+fn)
    f1 = 2*(precision * recall) / (precision + recall)
    output = Resultados(balanced_accuracy = balanced_accuracy,
                       precision = precision,
                       recall    = recall,
                       f1 =  f1,
                        
                       confusion_matrix = matriz_de_confusao)

    if verbose:
        print(f"--- {label} ---")
        print("Matrix de Confusão")
        print(matriz_de_confusao)
        print("Acurácia Balanceada: ", end=" ")
        print(f"{100*output.balanced_accuracy :.2f} % ")
        print(f"Falsos Positivos: {fp}, Falsos Negativos: {fn}\n"+
            f"Verdadeiros Positivos: {tp}, Verdadeiros Negativos: {tn}")
        print(f"Precisao (tp/(tp+fp)): {100* output.precision :.2f}%")
        print(f"Recall (tp/(tp+fn)): {100* output.recall :.2f}  %")

        print("-"*80)
    return output


In [4]:
def imprime_matriz_de_confusao_media(test_results):
    tns, tps, fps, fns = list(), list(), list(), list()
    for index, result in enumerate(test_results):
        confusion_matrix = result.confusion_matrix
        #print(f"Fit {index}")
        #print(confusion_matrix)
        tnl, fpl, fnl, tpl = confusion_matrix.ravel()
        tns.append(tnl)
        fps.append(fpl)
        fns.append(fnl)
        tps.append(tpl)

    tn = np.sum(tns)
    tp = np.sum(tps)
    fp = np.sum(fps)
    fn = np.sum(fns)
    
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2*(precision*recall) / (precision+recall)
    print("Médias")
    print(f"Total Real Positives: {tp+fn :.2f}\nTotal Real Negatives: {tn+fp:.2f}\n" +
          f"Total Predicted Positives: {tp+fp:.2f}\nTotal Predicted Negatives: {tn+fn:.2f}\n" +
          "\n"
          f"True Positives: {tp:.2f}\nTrue Negatives: {tn:.2f}\nFalse Positives: {fp:.2f}\nFalse Negatives: {fn:.2f}")
    print("")
    print(f"Precision: { precision :.2f}\nRecall: { recall :.2f}")
    print(f"Accuracy Positive(Recall): {tp/(tp+fn):.2f}\nAccuracy Negative: {tn/(tn+fp):.2f}")
    print(f"Balanced Accuracy: {((tp/(tp+fn)) + (tn/(tn+fp)))/2 :.2f}")
    print(f"F1 Score: {f1 : .2f}")
    print("")
 



## Loading Dataset

In [5]:
data = pd.read_csv(data_under_analysis)

In [6]:
# Linhas / Colunas
print(f"{data.shape[0]} linhas e {data.shape[1]} colunas")
print(pd.DataFrame(data.columns))
print(data.describe())
print(data.describe().shape[1]) # Colunas Númericas


10302 linhas e 27 colunas
             0
0           ID
1     KIDSDRIV
2        BIRTH
3          AGE
4     HOMEKIDS
5          YOJ
6       INCOME
7      PARENT1
8     HOME_VAL
9      MSTATUS
10      GENDER
11   EDUCATION
12  OCCUPATION
13    TRAVTIME
14     CAR_USE
15    BLUEBOOK
16         TIF
17    CAR_TYPE
18     RED_CAR
19    OLDCLAIM
20    CLM_FREQ
21     REVOKED
22     MVR_PTS
23     CLM_AMT
24     CAR_AGE
25  CLAIM_FLAG
26  URBANICITY
                 ID      KIDSDRIV           AGE      HOMEKIDS          YOJ  \
count  1.030200e+04  10302.000000  10295.000000  10302.000000  9754.000000   
mean   4.956631e+08      0.169288     44.837397      0.720443    10.474062   
std    2.864675e+08      0.506512      8.606445      1.116323     4.108943   
min    6.317500e+04      0.000000     16.000000      0.000000     0.000000   
25%    2.442869e+08      0.000000     39.000000      0.000000     9.000000   
50%    4.970043e+08      0.000000     45.000000      0.000000    11.000000   
75%    7

## Separating Data

### X e y

In [7]:
X_all = data.loc[ : , independent_variables]
y_all = data.loc[ : , dependent_variable_flag]

### Test and Validation

In [8]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_all, y_all, train_size = TRAIN_SIZE, random_state = SEED, stratify = y_all )

In [9]:
# Creating k_folder
stratified_k_fold = sklearn.model_selection.StratifiedKFold(n_splits = CV_SPLITS, random_state = SEED )

# Data Editing

## Variáveis Numéricas

A implementação de árvore do Sklearn só aceita variáveis numéricas. 

Nessa primeira etapa vamos considerar apenas elas, mas podemos tratar as variáveis categóricas no futuro.

In [10]:
numerical_X = X_train._get_numeric_data()
print(f"Selecionando apenas variáveis numéricas.\nFormato de X de {X_train.shape} para {numerical_X.shape}")
print(f"Redução de {X_train.shape[1] - numerical_X.shape[1]} variáveis.")

Selecionando apenas variáveis numéricas.
Formato de X de (7211, 22) para (7211, 8)
Redução de 14 variáveis.


In [11]:
numerical_X.columns

Index(['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS',
       'CAR_AGE'],
      dtype='object')

In [12]:
for col in numerical_X.columns: print (f'{col}\n{numerical_X[col].unique()}') 

KIDSDRIV
[0 2 1 3 4]
AGE
[54. 48. 28. 43. 53. 40. 41. 50. 42. 35. 51. 52. 38. 34. 60. 49. 45. 59.
 55. 47. 37. 58. 56. 39. 46. 22. 44. 67. 62. 36. 30. 32. 33. 31. 26. 29.
 66. 61. 24. 73. 27. 57. 65. 69. 25. 63. 21. 20. 64. 23. 70. 17. nan 68.
 16. 18. 19. 72. 80. 71.]
HOMEKIDS
[0 2 3 1 4 5]
YOJ
[ 6. 12. 10. 14. 11. nan 17.  5. 16.  9.  0. 13.  8. 15.  7.  3.  4. 18.
 19.  1.  2. 23.]
TRAVTIME
[ 14  42  50  44  72  37  35  52  30  33  41  29  36  46  39  19  38  21
  22  48  76  10  27  16  45  34  59  56  13  49   5  23  31  28  18  32
  15  88  53  63  55  40  64  58  25  82  62  69   6  43  60  57  70  26
  65  20  17  11  51   9  12  47  71  87  24   8  66   7  67  61  54  86
  78  85 134  77  74  68  75  81  89  79  80  97  84  90 105 103  73 124
  91  92  95]
TIF
[ 7  1  4  6  3  9 13 17  8 11 10 14 12  5 15 19  2 21 16 18 20 25 22]
MVR_PTS
[ 3  1  6  4  0  5  2  9  8  7 11 10 12 13]
CAR_AGE
[ 5.  1. 11. 12.  6. 10. 15.  7.  8. nan 14.  9. 17. 16. 19. -3. 13. 18.
  4. 22.  0. 21.

### Colunas Categóricas
Nenhuma colunas numéricas parecem ser uma codificação de categoria.

Vamos analisar as colunas que foram deixadas de lado

In [13]:
categorical_columns = list(set(X_train.columns) - set(numerical_X.columns))
categorical_X = X_train[categorical_columns]
categorical_X.describe()

Unnamed: 0,CAR_USE,CAR_TYPE,PARENT1,OLDCLAIM,EDUCATION,MSTATUS,RED_CAR,GENDER,INCOME,HOME_VAL,OCCUPATION,BIRTH,REVOKED,BLUEBOOK
count,7211,7211,7211,7211,7211,7211,7211,7211,6823,6802,6745,7211,7211,7211
unique,2,6,2,2577,5,2,2,2,5862,4497,8,5141,2,2671
top,Private,z_SUV,No,$0,z_High School,Yes,no,z_F,$0,$0,z_Blue Collar,23AUG60,No,"$1,500"
freq,4535,2030,6260,4406,2069,4312,5130,3903,560,2050,1603,6,6312,138


Note que possuímos a variável AGE nas nossas variáveis numéricas. AGE está diretamente relacionada à BIRTH, por isso podemos excluir essa variável

In [14]:
categorical_X = categorical_X.drop("BIRTH", axis = 1, inplace = False)

As variáveis BIRTH, INCOME, BLUEBOOK, HOME_VAL, OLDCLAIM possuem muitos valores únicos. Dessas, 
INCOME, BLUEBOOK, HOMEVAL e OLDCLAIM são, na verdade, variáveis numéricas(dinheiro)

In [15]:
money_variables = ["INCOME", "BLUEBOOK", "HOME_VAL", "OLDCLAIM"]
money_X =  categorical_X[money_variables]
money_X

Unnamed: 0,INCOME,BLUEBOOK,HOME_VAL,OLDCLAIM
878,"$73,663","$41,310","$268,990","$36,700"
5358,"$34,669","$8,630","$146,134","$2,499"
1433,"$23,427","$6,640","$98,357","$17,308"
2769,"$54,593","$11,260","$221,267","$1,449"
3326,"$55,770","$12,380","$163,735",$0
...,...,...,...,...
152,"$104,990","$16,080","$306,540",$0
6870,"$152,283","$16,650","$441,904",$0
9663,"$8,585","$13,540",$0,"$4,451"
1431,,"$19,600","$240,520","$3,220"


In [16]:
money_X = money_X.replace('[\$,]', '', regex=True).astype(float)
print(money_X.shape)
money_X.describe()

(7211, 4)


Unnamed: 0,INCOME,BLUEBOOK,HOME_VAL,OLDCLAIM
count,6823.0,7211.0,6802.0,7211.0
mean,61485.221603,15708.502288,154265.121729,4045.439606
std,47127.897336,8447.775453,128946.594306,8749.645497
min,0.0,1500.0,0.0,0.0
25%,28116.0,9180.0,0.0,0.0
50%,53643.0,14430.0,161881.5,0.0
75%,85598.0,21015.0,237715.75,4637.0
max,367030.0,69740.0,885282.0,57037.0


In [17]:
before = numerical_X.shape
numerical_X = numerical_X.join(money_X, on = None)
after = numerical_X.shape
print(f"Alterado de {str(before)} para {str(after)}")

Alterado de (7211, 8) para (7211, 12)


In [18]:
categorical_columns = list(set(categorical_X.columns) - set(numerical_X.columns))
categorical_X = X_train[categorical_columns]
categorical_X.describe()

Unnamed: 0,CAR_TYPE,CAR_USE,PARENT1,EDUCATION,MSTATUS,RED_CAR,GENDER,OCCUPATION,REVOKED
count,7211,7211,7211,7211,7211,7211,7211,6745,7211
unique,6,2,2,5,2,2,2,8,2
top,z_SUV,Private,No,z_High School,Yes,no,z_F,z_Blue Collar,No
freq,2030,4535,6260,2069,4312,5130,3903,1603,6312


Existem valores null?

In [19]:
categorical_X.isnull().sum()

CAR_TYPE        0
CAR_USE         0
PARENT1         0
EDUCATION       0
MSTATUS         0
RED_CAR         0
GENDER          0
OCCUPATION    466
REVOKED         0
dtype: int64

In [20]:
categorical_X["OCCUPATION"].unique()

array(['Professional', 'Clerical', 'Lawyer', 'z_Blue Collar', 'Doctor',
       'Home Maker', 'Manager', nan, 'Student'], dtype=object)

In [21]:
categorical_X.loc[ : , "OCCUPATION"] = categorical_X.loc[ : , "OCCUPATION"].fillna(value = "Unknown", inplace = False)
categorical_X["OCCUPATION"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


array(['Professional', 'Clerical', 'Lawyer', 'z_Blue Collar', 'Doctor',
       'Home Maker', 'Manager', 'Unknown', 'Student'], dtype=object)

Agora podemos transformar essas variáveis categóricas em numéricas. Para isso vamos usar o OneHotEncoder

In [22]:
encoder = sklearn.preprocessing.OneHotEncoder(categories = "auto", drop = "first", handle_unknown = "error", sparse = False)
encoder.fit(categorical_X)
encoder.categories_
encoder.get_feature_names()

array(['x0_Panel Truck', 'x0_Pickup', 'x0_Sports Car', 'x0_Van',
       'x0_z_SUV', 'x1_Private', 'x2_Yes', 'x3_Bachelors', 'x3_Masters',
       'x3_PhD', 'x3_z_High School', 'x4_z_No', 'x5_yes', 'x6_z_F',
       'x7_Doctor', 'x7_Home Maker', 'x7_Lawyer', 'x7_Manager',
       'x7_Professional', 'x7_Student', 'x7_Unknown', 'x7_z_Blue Collar',
       'x8_Yes'], dtype=object)

In [23]:
encoded_X = pd.DataFrame(encoder.transform(categorical_X))
encoded_X.index = categorical_X.index

In [24]:
encoded_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
878,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5358,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1433,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2769,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3326,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6870,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1431,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Features Dataset

Esse será o dataset usado em nossas análises.

In [25]:
# Replacing NULLS with the mean
numerical_X.fillna(numerical_X.mean(), inplace=True)    
features_X = numerical_X.join(encoded_X, on = None)
features_X

Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,TRAVTIME,TIF,MVR_PTS,CAR_AGE,INCOME,BLUEBOOK,...,13,14,15,16,17,18,19,20,21,22
878,0,54.0,0,6.000000,14,7,3,5.0,73663.000000,41310.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5358,0,48.0,0,12.000000,42,1,1,1.0,34669.000000,8630.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1433,0,28.0,2,10.000000,50,4,6,1.0,23427.000000,6640.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2769,0,43.0,2,14.000000,14,1,4,11.0,54593.000000,11260.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3326,0,53.0,0,14.000000,44,6,0,12.0,55770.000000,12380.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,0,46.0,0,10.000000,40,13,0,7.0,104990.000000,16080.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6870,0,50.0,0,10.449488,12,1,0,18.0,152283.000000,16650.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9663,0,40.0,0,12.000000,34,7,0,1.0,8585.000000,13540.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1431,0,41.0,2,13.000000,25,4,0,7.0,61485.221603,19600.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Analysis Code - Without Normalization

In [26]:
features_X_old = features_X

## Decision Tree

In [27]:
cartTree = DecisionTreeClassifier()

In [28]:
tree_train_results = list()
tree_test_results = list()
pastas = stratified_k_fold.split(features_X, y_train)

for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting Tree")
    cartTree.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = cartTree.predict(X_train_cv)
    tree_train_results.append(score_results(y_train_cv, predicted_train, "CART Tree - Treinamento", verbose=False))
        
    predicted_test = cartTree.predict(X_test_cv)
    tree_test_results.append(score_results(y_test_cv, predicted_test, "CART Tree - Teste", verbose=False))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 2

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 3

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

Fitting Tree
--------------------------------------------------


In [29]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_test_results])))
print("")
print("Train F1: {:.2f}% ".format(100*np.mean([result.f1 for result in tree_train_results])))
print("Test F1: {:.2f}% ".format(100*np.mean([result.f1  for result in tree_test_results])))

Train Balanced Accuraty: 100.00% 
Test Balanced Accuraty: 60.10% 

Train Precision: 100.00% 
Test Precision: 41.07% 

Train Recall: 100.00% 
Test Recall: 42.30% 

Train F1: 100.00% 
Test F1: 41.64% 


## GLM -> TODO: CORRIGIR ERRO DE INDICES


In [30]:
glm_train_results = list()
glm_test_results = list()
pastas = stratified_k_fold.split(features_X, y_train)

for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Creating GLM")
    glm = sm.GLM(exog = X_train_cv, endog = y_train_cv, family = sm.families.Binomial())
    
    print("Fitting GLM")
    predictor_glm = glm.fit()
    print(predictor_glm.summary())
    print("-"*50)
    
    
    predicted_train_probs = predictor_glm.predict(X_train_cv)
    
    predicted_train = (predicted_train_probs > GLM_TRESHOLD)
    glm_train_results.append(score_results(y_train_cv, predicted_train, "GLM - Treinamento"))
        
    predicted_test_probs = predictor_glm.predict(X_test_cv)
    predicted_test =  (predicted_test_probs > GLM_TRESHOLD )
    glm_test_results.append(score_results(y_test_cv, predicted_test, "GLM - Teste", verbose = False))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Creating GLM
Fitting GLM
                 Generalized Linear Model Regression Results                  
Dep. Variable:             CLAIM_FLAG   No. Observations:                 5768
Model:                            GLM   Df Residuals:                     5733
Model Family:                Binomial   Df Model:                           34
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2860.0
Date:                Thu, 09 Jan 2020   Deviance:                       5719.9
Time:                        09:25:44   Pearson chi2:                 5.77e+03
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------

--------------------------------------------------------------------------------
K Fold - Rodada 2

Creating GLM
Fitting GLM
                 Generalized Linear Model Regression Results                  
Dep. Variable:             CLAIM_FLAG   No. Observations:                 5769
Model:                            GLM   Df Residuals:                     5734
Model Family:                Binomial   Df Model:                           34
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2876.6
Date:                Thu, 09 Jan 2020   Deviance:                       5753.1
Time:                        09:25:44   Pearson chi2:                 5.76e+03
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------

In [31]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_test_results])))
print("")
print("Train F1: {:.2f}% ".format(100*np.mean([result.f1 for result in glm_train_results])))
print("Test F1: {:.2f}% ".format(100*np.mean([result.f1  for result in glm_test_results])))

Train Balanced Accuraty: 60.96% 
Test Balanced Accuraty: 60.51% 

Train Precision: 62.19% 
Test Precision: 60.86% 

Train Recall: 28.12% 
Test Recall: 27.52% 

Train F1: 38.72% 
Test F1: 37.87% 


## RBF

In [32]:
# importlib.reload(RBF)

In [37]:
rbf_classifier = RBF.RBFClassifier(number_of_centers = 50, 
                               random_state = SEED, 
                               algorithm = LogisticRegression(class_weight="balanced", solver = "lbfgs"))

In [38]:
rbf_train_results = list()
rbf_test_results = list()
pastas = stratified_k_fold.split(features_X, y_train)

for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting RBF Network")
    rbf_classifier.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = rbf_classifier.predict(X_train_cv)
    rbf_train_results.append(score_results(y_train_cv, predicted_train, "RBF Network - Treinamento"))
        
    predicted_test = rbf_classifier.predict(X_test_cv)
    rbf_test_results.append(score_results(y_test_cv, predicted_test, "RBF Network - Teste", verbose = True))


--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 385    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 385
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 385    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 385
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 2

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 384    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 384
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 3

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1058    0]
 [ 384    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 384
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1058
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

Fitting RBF Network
--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[1057    0]
 [ 384    0]]
Acurácia Balanceada:  50.00 % 
Falsos Positivos: 0, Falsos Negativos: 384
Verdadeiros Positivos: 0, Verdadeiros Negativos: 1057
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00  %
--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


In [39]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_test_results])))
print("")
print("Train F1: {:.2f}% ".format(100*np.mean([result.f1 for result in rbf_train_results])))
print("Test F1: {:.2f}% ".format(100*np.mean([result.f1  for result in rbf_test_results])))

Train Balanced Accuraty: 50.00% 
Test Balanced Accuraty: 50.00% 

Train Precision: nan% 
Test Precision: nan% 

Train Recall: 0.00% 
Test Recall: 0.00% 

Train F1: nan% 
Test F1: nan% 


## Results Without Normalization

In [40]:
print("<< TREES >>")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_test_results])))
print("F1: {:.2f} ".format(np.mean([result.f1 for result in tree_test_results])))

print("<< GLM >>")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_test_results])))
print("F1: {:.2f} ".format(np.mean([result.f1 for result in glm_test_results])))

print("<< RBF >>")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_test_results])))
print("F1: {:.2f} ".format(np.mean([result.f1 for result in rbf_test_results])))

<< TREES >>
Balanced Accuraty: 60.10% 
Precision: 41.07% 
Recall: 42.30% 
F1: 0.42 
<< GLM >>
Balanced Accuraty: 60.51% 
Precision: 60.86% 
Recall: 27.52% 
F1: 0.38 
<< RBF >>
Balanced Accuraty: 50.00% 
Precision: nan% 
Recall: 0.00% 
F1: nan 


## Detailed Analysis

In [41]:
print("<< TREE >>")
imprime_matriz_de_confusao_media(tree_test_results)
print("<< GLM >>")
imprime_matriz_de_confusao_media(glm_test_results)
print("<< RBF >>")
imprime_matriz_de_confusao_media(rbf_test_results)

<< TREE >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 1982.00
Total Predicted Negatives: 5229.00

True Positives: 813.00
True Negatives: 4120.00
False Positives: 1169.00
False Negatives: 1109.00

Precision: 0.41
Recall: 0.42
Accuracy Positive(Recall): 0.42
Accuracy Negative: 0.78
Balanced Accuracy: 0.60
F1 Score:  0.42

<< GLM >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 873.00
Total Predicted Negatives: 6338.00

True Positives: 529.00
True Negatives: 4945.00
False Positives: 344.00
False Negatives: 1393.00

Precision: 0.61
Recall: 0.28
Accuracy Positive(Recall): 0.28
Accuracy Negative: 0.93
Balanced Accuracy: 0.61
F1 Score:  0.38

<< RBF >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 0.00
Total Predicted Negatives: 7211.00

True Positives: 0.00
True Negatives: 5289.00
False Positives: 0.00
False Negatives: 1922.00

Precision: nan
Reca



# Analysis - Normalized

In [45]:
features_X_old = features_X
features_X = pd.DataFrame(sklearn.preprocessing.scale(features_X))

## Decision Tree

In [46]:
cartTree = DecisionTreeClassifier()

In [47]:
tree_train_results = list()
tree_test_results = list()
pastas = stratified_k_fold.split(features_X, y_train)

for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting Tree")
    cartTree.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = cartTree.predict(X_train_cv)
    tree_train_results.append(score_results(y_train_cv, predicted_train, "CART Tree - Treinamento", verbose=False))
        
    predicted_test = cartTree.predict(X_test_cv)
    tree_test_results.append(score_results(y_test_cv, predicted_test, "CART Tree - Teste", verbose=False))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 2

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 3

Fitting Tree
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

Fitting Tree
--------------------------------------------------


## GLM


In [48]:
glm_train_results = list()
glm_test_results = list()
pastas = stratified_k_fold.split(features_X, y_train)

for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Creating GLM")
    glm = sm.GLM(exog = X_train_cv, endog = y_train_cv, family = sm.families.Binomial())
    
    print("Fitting GLM")
    predictor_glm = glm.fit()
    print(predictor_glm.summary())
    print("-"*50)
    
    
    predicted_train_probs = predictor_glm.predict(X_train_cv)
    
    predicted_train = (predicted_train_probs > GLM_TRESHOLD)
    glm_train_results.append(score_results(y_train_cv, predicted_train, "GLM - Treinamento"))
        
    predicted_test_probs = predictor_glm.predict(X_test_cv)
    predicted_test =  (predicted_test_probs > GLM_TRESHOLD )
    glm_test_results.append(score_results(y_test_cv, predicted_test, "GLM - Teste", verbose = False))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Creating GLM


ValueError: The indices for endog and exog are not aligned

## RBF

In [49]:
rbf_classifier = RBF.RBFClassifier(number_of_centers = 50, 
                               random_state = SEED, 
                               algorithm = LogisticRegression(class_weight="balanced", solver = "lbfgs"))

In [50]:
rbf_train_results = list()
rbf_test_results = list()
pastas = stratified_k_fold.split(features_X, y_train)

for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = features_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = features_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting RBF Network")
    rbf_classifier.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = rbf_classifier.predict(X_train_cv)
    rbf_train_results.append(score_results(y_train_cv, predicted_train, "RBF Network - Treinamento"))
        
    predicted_test = rbf_classifier.predict(X_test_cv)
    rbf_test_results.append(score_results(y_test_cv, predicted_test, "RBF Network - Teste", verbose = True))


--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[122 936]
 [ 12 373]]
Acurácia Balanceada:  54.21 % 
Falsos Positivos: 936, Falsos Negativos: 12
Verdadeiros Positivos: 373, Verdadeiros Negativos: 122
Precisao (tp/(tp+fp)): 28.50%
Recall (tp/(tp+fn)): 96.88  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[119 939]
 [ 13 372]]
Acurácia Balanceada:  53.94 % 
Falsos Positivos: 939, Falsos Negativos: 13
Verdadeiros Positivos: 372, Verdadeiros Negativos: 119
Precisao (tp/(tp+fp)): 28.38%
Recall (tp/(tp+fn)): 96.62  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 2

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[156 902]
 [ 19 365]]
Acurácia Balanceada:  54.90 % 
Falsos Positivos: 902, Falsos Negativos: 19
Verdadeiros Positivos: 365, Verdadeiros Negativos: 156
Precisao (tp/(tp+fp)): 28.81%
Recall (tp/(tp+fn)): 95.05  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 3

Fitting RBF Network


  y = column_or_1d(y, warn=True)


--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[102 956]
 [ 13 371]]
Acurácia Balanceada:  53.13 % 
Falsos Positivos: 956, Falsos Negativos: 13
Verdadeiros Positivos: 371, Verdadeiros Negativos: 102
Precisao (tp/(tp+fp)): 27.96%
Recall (tp/(tp+fn)): 96.61  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

Fitting RBF Network
--------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[109 948]
 [ 15 369]]
Acurácia Balanceada:  53.20 % 
Falsos Positivos: 948, Falsos Negativos: 15
Verdadeiros Positivos: 369, Verdadeiros Negativos: 109
Precisao (tp/(tp+fp)): 28.02%
Recall (tp/(tp+fn)): 96.09  %
--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


## Results With Scaling

In [53]:
print("## TREES")
print("")
print("* Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_test_results])))
print("* Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_test_results])))
print("* Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_test_results])))
print("* F1: {:.2f} ".format(np.mean([result.f1 for result in tree_test_results])))
print("")
print("# GLM")
print("")
print("* Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_test_results])))
print("* Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_test_results])))
print("* Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_test_results])))
print("* F1: {:.2f}% ".format(100*np.mean([result.f1 for result in glm_test_results])))
print("")
print("# RBF")
print("")
print("* Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_test_results])))
print("* Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_test_results])))
print("* Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_test_results])))
print("* F1: {:.2f}% ".format(100*np.mean([result.f1 for result in rbf_test_results])))

## TREES

* Balanced Accuraty: 60.46% 
* Precision: 41.52% 
* Recall: 42.87% 
* F1: 0.42 

# GLM

* Balanced Accuraty: nan% 
* Precision: nan% 
* Recall: nan% 
* F1: nan% 

# RBF

* Balanced Accuraty: 53.87% 
* Precision: 28.33% 
* Recall: 96.25% 
* F1: 43.77% 


In [54]:
print("<< TREE >>")
imprime_matriz_de_confusao_media(tree_test_results)
print("<< GLM >>")
imprime_matriz_de_confusao_media(glm_test_results)
print("<< RBF >>")
imprime_matriz_de_confusao_media(rbf_test_results)

<< TREE >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 1985.00
Total Predicted Negatives: 5226.00

True Positives: 824.00
True Negatives: 4128.00
False Positives: 1161.00
False Negatives: 1098.00

Precision: 0.42
Recall: 0.43
Accuracy Positive(Recall): 0.43
Accuracy Negative: 0.78
Balanced Accuracy: 0.60
F1 Score:  0.42

<< GLM >>
Médias
Total Real Positives: 0.00
Total Real Negatives: 0.00
Total Predicted Positives: 0.00
Total Predicted Negatives: 0.00

True Positives: 0.00
True Negatives: 0.00
False Positives: 0.00
False Negatives: 0.00

Precision: nan
Recall: nan
Accuracy Positive(Recall): nan
Accuracy Negative: nan
Balanced Accuracy: nan
F1 Score:  nan

<< RBF >>
Médias
Total Real Positives: 1922.00
Total Real Negatives: 5289.00
Total Predicted Positives: 6531.00
Total Predicted Negatives: 680.00

True Positives: 1850.00
True Negatives: 608.00
False Positives: 4681.00
False Negatives: 72.00

Precision: 0.28
Recall: 0.96
Accuracy Pos

