# Prep

## Imports

In [1]:
# Imports
import pandas as pd;
import numpy as np; 
import sklearn.model_selection;
import sklearn.metrics;

# OBS: Scikit-Learn
# scikit-learn uses an optimised version of the CART algorithm; however, scikit-learn implementation does not support categorical variables for now.
# https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier; # Tree



# WARNING: This api is different from the SKLEARN 
import statsmodels.api as sm; # GLM


# From Author
from RBF import RBFClassifier
from sklearn.linear_model import LogisticRegression
# Auxiliar
from collections import namedtuple


## Parameters

In [2]:
SEED = 42
TRAIN_SIZE = 0.7
data_under_analysis = "data/car_insurance_claim.csv"

independent_variables = list(range(1,25))
dependent_variables = 25

CV_SPLITS = 5

GLM_TRESHOLD = 0.5

## Auxiliar Functions

In [3]:
def score_results(y_real, y_predito, label):
    """Função personalizada que mostra os resultados de um classificador.

    Arguments:
        y_real -- O vetor com os dados esperados
        y_predito -- O vetor com as predições
        label -- Um título para essa análise, pode ser usado para facilitar a identificação do output."""

    matriz_de_confusao = sklearn.metrics.confusion_matrix(y_true = y_real, y_pred = y_predito)
    tn, fp, fn, tp = matriz_de_confusao.ravel()
    
    Resultados = namedtuple("Resultados", ["balanced_accuracy","precision","recall"])
    
    output = Resultados(balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_true = y_real, y_pred = y_predito),
                       precision = tp/(tp+fp),
                       recall    = tp/(tp+fn))

    
    print(f"--- {label} ---")
    print("Matrix de Confusão")
    print(matriz_de_confusao)
    print("Acurácia Balanceada: ", end=" ")
    print(f"{100*output.balanced_accuracy :.2f} % ")
    print(f"Falsos Positivos: {fp}, Falsos Negativos: {fn}\n"+
        f"Verdadeiros Positivos: {tp}, Verdadeiros Negativos: {tn}")
    print(f"Precisao (tp/(tp+fp)): {100* output.precision :.2f}%")
    print(f"Recall (tp/(tp+fn)): {100* output.recall :.2f}  %")

    print("-"*80)
    return output


## Loading Dataset

In [4]:
data = pd.read_csv(data_under_analysis)


In [5]:
# Linhas / Colunas
print(f"{data.shape[0]} linhas e {data.shape[1]} colunas")
print(pd.DataFrame(data.columns))
print(data.describe())
print(data.describe().shape[1]) # Colunas Númericas


10302 linhas e 27 colunas
             0
0           ID
1     KIDSDRIV
2        BIRTH
3          AGE
4     HOMEKIDS
5          YOJ
6       INCOME
7      PARENT1
8     HOME_VAL
9      MSTATUS
10      GENDER
11   EDUCATION
12  OCCUPATION
13    TRAVTIME
14     CAR_USE
15    BLUEBOOK
16         TIF
17    CAR_TYPE
18     RED_CAR
19    OLDCLAIM
20    CLM_FREQ
21     REVOKED
22     MVR_PTS
23     CLM_AMT
24     CAR_AGE
25  CLAIM_FLAG
26  URBANICITY
                 ID      KIDSDRIV           AGE      HOMEKIDS          YOJ  \
count  1.030200e+04  10302.000000  10295.000000  10302.000000  9754.000000   
mean   4.956631e+08      0.169288     44.837397      0.720443    10.474062   
std    2.864675e+08      0.506512      8.606445      1.116323     4.108943   
min    6.317500e+04      0.000000     16.000000      0.000000     0.000000   
25%    2.442869e+08      0.000000     39.000000      0.000000     9.000000   
50%    4.970043e+08      0.000000     45.000000      0.000000    11.000000   
75%    7

## Separating Data

### X e y

In [6]:
X_all = data.iloc[ : , independent_variables]
y_all = data.iloc[ : ,  dependent_variables ]

### Test and Validation

In [7]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_all, y_all, train_size = TRAIN_SIZE, random_state = SEED, stratify = y_all )

In [8]:
# Creating k_folder
stratified_k_fold = sklearn.model_selection.StratifiedKFold(n_splits = CV_SPLITS, random_state = SEED )

# Data Editing

A implementação de árvore do Sklearn só aceita variáveis numéricas. 

Nessa primeira etapa vamos considerar apenas elas, mas podemos tratar as variáveis categóricas no futuro.

In [9]:
numerical_X = X_train._get_numeric_data()
print(f"Selecionando apenas variáveis numéricas.\nFormato de X de {X_train.shape} para {numerical_X.shape}")
print(f"Redução de {X_train.shape[1] - numerical_X.shape[1]} variáveis.")

Selecionando apenas variáveis numéricas.
Formato de X de (7211, 24) para (7211, 9)
Redução de 15 variáveis.


In [10]:
# Removing nulls
numerical_X.fillna(numerical_X.mean(), inplace=True)    

# Analysis Code

## Decision Tree

In [11]:
cartTree = DecisionTreeClassifier()

In [12]:
pastas = stratified_k_fold.split(numerical_X, y_train)

In [13]:
tree_train_results = list()
tree_test_results = list()
for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = numerical_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = numerical_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting Tree")
    cartTree.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = cartTree.predict(X_train_cv)
    tree_train_results.append(score_results(y_train_cv, predicted_train, "CART Tree - Treinamento"))
        
    predicted_test = cartTree.predict(X_test_cv)
    tree_test_results.append(score_results(y_test_cv, predicted_test, "CART Tree - Teste"))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting Tree
--------------------------------------------------
--- CART Tree - Treinamento ---
Matrix de Confusão
[[4231    0]
 [   2 1535]]
Acurácia Balanceada:  99.93 % 
Falsos Positivos: 0, Falsos Negativos: 2
Verdadeiros Positivos: 1535, Verdadeiros Negativos: 4231
Precisao (tp/(tp+fp)): 100.00%
Recall (tp/(tp+fn)): 99.87  %
--------------------------------------------------------------------------------
--- CART Tree - Teste ---
Matrix de Confusão
[[787 271]
 [246 139]]
Acurácia Balanceada:  55.24 % 
Falsos Positivos: 271, Falsos Negativos: 246
Verdadeiros Positivos: 139, Verdadeiros Negativos: 787
Precisao (tp/(tp+fp)): 33.90%
Recall (tp/(tp+fn)): 36.10  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting Tree
----------------------------------

In [14]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_test_results])))

Test Balanced Accuraty: 99.96% 
Train Balanced Accuraty: 56.54% 

Test Precision: 100.00% 
Train Precision: 35.71% 

Test Recall: 99.92% 
Train Recall: 37.83% 


## GLM


In [15]:
pastas = stratified_k_fold.split(numerical_X, y_train)

In [16]:
glm_train_results = list()
glm_test_results = list()
for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = numerical_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = numerical_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Creating GLM")
    glm = sm.GLM(exog = X_train_cv, endog = y_train_cv, family = sm.families.Binomial())
    
    print("Fitting GLM")
    predictor_glm = glm.fit()
    print(predictor_glm.summary())
    print("-"*50)
    
    
    predicted_train_probs = predictor_glm.predict(X_train_cv)
    
    predicted_train = (predicted_train_probs > GLM_TRESHOLD)
    glm_train_results.append(score_results(y_train_cv, predicted_train, "GLM - Treinamento"))
        
    predicted_test_probs = predictor_glm.predict(X_test_cv)
    predicted_test =  (predicted_test_probs > GLM_TRESHOLD )
    glm_test_results.append(score_results(y_test_cv, predicted_test, "GLM - Teste"))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Creating GLM
Fitting GLM
                 Generalized Linear Model Regression Results                  
Dep. Variable:             CLAIM_FLAG   No. Observations:                 5768
Model:                            GLM   Df Residuals:                     5759
Model Family:                Binomial   Df Model:                            8
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -3055.6
Date:                Mon, 06 Jan 2020   Deviance:                       6111.1
Time:                        09:46:07   Pearson chi2:                 5.72e+03
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------

--------------------------------------------------
--- GLM - Treinamento ---
Matrix de Confusão
[[4027  204]
 [1294  244]]
Acurácia Balanceada:  55.52 % 
Falsos Positivos: 204, Falsos Negativos: 1294
Verdadeiros Positivos: 244, Verdadeiros Negativos: 4027
Precisao (tp/(tp+fp)): 54.46%
Recall (tp/(tp+fn)): 15.86  %
--------------------------------------------------------------------------------
--- GLM - Teste ---
Matrix de Confusão
[[1024   34]
 [ 322   62]]
Acurácia Balanceada:  56.47 % 
Falsos Positivos: 34, Falsos Negativos: 322
Verdadeiros Positivos: 62, Verdadeiros Negativos: 1024
Precisao (tp/(tp+fp)): 64.58%
Recall (tp/(tp+fn)): 16.15  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

Creating GLM
Fitting GLM
                 Generalized Linear Model Regression Results                  
Dep. Variable:             CLAIM_FLAG   No. Observations:     

In [18]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_test_results])))

Test Balanced Accuraty: 56.12% 
Train Balanced Accuraty: 56.06% 

Test Precision: 55.33% 
Train Precision: 55.33% 

Test Recall: 17.31% 
Train Recall: 17.27% 


## RBF

In [31]:
import importlib
import RBF

In [36]:

importlib.reload(RBF)

<module 'RBF' from 'C:\\Users\\barban01\\Desktop\\Projetos\\UFABC\\TCC\\projeto_pdg\\RBF.py'>

In [37]:
rbf_classifier = RBF.RBFClassifier(number_of_centers = 150, 
                               random_state = SEED, 
                               algorithm = LogisticRegression(class_weight="balanced", solver = "lbfgs"))

In [38]:
pastas = stratified_k_fold.split(numerical_X, y_train)

In [39]:
rbf_train_results = list()
rbf_test_results = list()
for i, (train_index, test_index) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    
    X_train_cv = numerical_X.iloc[train_index, : ]
    y_train_cv = y_train.iloc[train_index]
    
    X_test_cv = numerical_X.iloc[test_index, : ]
    y_test_cv = y_train.iloc[test_index]
    
    print("Fitting RBF Network")
    rbf_classifier.fit(X_train_cv, y_train_cv)
    print("-"*50)
    
    predicted_train = rbf_classifier.predict(X_train_cv)
    rbf_train_results.append(score_results(y_train_cv, predicted_train, "RBF Network - Treinamento"))
        
    predicted_test = rbf_classifier.predict(X_test_cv)
    rbf_test_results.append(score_results(y_test_cv, predicted_test, "RBF Network - Teste"))

--------------------------------------------------------------------------------
K Fold - Rodada 0

Fitting RBF Network
--------------------------------------------------
--- RBF Network - Treinamento ---
Matrix de Confusão
[[ 856 3375]
 [ 148 1389]]
Acurácia Balanceada:  55.30 % 
Falsos Positivos: 3375, Falsos Negativos: 148
Verdadeiros Positivos: 1389, Verdadeiros Negativos: 856
Precisao (tp/(tp+fp)): 29.16%
Recall (tp/(tp+fn)): 90.37  %
--------------------------------------------------------------------------------
--- RBF Network - Teste ---
Matrix de Confusão
[[159 899]
 [ 39 346]]
Acurácia Balanceada:  52.45 % 
Falsos Positivos: 899, Falsos Negativos: 39
Verdadeiros Positivos: 346, Verdadeiros Negativos: 159
Precisao (tp/(tp+fp)): 27.79%
Recall (tp/(tp+fn)): 89.87  %
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 1

Fitting RBF Network
--------------

In [43]:
print("Train Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_train_results])))
print("Test Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_test_results])))
print("")
print("Train Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_train_results])))
print("Test Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_test_results])))
print("")
print("Train Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_train_results])))
print("Test Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_test_results])))

Test Balanced Accuraty: 54.99% 
Train Balanced Accuraty: 53.02% 

Test Precision: 30.41% 
Train Precision: 28.50% 

Test Recall: 74.25% 
Train Recall: 72.73% 


# Results

In [49]:
print("TREES")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in tree_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in tree_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in tree_test_results])))

print("GLM")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in glm_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in glm_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in glm_test_results])))

print("RBF")
print("Balanced Accuraty: {:.2f}% ".format(100*np.mean([result.balanced_accuracy for result in rbf_test_results])))
print("Precision: {:.2f}% ".format(100*np.mean([result.precision for result in rbf_test_results])))
print("Recall: {:.2f}% ".format(100*np.mean([result.recall for result in rbf_test_results])))

TREES
Balanced Accuraty: 56.54% 
Precision: 35.71% 
Recall: 37.83% 
GLM
Balanced Accuraty: 56.06% 
Precision: 55.33% 
Recall: 17.27% 
RBF
Balanced Accuraty: 53.02% 
Precision: 28.50% 
Recall: 72.73% 
