# Projeto PDG

In [15]:
import pandas as pd;
import numpy as np; 
import sklearn.model_selection;
import sklearn.metrics;


from sklearn.tree import DecisionTreeClassifier; # Tree

# OBS: Scikit-Learn
# scikit-learn uses an optimised version of the CART algorithm; however, scikit-learn implementation does not support categorical variables for now.
# https://scikit-learn.org/stable/modules/tree.html



import statsmodels.api as sm; # GLM

In [4]:
seed = 42

In [37]:
data = pd.read_csv("data/car_insurance_claim.csv")

In [17]:
# Linhas / Colunas
forma = data.shape
print(f"{data.shape[0]} linhas e {data.shape[1]} colunas")
print(pd.DataFrame(data.columns))
print(data.describe())
print(data.describe().shape[1]) # Colunas Númericas

10302 linhas e 27 colunas
             0
0           ID
1     KIDSDRIV
2        BIRTH
3          AGE
4     HOMEKIDS
5          YOJ
6       INCOME
7      PARENT1
8     HOME_VAL
9      MSTATUS
10      GENDER
11   EDUCATION
12  OCCUPATION
13    TRAVTIME
14     CAR_USE
15    BLUEBOOK
16         TIF
17    CAR_TYPE
18     RED_CAR
19    OLDCLAIM
20    CLM_FREQ
21     REVOKED
22     MVR_PTS
23     CLM_AMT
24     CAR_AGE
25  CLAIM_FLAG
26  URBANICITY
                 ID      KIDSDRIV           AGE      HOMEKIDS          YOJ  \
count  1.030200e+04  10302.000000  10295.000000  10302.000000  9754.000000   
mean   4.956631e+08      0.169288     44.837397      0.720443    10.474062   
std    2.864675e+08      0.506512      8.606445      1.116323     4.108943   
min    6.317500e+04      0.000000     16.000000      0.000000     0.000000   
25%    2.442869e+08      0.000000     39.000000      0.000000     9.000000   
50%    4.970043e+08      0.000000     45.000000      0.000000    11.000000   
75%    7

In [114]:
def score_results(y_real, y_predito, label):
    matriz_de_confusao = sklearn.metrics.confusion_matrix(y_true = y_real, y_pred = y_predito)
    tn, fp, fn, tp = matriz_de_confusao.ravel()
    
    print(f"--- {label} ---")
    print("Matrix de Confusão")
    print(matriz_de_confusao)
    print("Balanced Accuracy: ", end=" ")
    print(f"{100*sklearn.metrics.balanced_accuracy_score(y_true = y_real, y_pred = y_predito):.2f}%")
    print(f"Falsos Positivos: {fp}, Falsos Negativos: {fn}\n"+
            f"Verdadeiros Positivos: {tp}, Verdadeiros Negativos: {tn}")
    print(f"Precisao (tp/(tp+fp)): {100*tp/(tp+fp) :.2f}%")
    print(f"Recall (tp/(tp+fn)): {100*tp/(tp+fn) :.2f}%")
    
    print("-"*50)

In [125]:
## Decision Tree
independent_variables = list(range(1,25))
dependent_variables = 25
cartTree = DecisionTreeClassifier()

X_data = data.iloc[ : , independent_variables]
y_data = data.iloc[ : , dependent_variables]

stratified_k_fold = sklearn.model_selection.StratifiedKFold(n_splits = 5, random_state = seed )

pastas = stratified_k_fold.split(X_data, y_data)

for i, (train, test) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    X_train = X_data.iloc[train, : ]
    y_train = y_data.iloc[train]

    X_test = X_data.iloc[test, : ]
    y_test = y_data.iloc[test]
    
    # Improve this part
    numerical_x_train = X_train._get_numeric_data()
    numerical_x_train.fillna(numerical_x_train.mean(), inplace=True)
    
    fit_result = cartTree.fit(numerical_x_train, y_train)
    print(fit_result)
    print("-"*50)
    
    predicted_train = cartTree.predict(numerical_x_train)
    score_results(y_train, predicted_train, "CART Tree - Treinamento")
    
    # Improve this part
    numerical_x_test = X_test._get_numeric_data()
    numerical_x_test.fillna(numerical_x_test.mean(), inplace=True)
    
    predicted_test = cartTree.predict(numerical_x_test)
    score_results(y_test, predicted_test, "CART Tree - Teste")


--------------------------------------------------------------------------------
K Fold - Rodada 0

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
--------------------------------------------------
--- CART Tree - Treinamento ---
Matrix de Confusão
[[6044    0]
 [   2 2194]]
Balanced Accuracy:  99.95%
Falsos Positivos: 0, Falsos Negativos: 2
Verdadeiros Positivos: 2194, Verdadeiros Negativos: 6044
Precisao (tp/(tp+fp)): 100.00%
Recall (tp/(tp+fn)): 99.91%
--------------------------------------------------
--- CART Tree - Teste ---
Matrix de Confusão
[[1129  383]
 [ 352  198]]
Balanced Accuracy:  55.33%
Falsos Positivos: 383, Falsos Ne

In [146]:
## GLM
independent_variables = list(range(1,25))
dependent_variables = 25

X_data = data.iloc[ : , independent_variables]
y_data = data.iloc[ : , dependent_variables]

stratified_k_fold = sklearn.model_selection.StratifiedKFold(n_splits = 5, random_state = seed )

pastas = stratified_k_fold.split(X_data, y_data)

for i, (train, test) in enumerate(pastas):
    print("-"*80)
    print(f"K Fold - Rodada {i}\n")
    X_train = X_data.iloc[train, : ]
    y_train = y_data.iloc[train]

    X_test = X_data.iloc[test, : ]
    y_test = y_data.iloc[test]
    
    # Improve this part
    numerical_x_train = X_train._get_numeric_data()
    numerical_x_train.fillna(numerical_x_train.mean(), inplace=True)    
    #################################################
    
    glm = sm.GLM(exog = numerical_x_train, endog =  y_train, family=sm.families.Binomial())
    
    predictor_glm = glm.fit()
    print(predictor_glm.summary())
    print("-"*50)

 
    treshold = 0.5
    predicted_train_probs = predictor_glm.predict(numerical_x_train)
    
    predicted_train = (predicted_train_probs > treshold)
    score_results(y_train, predicted_train, "GLM - Treinamento")

    # Improve this part
    numerical_x_test = X_test._get_numeric_data()
    numerical_x_test.fillna(numerical_x_test.mean(), inplace=True)
    ######################################################
    predicted_test_probs = predictor_glm.predict(numerical_x_test)
    predicted_test =  (predicted_test_probs > treshold )
    
    score_results(y_test, predicted_test, "GLM - Teste")
    





--------------------------------------------------------------------------------
K Fold - Rodada 0

                 Generalized Linear Model Regression Results                  
Dep. Variable:             CLAIM_FLAG   No. Observations:                 8240
Model:                            GLM   Df Residuals:                     8231
Model Family:                Binomial   Df Model:                            8
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -4351.4
Date:                Thu, 21 Nov 2019   Deviance:                       8702.8
Time:                        09:33:30   Pearson chi2:                 8.18e+03
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------

--------------------------------------------------
--- GLM - Treinamento ---
Matrix de Confusão
[[5754  291]
 [1818  379]]
Balanced Accuracy:  56.22%
Falsos Positivos: 291, Falsos Negativos: 1818
Verdadeiros Positivos: 379, Verdadeiros Negativos: 5754
Precisao (tp/(tp+fp)): 56.57%
Recall (tp/(tp+fn)): 17.25%
--------------------------------------------------
--- GLM - Teste ---
Matrix de Confusão
[[1447   64]
 [ 454   95]]
Balanced Accuracy:  56.53%
Falsos Positivos: 64, Falsos Negativos: 454
Verdadeiros Positivos: 95, Verdadeiros Negativos: 1447
Precisao (tp/(tp+fp)): 59.75%
Recall (tp/(tp+fn)): 17.30%
--------------------------------------------------
--------------------------------------------------------------------------------
K Fold - Rodada 4

                 Generalized Linear Model Regression Results                  
Dep. Variable:             CLAIM_FLAG   No. Observations:                 8242
Model:                            GLM   Df Residuals:                     8233
M

In [None]:
# RBFN
## Setup

## Train
## Test


# Compare results


In [134]:
?sm.GLM