In [1]:
# Imports
import pandas as pd;
import numpy as np; 
import sklearn.model_selection;
import sklearn.metrics;

# OBS: Scikit-Learn
# scikit-learn uses an optimised version of the CART algorithm; however, scikit-learn implementation does not support categorical variables for now.
# https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier; # Tree



# WARNING: the statsmodels API is different from the scikit-learn API.
import statsmodels.api as sm; # GLM


# From Author
from RBF import RBFClassifier
from sklearn.linear_model import LogisticRegression
# Auxiliary
from collections import namedtuple


In [2]:
# --- Reproducibility and splitting -------------------------------------------
SEED = 42          # random_state handed to the splitters and to the classifier
TRAIN_SIZE = 0.7   # fraction of the rows kept for training by train_test_split
CV_SPLITS = 5      # number of folds for the stratified k-fold object

# --- Input data --------------------------------------------------------------
data_under_analysis = "data/car_insurance_claim.csv"

# Predictor columns read from the CSV.
independent_variables = [
    "KIDSDRIV",
    "BIRTH",
    "AGE",
    "HOMEKIDS",
    "YOJ",
    "INCOME",
    "PARENT1",
    "HOME_VAL",
    "MSTATUS",
    "GENDER",
    "EDUCATION",
    "OCCUPATION",
    "TRAVTIME",
    "CAR_USE",
    "BLUEBOOK",
    "TIF",
    "CAR_TYPE",
    "RED_CAR",
    "OLDCLAIM",
    "REVOKED",
    "MVR_PTS",
    "CAR_AGE",
]

# Candidate target columns. Each one is kept as a single-item list, so selecting
# with it yields a one-column DataFrame rather than a Series.
dependent_variable_flag = ["CLAIM_FLAG"]
dependent_variable_frequency = ["CLM_FREQ"]
dependent_variable_value = ["CLM_AMT"]

# Presumably the probability cut-off applied to GLM predictions; it is not used
# in the cells visible here -- TODO confirm.
GLM_TRESHOLD = 0.5

In [3]:
def score_results(y_real, y_predito, label, verbose = False):
    """Score the predictions of a binary classifier and optionally print a report.

    Arguments:
        y_real -- The vector with the expected (true) labels.
        y_predito -- The vector with the predicted labels.
        label -- A title for this analysis; printed as the report header when
                 ``verbose`` is true.
        verbose -- When true, print the confusion matrix and the main metrics.

    Returns:
        A ``Resultados`` namedtuple with the fields ``balanced_accuracy``,
        ``precision``, ``recall``, ``f1`` and ``confusion_matrix``.
        ``precision``, ``recall`` and ``f1`` are 0.0 when their denominator is
        zero (e.g. no positive predictions), instead of ``nan``.

    Raises:
        ValueError -- If the confusion matrix is not 2x2, i.e. the labels found
                      in ``y_real`` and ``y_predito`` do not form exactly two classes.
    """

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 rather than nan plus a runtime warning.
        return numerator / denominator if denominator else 0.0

    matriz_de_confusao = sklearn.metrics.confusion_matrix(y_true = y_real, y_pred = y_predito)
    if matriz_de_confusao.shape != (2, 2):
        # Keep the diagnostic print, then stop with a clear error: the metrics
        # below are only defined for a 2x2 matrix.
        print("ValueError: Imprimindo matriz de confusão.")
        print(matriz_de_confusao)
        raise ValueError(
            "score_results expects a binary problem (2x2 confusion matrix), "
            f"got a matrix of shape {matriz_de_confusao.shape}"
        )
    tn, fp, fn, tp = matriz_de_confusao.ravel()

    Resultados = namedtuple("Resultados", ["balanced_accuracy","precision","recall", "f1", "confusion_matrix"])

    balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_true = y_real, y_pred = y_predito)
    precision = _ratio(tp, tp + fp)
    recall    = _ratio(tp, tp + fn)
    f1 = _ratio(2 * (precision * recall), precision + recall)
    output = Resultados(balanced_accuracy = balanced_accuracy,
                        precision = precision,
                        recall    = recall,
                        f1 = f1,
                        confusion_matrix = matriz_de_confusao)

    if verbose:
        print(f"--- {label} ---")
        print("Matrix de Confusão")
        print(matriz_de_confusao)
        print("Acurácia Balanceada: ", end=" ")
        print(f"{100*output.balanced_accuracy :.2f} % ")
        print(f"Falsos Positivos: {fp}, Falsos Negativos: {fn}\n"+
            f"Verdadeiros Positivos: {tp}, Verdadeiros Negativos: {tn}")
        print(f"Precisao (tp/(tp+fp)): {100* output.precision :.2f}%")
        print(f"Recall (tp/(tp+fn)): {100* output.recall :.2f}  %")

        print("-"*80)
    return output


In [4]:
def imprime_matriz_de_confusao_media(test_results):
    """Print a report built from the pooled confusion matrices of several results.

    Arguments:
        test_results -- An iterable of objects exposing a ``confusion_matrix``
                        attribute whose ``ravel()`` unpacks into exactly four
                        values, read as (tn, fp, fn, tp).

    The four cells are summed over all results; precision, recall, the
    per-class accuracies, balanced accuracy and F1 are computed from those
    totals and printed. Nothing is returned.
    """
    # Collect the four cells of every matrix first, then total each position.
    cells_per_result = []
    for single_result in test_results:
        cell_tn, cell_fp, cell_fn, cell_tp = single_result.confusion_matrix.ravel()
        cells_per_result.append((cell_tn, cell_fp, cell_fn, cell_tp))

    tn = np.sum([cells[0] for cells in cells_per_result])
    fp = np.sum([cells[1] for cells in cells_per_result])
    fn = np.sum([cells[2] for cells in cells_per_result])
    tp = np.sum([cells[3] for cells in cells_per_result])

    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2*(precision*recall) / (precision+recall)

    print("Médias")
    print(f"Total Real Positives: {tp+fn :.2f}\nTotal Real Negatives: {tn+fp:.2f}\n" +
          f"Total Predicted Positives: {tp+fp:.2f}\nTotal Predicted Negatives: {tn+fn:.2f}\n" +
          "\n"
          f"True Positives: {tp:.2f}\nTrue Negatives: {tn:.2f}\nFalse Positives: {fp:.2f}\nFalse Negatives: {fn:.2f}")
    print("")
    print(f"Precision: { precision :.2f}\nRecall: { recall :.2f}")
    print(f"Accuracy Positive(Recall): {tp/(tp+fn):.2f}\nAccuracy Negative: {tn/(tn+fp):.2f}")
    print(f"Balanced Accuracy: {((tp/(tp+fn)) + (tn/(tn+fp)))/2 :.2f}")
    print(f"F1 Score: {f1 : .2f}")
    print("")
 



In [6]:
# Read the raw CSV, then split it column-wise into predictors and the claim-flag target.
# Both selections use a list of column names, so both results are DataFrames.
data = pd.read_csv(data_under_analysis)
X_all = data[independent_variables]
y_all = data[dependent_variable_flag]

In [7]:
# Stratified hold-out split: the class proportions of y_all are preserved in both parts.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X_all,
    y_all,
    train_size = TRAIN_SIZE,
    random_state = SEED,
    stratify = y_all,
)

In [8]:
# Creating the stratified k-fold splitter.
# random_state only has an effect when shuffle=True; recent scikit-learn versions
# raise a ValueError when random_state is set while shuffle is left at False.
stratified_k_fold = sklearn.model_selection.StratifiedKFold(n_splits = CV_SPLITS, shuffle = True, random_state = SEED)

In [9]:
# Keep only the numeric (and boolean) columns of the training features.
# select_dtypes is the public pandas API for this; the private _get_numeric_data()
# may change without notice. The .copy() makes numerical_X an independent frame,
# so imputing it later cannot touch X_train or trigger SettingWithCopyWarning.
numerical_X = X_train.select_dtypes(include = ["number", "bool"]).copy()
print(f"Selecionando apenas variáveis numéricas.\nFormato de X de {X_train.shape} para {numerical_X.shape}")
print(f"Redução de {X_train.shape[1] - numerical_X.shape[1]} variáveis.")

Selecionando apenas variáveis numéricas.
Formato de X de (7211, 22) para (7211, 8)
Redução de 14 variáveis.


In [13]:
# Monetary columns are stored as text such as "$73,663": strip the currency
# symbol and the thousands separators, then convert to float.
money_variables = ["INCOME", "BLUEBOOK", "HOME_VAL", "OLDCLAIM"]
money_X = (
    X_train[money_variables]
    # Raw string: '\$' in a normal string literal is an invalid escape sequence.
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)
money_X

Unnamed: 0,INCOME,BLUEBOOK,HOME_VAL,OLDCLAIM
878,73663.0,41310.0,268990.0,36700.0
5358,34669.0,8630.0,146134.0,2499.0
1433,23427.0,6640.0,98357.0,17308.0
2769,54593.0,11260.0,221267.0,1449.0
3326,55770.0,12380.0,163735.0,0.0
...,...,...,...,...
152,104990.0,16080.0,306540.0,0.0
6870,152283.0,16650.0,441904.0,0.0
9663,8585.0,13540.0,0.0,4451.0
1431,,19600.0,240520.0,3220.0


In [26]:
money_X.isna().sum()

INCOME      388
BLUEBOOK      0
HOME_VAL    409
OLDCLAIM      0
dtype: int64

In [14]:
# Reload the local RBF module so that edits to its source are picked up
# without restarting the kernel.
# NOTE: `import RBF` here is also what binds the module name `RBF` used by the
# later `RBF.RBFClassifier(...)` call; the import cell only does
# `from RBF import RBFClassifier`.
import importlib
import RBF
importlib.reload(RBF)

<module 'RBF' from 'C:\\Users\\barban01\\Desktop\\Projetos\\UFABC\\TCC\\projeto_pdg\\RBF.py'>

In [None]:
# Replace missing values with the column means.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run and avoids
# modifying a frame that may share data with X_train (SettingWithCopyWarning).
numerical_X = numerical_X.fillna(numerical_X.mean())
money_X = money_X.fillna(money_X.mean())

Variáveis BOAS(Sozinhas): 
INCOME
BLUEBOOK
HOME_VAL
OLDCLAIM
Variaveis RUINS: 


In [65]:
# Monetary feature that will be joined to the numeric features.
# A previous assignment selecting ["BLUEBOOK", "OLDCLAIM"] was overwritten on the
# very next line and never took effect, so only the BLUEBOOK selection is kept.
teste = money_X["BLUEBOOK"]

In [66]:
# Index-aligned join of the numeric features with the selected monetary column,
# followed by mean imputation of whatever is still missing in the joined frame.
features_X = (
    numerical_X
    .join(teste)
    .pipe(lambda joined: joined.fillna(joined.mean()))
)

In [67]:
# Sanity check: remaining missing values and dtypes of the final feature frame.
print(features_X.isna().sum())
print(features_X.dtypes)
# mean must be *called*: passing the bound method `features_X.mean` would use the
# method object itself as the fill value instead of the column means.
features_X = features_X.fillna(features_X.mean())


KIDSDRIV    0
AGE         0
HOMEKIDS    0
YOJ         0
TRAVTIME    0
TIF         0
MVR_PTS     0
CAR_AGE     0
BLUEBOOK    0
dtype: int64
KIDSDRIV      int64
AGE         float64
HOMEKIDS      int64
YOJ         float64
TRAVTIME      int64
TIF           int64
MVR_PTS       int64
CAR_AGE     float64
BLUEBOOK    float64
dtype: object


In [68]:
# Build the RBF classifier from the local RBF module. A class-balanced logistic
# regression is passed as its `algorithm` argument -- presumably the model fitted
# on top of the RBF features; confirm against RBF.py.
rbf_classifier = RBF.RBFClassifier(
    algorithm = LogisticRegression(solver = "lbfgs", class_weight = "balanced"),
    number_of_centers = 50,
    random_state = SEED,
)

In [69]:
print("Fitting RBF Network")
# y_train is a one-column DataFrame; passing its single column as a 1-D Series
# avoids the "column-vector y was passed" conversion warning (column_or_1d).
# NOTE(review): assumes RBFClassifier.fit accepts a pandas Series for y -- confirm against RBF.py.
rbf_classifier.fit(features_X, y_train.iloc[:, 0])

Fitting RBF Network


  y = column_or_1d(y, warn=True)


In [70]:
# features_X and y_train both come from the training split, so this is the
# in-sample (training) score, not a test score. The report label says so.
# The variable name `predicted_test` is kept unchanged in case later cells use it.
predicted_test = rbf_classifier.predict(features_X)
resultados = score_results(y_train, predicted_test, "RBF Network - Treino", verbose = True)

--- RBF Network - Teste ---
Matrix de Confusão
[[2947 2342]
 [ 858 1064]]
Acurácia Balanceada:  55.54 % 
Falsos Positivos: 2342, Falsos Negativos: 858
Verdadeiros Positivos: 1064, Verdadeiros Negativos: 2947
Precisao (tp/(tp+fp)): 31.24%
Recall (tp/(tp+fn)): 55.36  %
--------------------------------------------------------------------------------
