In [63]:
# Imports
import pandas as pd;
import numpy as np; 
import sklearn.model_selection;
import sklearn.metrics;
import sklearn.linear_model; # LogisticRegressor
# From Author
import RBF

# Auxiliar
from collections import namedtuple


In [78]:
# Configurando IPython
from IPython.display import display
pd.options.display.max_columns = None

In [2]:
SEED = 42
TRAIN_SIZE = 0.7
data_under_analysis = "data/car_insurance_claim.csv"

independent_variables = ["KIDSDRIV", "BIRTH", 
                            "AGE", "HOMEKIDS", 
                            "YOJ", "INCOME", 
                            "PARENT1", "HOME_VAL", 
                            "MSTATUS", "GENDER", 
                            "EDUCATION", "OCCUPATION", 
                            "TRAVTIME", "CAR_USE", 
                            "BLUEBOOK", "TIF", 
                            "CAR_TYPE", "RED_CAR", 
                            "OLDCLAIM",  "REVOKED", 
                            "MVR_PTS", "CAR_AGE"]

dependent_variable_flag = ["CLAIM_FLAG"]
dependent_variable_frequency = ["CLM_FREQ"]
dependent_variable_value = ["CLM_AMT"]

CV_SPLITS = 5

GLM_TRESHOLD = 0.5

In [3]:
def score_results(y_real, y_predito, label, verbose = False):
    """Função personalizada que mostra os resultados de um classificador.

    Arguments:
        y_real -- O vetor com os dados esperados
        y_predito -- O vetor com as predições
        label -- Um título para essa análise, pode ser usado para facilitar a identificação do output."""

    matriz_de_confusao = sklearn.metrics.confusion_matrix(y_true = y_real, y_pred = y_predito)
    try:
        tn, fp, fn, tp = matriz_de_confusao.ravel()
    except ValueError:
        print("ValueError: Imprimindo matriz de confusão.")
        print(matriz_de_confusao)
    
    Resultados = namedtuple("Resultados", ["balanced_accuracy","precision","recall", "f1", "confusion_matrix"])
    
    balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_true = y_real, y_pred = y_predito)
    precision = tp/(tp+fp)
    recall    = tp/(tp+fn)
    f1 = 2*(precision * recall) / (precision + recall)
    output = Resultados(balanced_accuracy = balanced_accuracy,
                       precision = precision,
                       recall    = recall,
                       f1 =  f1,
                        
                       confusion_matrix = matriz_de_confusao)

    if verbose:
        print(f"--- {label} ---")
        print("Matrix de Confusão")
        print(matriz_de_confusao)
        print("Acurácia Balanceada: ", end=" ")
        print(f"{100*output.balanced_accuracy :.2f} % ")
        print(f"Falsos Positivos: {fp}, Falsos Negativos: {fn}\n"+
            f"Verdadeiros Positivos: {tp}, Verdadeiros Negativos: {tn}")
        print(f"Precisao (tp/(tp+fp)): {100* output.precision :.2f}%")
        print(f"Recall (tp/(tp+fn)): {100* output.recall :.2f}  %")

        print("-"*80)
    return output


In [4]:
def imprime_matriz_de_confusao_media(test_results):
    tns, tps, fps, fns = list(), list(), list(), list()
    for index, result in enumerate(test_results):
        confusion_matrix = result.confusion_matrix
        #print(f"Fit {index}")
        #print(confusion_matrix)
        tnl, fpl, fnl, tpl = confusion_matrix.ravel()
        tns.append(tnl)
        fps.append(fpl)
        fns.append(fnl)
        tps.append(tpl)

    tn = np.sum(tns)
    tp = np.sum(tps)
    fp = np.sum(fps)
    fn = np.sum(fns)
    
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2*(precision*recall) / (precision+recall)
    print("Médias")
    print(f"Total Real Positives: {tp+fn :.2f}\nTotal Real Negatives: {tn+fp:.2f}\n" +
          f"Total Predicted Positives: {tp+fp:.2f}\nTotal Predicted Negatives: {tn+fn:.2f}\n" +
          "\n"
          f"True Positives: {tp:.2f}\nTrue Negatives: {tn:.2f}\nFalse Positives: {fp:.2f}\nFalse Negatives: {fn:.2f}")
    print("")
    print(f"Precision: { precision :.2f}\nRecall: { recall :.2f}")
    print(f"Accuracy Positive(Recall): {tp/(tp+fn):.2f}\nAccuracy Negative: {tn/(tn+fp):.2f}")
    print(f"Balanced Accuracy: {((tp/(tp+fn)) + (tn/(tn+fp)))/2 :.2f}")
    print(f"F1 Score: {f1 : .2f}")
    print("")
 



In [5]:
data = pd.read_csv(data_under_analysis)
X_all = data.loc[ : , independent_variables]
y_all = data.loc[ : , dependent_variable_flag]

In [6]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_all, y_all, train_size = TRAIN_SIZE, random_state = SEED, stratify = y_all )

In [7]:
numerical_X = X_train._get_numeric_data()
print(f"Selecionando apenas variáveis numéricas.\nFormato de X de {X_train.shape} para {numerical_X.shape}")
print(f"Redução de {X_train.shape[1] - numerical_X.shape[1]} variáveis.")

Selecionando apenas variáveis numéricas.
Formato de X de (7211, 22) para (7211, 8)
Redução de 14 variáveis.


In [16]:
money_variables = ["INCOME", "BLUEBOOK", "HOME_VAL", "OLDCLAIM"]
money_X =  X_train[money_variables].replace('[\$,]', '', regex=True, inplace = False).astype(float)
money_X.fillna(money_X.mean(), inplace = True)

# Testando as variáveis do dinheiro numa regressão logística.

In [31]:
lr = sklearn.linear_model.LogisticRegression(class_weight="balanced", solver = "lbfgs")
lr.fit(money_X, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
predictions = lr.predict(money_X)
balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_true = y_train, y_pred = predictions)
recall = sklearn.metrics.recall_score(y_true = y_train, y_pred = predictions)
precision = sklearn.metrics.precision_score(y_true = y_train, y_pred = predictions)

print("Balanced Accuray: {:.2f}%\nRecall: {:.2f}%\nPrecision: {:.2f}%".format(100*balanced_accuracy, 100*recall, 100*precision))

Balanced Accuray: 60.58%
Recall: 47.71%
Precision: 39.51%


# Testando as variáveis do dinheiro e numéricas juntas, numa regressão logística

In [32]:
# Replacing NULLS with the mean
numerical_X.fillna(numerical_X.mean(), inplace=True)    
money_X.fillna(money_X.mean(), inplace = True)

In [36]:
features_X = numerical_X.join(money_X, on = None)

In [42]:
print(numerical_X.shape)
print(money_X.shape)
print(features_X.shape)
print(y_train.shape)

(7211, 8)
(7211, 4)
(7211, 12)
(7211, 1)


In [43]:
lr = sklearn.linear_model.LogisticRegression(class_weight="balanced", solver = "lbfgs")
lr.fit(features_X, y_train)


  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
predictions = lr.predict(features_X)

In [46]:
balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_true = y_train, y_pred = predictions)
recall = sklearn.metrics.recall_score(y_true = y_train, y_pred = predictions)
precision = sklearn.metrics.precision_score(y_true = y_train, y_pred = predictions)

print("Balanced Accuray: {:.2f}%\nRecall: {:.2f}%\nPrecision: {:.2f}%".format(100*balanced_accuracy, 100*recall, 100*precision))

Balanced Accuray: 61.28%
Recall: 60.35%
Precision: 36.72%


In [57]:
def testaModelo(algoritmo, variaveis_X, y):
    algoritmo.fit(variaveis_X, y.values.ravel())
    predictions = algoritmo.predict(variaveis_X)
    
    balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_true = y_train, y_pred = predictions)
    recall = sklearn.metrics.recall_score(y_true = y_train, y_pred = predictions)
    precision = sklearn.metrics.precision_score(y_true = y_train, y_pred = predictions)

    print("Balanced Accuray: {:.2f}%\nRecall: {:.2f}%\nPrecision: {:.2f}%".format(100*balanced_accuracy, 100*recall, 100*precision))
    

In [61]:
lr = sklearn.linear_model.LogisticRegression(class_weight="balanced", solver = "lbfgs")
print("\n>> Money")
testaModelo(lr, money_X, y_train)
print("\n>> Numerical")
testaModelo(lr, numerical_X, y_train)
print("\n>> Features")
testaModelo(lr, features_X, y_train)



>> Money
Balanced Accuray: 60.58%
Recall: 47.71%
Precision: 39.51%

>> Numerical
Balanced Accuray: 62.33%
Recall: 57.44%
Precision: 38.90%

>> Features
Balanced Accuray: 61.28%
Recall: 60.35%
Precision: 36.72%




In [65]:
rbf_classifier = RBF.RBFClassifier(number_of_centers = 50, 
                               random_state = SEED, 
                               algorithm = sklearn.linear_model.LogisticRegression(class_weight="balanced", solver = "lbfgs"))

In [66]:
print("\n>> Money")
testaModelo(rbf_classifier, money_X, y_train)
print("\n>> Numerical")
testaModelo(rbf_classifier, numerical_X, y_train)
print("\n>> Features")
testaModelo(rbf_classifier, features_X, y_train)



>> Money


  'precision', 'predicted', average, warn_for)


Balanced Accuray: 50.00%
Recall: 0.00%
Precision: 0.00%

>> Numerical
Balanced Accuray: 54.74%
Recall: 88.03%
Precision: 28.94%

>> Features
Balanced Accuray: 50.00%
Recall: 0.00%
Precision: 0.00%


  'precision', 'predicted', average, warn_for)


Podemos perceber que a RBF está com problemas. Vamos usá-la só para transformar os dados e ver o que está acontecendo.

In [81]:
rbf_classifierNum = RBF.RBFClassifier(number_of_centers = 50, 
                               random_state = SEED, 
                               algorithm = sklearn.linear_model.LogisticRegression(class_weight="balanced", solver = "lbfgs"))
rbf_classifierNum.fit(numerical_X, y_train)
rbf_numerical_X = rbf_classifierNum._transformed_inputs(numerical_X)

  y = column_or_1d(y, warn=True)


In [93]:
rbf_classifierMon = RBF.RBFClassifier(number_of_centers = 50, 
                               random_state = SEED, 
                               algorithm = sklearn.linear_model.LogisticRegression(class_weight="balanced", solver = "lbfgs"))
rbf_classifierMon.fit(money_X, y_train.values.ravel())
rbf_money_X = rbf_classifierMon._transformed_inputs(money_X)

In [94]:
rbf_classifierFeat = RBF.RBFClassifier(number_of_centers = 50, 
                               random_state = SEED, 
                               algorithm = sklearn.linear_model.LogisticRegression(class_weight="balanced", solver = "lbfgs"))
rbf_classifierFeat.fit(features_X,  y_train.values.ravel())
rbf_features_X = rbf_classifierFeat._transformed_inputs(features_X)

In [110]:
print(f"Numerical: \t{rbf_numerical_X.max():.2e}")
print(f"Money:   \t{rbf_money_X.max() :.2e}")
print(f"Features: \t{rbf_features_X.max():.2e}")

Numerical: 	4.33e-02
Money:   	1.10e-08
Features: 	1.38e-08


Os máximos estão muito baixos. Money e Features podem estar sendo considerados Zeros. Vamos tentar aumentar o valor das variáveis e testar os modelos.

In [114]:
lr = sklearn.linear_model.LogisticRegression(class_weight="balanced", solver = "lbfgs")
print("\n>> Numerical")
testaModelo(lr, rbf_numerical_X, y_train)
print("\n>> Money")
testaModelo(lr, 10e10*rbf_money_X, y_train)
print("\n>> Features")
testaModelo(lr, 10e10*rbf_features_X, y_train)


>> Numerical
Balanced Accuray: 54.74%
Recall: 88.03%
Precision: 28.94%

>> Money
Balanced Accuray: 57.99%
Recall: 78.41%
Precision: 31.34%

>> Features
Balanced Accuray: 57.63%
Recall: 77.78%
Precision: 31.13%




Parece que funcionou. 

Então precisamos aumentar o valor após a transformação RBF. Talvez se normalizarmos os dados, possamos corrigir esse problema.

In [137]:
def testaModeloComNormalizacao(algoritmo, variaveis_X, y):
    normalized_X = sklearn.preprocessing.normalize(money_X, norm='l2', axis=1, copy=True, return_norm=False)
    algoritmo.fit(normalized_X, y.values.ravel())
    predictions = algoritmo.predict(normalized_X)
    
    balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_true = y_train, y_pred = predictions)
    recall = sklearn.metrics.recall_score(y_true = y_train, y_pred = predictions)
    precision = sklearn.metrics.precision_score(y_true = y_train, y_pred = predictions)

    print("Balanced Accuray: {:.2f}%\nRecall: {:.2f}%\nPrecision: {:.2f}%".format(100*balanced_accuracy, 100*recall, 100*precision))
    

In [138]:
rbf = RBF.RBFClassifier(number_of_centers = 50, 
                               random_state = SEED, 
                               algorithm = sklearn.linear_model.LogisticRegression(class_weight="balanced", solver = "lbfgs"))

print("\n>> Numerical Sem Normalizacao")
testaModelo(rbf, numerical_X, y_train)
print("\n>> Numerical Com Normalizacao")
testaModeloComNormalizacao(rbf, numerical_X, y_train)
print("\n>> Money")
testaModeloComNormalizacao(rbf, money_X, y_train)
print("\n>> Features")
testaModeloComNormalizacao(rbf, features_X, y_train)


>> Numerical Sem Normalizacao
Balanced Accuray: 54.74%
Recall: 88.03%
Precision: 28.94%

>> Numerical Com Normalizacao




Balanced Accuray: 59.57%
Recall: 66.23%
Precision: 33.82%

>> Money




Balanced Accuray: 59.57%
Recall: 66.23%
Precision: 33.82%

>> Features
Balanced Accuray: 59.57%
Recall: 66.23%
Precision: 33.82%


