In [49]:
from RBF import RBFClassifier

# Sugestões
* Replicar o que ele fez usando minha implementação: \
http://mccormickml.com/2013/08/15/radial-basis-function-network-rbfn-tutorial/
* Normalizar os dados antes da clusterização.
* Normalizar os dados transformados depois da clusterização.


# Preparacoes

In [44]:
import pandas as pd;
import numpy as np; 
import sklearn.model_selection;
import sklearn.metrics;
from sklearn.tree import DecisionTreeClassifier; # Tree
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm; # GLM


In [9]:
data = pd.read_csv("data/car_insurance_claim.csv")
print(f"{data.shape[0]} linhas e {data.shape[1]} colunas.")

independent_variables = list(range(1,25))
dependent_variables = 25

numerical_x_train = data.iloc[ : , independent_variables]._get_numeric_data()
numerical_x_train.fillna(numerical_x_train.mean(), inplace = True)

y_train = data.iloc[ : , dependent_variables]

print(f"X: {numerical_x_train.shape[0]} linhas e {numerical_x_train.shape[1]} colunas.")
print(f"Y: {y_train.shape} linhas. Valores Únicos = {y_train.unique()}.")

10302 linhas e 27 colunas.
X: 10302 linhas e 9 colunas.
Y: (10302,) linhas. Valores Únicos = [0 1].


In [53]:
def score_results(y_real, y_predito, label):
    """Função personalizada que scora os resultados.
    
    Arguments:
        y_real -- O vetor com os dados esperados
        y_predito -- O vetor com as predições
        label -- Um título para essa análise, pode ser usado para facilitar a identificação do output."""
    matriz_de_confusao = sklearn.metrics.confusion_matrix(y_true = y_real, y_pred = y_predito)
    tn, fp, fn, tp = matriz_de_confusao.ravel()
    
    print(f"--- {label} ---")
    print("Matrix de ConfusÃ£o")
    print(matriz_de_confusao)
    print("Balanced Accuracy: ", end=" ")
    print(f"{100*sklearn.metrics.balanced_accuracy_score(y_true = y_real, y_pred = y_predito):.2f}%")
    print(f"Falsos Positivos: {fp}, Falsos Negativos: {fn}\n"+
            f"Verdadeiros Positivos: {tp}, Verdadeiros Negativos: {tn}")
    print(f"Precisao (tp/(tp+fp)): {100*tp/(tp+fp) :.2f}%")
    print(f"Recall (tp/(tp+fn)): {100*tp/(tp+fn) :.2f}%")
    
    print("-"*50)


In [11]:
seed = 42
data = pd.read_csv("data/car_insurance_claim.csv")

# Linhas / Colunas
forma = data.shape
print(f"{data.shape[0]} linhas e {data.shape[1]} colunas")
# print(pd.DataFrame(data.columns))
# print(data.describe())
# print(data.describe().shape[1]) # Colunas NÃºmericas

10302 linhas e 27 colunas


# RBF - Iris

In [6]:
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [7]:
rbf = RBFClassifier(number_of_centers = 6, random_state = 42)

In [8]:
rbf.fit(X,y)
rbf.predict(X)



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0,
       2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2,
       2, 0, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2,
       2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
sklearn.metrics.confusion_matrix(y_true = y, y_pred =  rbf.predict(X))

array([[38,  0, 12],
       [ 0, 26, 24],
       [ 0,  5, 45]], dtype=int64)

In [10]:
print("SHAPE: ", rbf._transformed_inputs(X).shape, "\nVALORES: \n")
pd.DataFrame(rbf._transformed_inputs(X))

SHAPE:  (150, 6) 
VALORES: 



Unnamed: 0,0,1,2,3,4,5
0,5.093012e-199,5.449745e-02,6.451152e-28,7.825245e-104,3.463640e-211,1.770657e+00
1,3.357470e-204,2.140315e+00,1.989657e-26,1.904048e-104,4.739463e-215,1.826954e-04
2,1.767789e-216,3.587837e+00,9.620697e-30,6.480938e-114,3.224404e-229,2.609394e-04
3,4.293419e-208,3.587837e+00,1.322720e-26,3.031612e-106,2.175278e-217,1.015419e-05
4,1.963044e-202,4.266805e-02,7.431763e-29,1.884761e-106,8.191883e-215,1.355269e+00
...,...,...,...,...,...,...
145,2.136455e-11,9.344493e-148,3.564441e-15,3.740128e-07,3.023605e-01,2.395721e-157
146,5.975278e-21,2.120152e-121,4.732290e-08,3.294963e-02,2.929409e-06,2.904010e-134
147,3.914926e-13,3.112494e-135,7.935221e-12,1.457011e-03,4.700911e-01,1.804970e-144
148,1.685973e-15,6.440729e-147,1.721996e-15,5.807888e-09,2.993938e-02,7.160280e-157


# RBF - My Dataset

In [45]:
rbf_classifier = RBFClassifier(number_of_centers = 150, 
                               random_state = seed, 
                               algorithm = LogisticRegression(class_weight="balanced"))
rbf_classifier.fit(numerical_x_train, y_train)
rbf_pred = rbf_classifier.predict(numerical_x_train)
rbf_pred



array([0, 1, 0, ..., 1, 1, 1], dtype=int64)

In [52]:
?score_results()

Object `score_results()` not found.


In [34]:
print("SHAPE: ", rbf_classifier._transformed_inputs(numerical_x_train).shape, "\nVALORES: \n")
transformed_X = pd.DataFrame(rbf_classifier._transformed_inputs(numerical_x_train))

SHAPE:  (10302, 150) 
VALORES: 



In [35]:
rbf_classifier.algorithm.score(transformed_X, y_train)

0.7334498155697923

In [64]:
import sklearn.dummy
balanced_dummy = sklearn.dummy.DummyClassifier(strategy = "stratified").fit(numerical_x_train, y_train)
most_frequent_dummy = sklearn.dummy.DummyClassifier(strategy = "most_frequent").fit(numerical_x_train, y_train)


In [67]:
balanced_predictions = balanced_dummy.predict(numerical_x_train)
most_frequent_predictions = most_frequent_dummy.predict(numerical_x_train)


In [69]:
score_results(y_train, rbf_pred, "Árvore RBF")
score_results(y_train, balanced_predictions, "Dummy - Stratified")
score_results(y_train, most_frequent_predictions, "Dummy - most_frequent")


--- Árvore RBF ---
Matrix de ConfusÃ£o
[[2066 5490]
 [ 416 2330]]
Balanced Accuracy:  56.10%
Falsos Positivos: 5490, Falsos Negativos: 416
Verdadeiros Positivos: 2330, Verdadeiros Negativos: 2066
Precisao (tp/(tp+fp)): 29.80%
Recall (tp/(tp+fn)): 84.85%
--------------------------------------------------
--- Dummy - Stratified ---
Matrix de ConfusÃ£o
[[5586 1970]
 [1990  756]]
Balanced Accuracy:  50.73%
Falsos Positivos: 1970, Falsos Negativos: 1990
Verdadeiros Positivos: 756, Verdadeiros Negativos: 5586
Precisao (tp/(tp+fp)): 27.73%
Recall (tp/(tp+fn)): 27.53%
--------------------------------------------------
--- Dummy - most_frequent ---
Matrix de ConfusÃ£o
[[7556    0]
 [2746    0]]
Balanced Accuracy:  50.00%
Falsos Positivos: 0, Falsos Negativos: 2746
Verdadeiros Positivos: 0, Verdadeiros Negativos: 7556
Precisao (tp/(tp+fp)): nan%
Recall (tp/(tp+fn)): 0.00%
--------------------------------------------------


