In [1]:
from RBF import RBFClassifier

# Sugestões
* Replicar, usando minha implementação, o que o autor fez neste tutorial: \
http://mccormickml.com/2013/08/15/radial-basis-function-network-rbfn-tutorial/
* Normalizar os dados antes da clusterização.
* Normalizar os dados transformados depois da clusterização.


# Preparações

In [2]:
import pandas as pd;
import numpy as np; 
import sklearn.model_selection;
import sklearn.metrics;
from sklearn.tree import DecisionTreeClassifier; # Tree
import statsmodels.api as sm; # GLM


In [3]:
# Load the raw table and report its size.
data = pd.read_csv("data/car_insurance_claim.csv")
print(f"{data.shape[0]} linhas e {data.shape[1]} colunas.")

# Columns are selected by position: 1..24 are the candidate features and
# column 25 is the target.
independent_variables = list(range(1,25))
dependent_variables = 25

# Keep only the numeric feature columns and replace missing values with the
# column mean. The filled frame is reassigned rather than modified with
# `inplace=True`: the frame is derived from a chain of selections, and
# mutating such a frame in place can raise SettingWithCopyWarning.
# NOTE(review): `_get_numeric_data` is a private pandas method; confirm
# whether a public selector would pick the same columns before swapping it.
numerical_x_train = data.iloc[ : , independent_variables]._get_numeric_data()
numerical_x_train = numerical_x_train.fillna(numerical_x_train.mean())

y_train = data.iloc[ : , dependent_variables]

print(f"X: {numerical_x_train.shape[0]} linhas e {numerical_x_train.shape[1]} colunas.")
print(f"Y: {y_train.shape} linhas. Valores Únicos = {y_train.unique()}.")

10302 linhas e 27 colunas.
X: 10302 linhas e 9 colunas.
Y: (10302,) linhas. Valores Únicos = [0 1].


In [4]:
def score_results(y_real, y_predito, label):
    """Print a confusion-matrix based report for a binary classifier.

    Parameters
    ----------
    y_real : array-like
        Ground-truth labels.
    y_predito : array-like
        Predicted labels, aligned with ``y_real``.
    label : object
        Title interpolated into the report header.

    Returns
    -------
    None
        The report is written to standard output.

    Raises
    ------
    ValueError
        If the confusion matrix does not have exactly four cells, because
        it is unpacked into ``tn, fp, fn, tp``.
    """
    matriz_de_confusao = sklearn.metrics.confusion_matrix(y_true = y_real, y_pred = y_predito)
    tn, fp, fn, tp = matriz_de_confusao.ravel()

    def percentual(numerador, denominador):
        # The ratio is undefined when the denominator is zero (for example,
        # precision when the model never predicts the positive class), so
        # report "N/A" instead of dividing by zero.
        if denominador == 0:
            return "N/A"
        return f"{100*numerador/denominador :.2f}%"

    acuracia_balanceada = sklearn.metrics.balanced_accuracy_score(y_true = y_real, y_pred = y_predito)

    print(f"--- {label} ---")
    print("Matrix de Confusão")
    print(matriz_de_confusao)
    # Two spaces after the colon keep the layout of the original report.
    print(f"Balanced Accuracy:  {100*acuracia_balanceada:.2f}%")
    print(f"Falsos Positivos: {fp}, Falsos Negativos: {fn}\n"+
            f"Verdadeiros Positivos: {tp}, Verdadeiros Negativos: {tn}")
    print(f"Precisao (tp/(tp+fp)): {percentual(tp, tp+fp)}")
    print(f"Recall (tp/(tp+fn)): {percentual(tp, tp+fn)}")

    print("-"*50)


In [5]:
# Seed passed as `random_state` to the RBF classifier further down.
seed = 42

# Reload the insurance-claims table and report its dimensions.
data = pd.read_csv("data/car_insurance_claim.csv")
forma = data.shape  # (rows, columns)
print(f"{forma[0]} linhas e {forma[1]} colunas")

# Optional exploration helpers, left disabled:
# print(pd.DataFrame(data.columns))
# print(data.describe())
# print(data.describe().shape[1]) # number of numeric columns

10302 linhas e 27 colunas


# RBF - Iris

In [6]:
from sklearn import datasets

# Load the Iris dataset and split it into the feature matrix and the labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [7]:
# Reuse the notebook-wide `seed` (42) instead of repeating the literal, so
# both RBF models share a single source of randomness configuration.
rbf = RBFClassifier(number_of_centers=6, random_state=seed)

In [8]:
# Fit on the full Iris data, then display the predictions for those same
# samples (there is no train/test split here).
rbf.fit(X,y)
rbf.predict(X)



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0,
       2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2,
       2, 0, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2,
       2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
# Confusion matrix of the Iris predictions against the true labels.
sklearn.metrics.confusion_matrix(
    y_true=y,
    y_pred=rbf.predict(X),
)

array([[38,  0, 12],
       [ 0, 26, 24],
       [ 0,  5, 45]], dtype=int64)

In [10]:
# Compute the transformed inputs once and reuse them for both the shape
# report and the table, instead of running the transform twice.
# NOTE(review): assumes `_transformed_inputs` has no side effects and returns
# the same values on repeated calls — confirm against RBFClassifier.
iris_transformed = rbf._transformed_inputs(X)
print("SHAPE: ", iris_transformed.shape, "\nVALORES: \n")
pd.DataFrame(iris_transformed)

SHAPE:  (150, 6) 
VALORES: 



Unnamed: 0,0,1,2,3,4,5
0,5.093012e-199,5.449745e-02,6.451152e-28,7.825245e-104,3.463640e-211,1.770657e+00
1,3.357470e-204,2.140315e+00,1.989657e-26,1.904048e-104,4.739463e-215,1.826954e-04
2,1.767789e-216,3.587837e+00,9.620697e-30,6.480938e-114,3.224404e-229,2.609394e-04
3,4.293419e-208,3.587837e+00,1.322720e-26,3.031612e-106,2.175278e-217,1.015419e-05
4,1.963044e-202,4.266805e-02,7.431763e-29,1.884761e-106,8.191883e-215,1.355269e+00
...,...,...,...,...,...,...
145,2.136455e-11,9.344493e-148,3.564441e-15,3.740128e-07,3.023605e-01,2.395721e-157
146,5.975278e-21,2.120152e-121,4.732290e-08,3.294963e-02,2.929409e-06,2.904010e-134
147,3.914926e-13,3.112494e-135,7.935221e-12,1.457011e-03,4.700911e-01,1.804970e-144
148,1.685973e-15,6.440729e-147,1.721996e-15,5.807888e-09,2.993938e-02,7.160280e-157


# RBF - My Dataset

In [11]:
# Train an RBF classifier with 150 centers on the numeric insurance features.
rbf_classifier = RBFClassifier(number_of_centers=150, random_state=seed)
rbf_classifier.fit(numerical_x_train, y_train)

# Predict on the training rows themselves and display the resulting labels.
rbf_pred = rbf_classifier.predict(numerical_x_train)
rbf_pred



array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [12]:
# Sum of the predicted labels; with 0/1 labels this counts the rows
# predicted as class 1.
rbf_pred.sum()

0

In [13]:
# Compute the transformed inputs once and reuse them for both the shape
# report and the table, instead of running the transform twice.
# NOTE(review): assumes `_transformed_inputs` has no side effects and returns
# the same values on repeated calls — confirm against RBFClassifier.
claims_transformed = rbf_classifier._transformed_inputs(numerical_x_train)
print("SHAPE: ", claims_transformed.shape, "\nVALORES: \n")
pd.DataFrame(claims_transformed)

SHAPE:  (10302, 150) 
VALORES: 



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,1.164294e-84,2.024276e-84,2.978218e-23,2.009022e-153,5.486468e-38,4.640195e-148,5.687664e-211,2.855684e-197,5.634469e-32,1.558051e-61,...,2.440383e-149,5.769053e-221,7.015162e-186,4.248375e-25,5.708160e-93,1.359735e-57,3.252024e-07,5.685186e-188,2.096845e-64,1.435947e-35
1,6.229378e-52,1.628398e-41,1.411969e-31,7.405158e-112,2.808280e-48,1.240984e-12,7.790074e-60,2.189214e-98,4.209255e-27,3.096738e-08,...,2.685677e-85,3.765407e-125,1.012178e-144,3.330682e-36,1.578489e-20,9.992225e-38,7.858111e-37,4.498289e-80,3.220940e-25,2.043551e-18
2,2.709445e-69,7.106126e-20,8.500089e-25,8.614067e-84,1.763242e-17,4.263457e-28,9.955566e-52,1.073083e-87,8.743227e-09,1.994634e-15,...,1.056852e-42,2.811478e-83,3.347710e-83,6.002600e-16,5.117834e-20,1.223784e-12,5.949466e-23,7.100911e-65,1.139865e-26,6.552992e-15
3,3.682381e-15,1.707875e-95,1.318292e-40,1.144342e-220,7.901825e-114,6.908921e-95,1.864997e-231,6.170950e-185,6.885264e-46,4.415424e-62,...,1.971830e-246,4.350742e-320,2.737124e-321,1.149822e-91,8.381642e-71,3.565022e-73,4.088502e-46,4.902712e-236,6.283879e-38,5.314697e-22
4,6.107885e-126,1.742153e-23,4.654162e-50,3.359233e-55,7.458454e-14,8.869500e-33,1.283746e-28,5.062690e-73,4.967117e-28,1.777701e-16,...,6.880384e-24,1.273031e-48,8.267847e-56,8.502646e-26,5.682852e-26,1.942189e-26,4.162079e-34,1.374931e-36,1.384001e-28,1.758893e-38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10297,3.362949e-76,5.292313e-36,4.649527e-36,3.457450e-107,5.037052e-40,4.155508e-72,2.076625e-127,4.535642e-114,9.947774e-18,6.027549e-36,...,1.694680e-104,1.036029e-153,9.656611e-144,3.726142e-46,7.304925e-28,3.230205e-25,8.144031e-18,1.923543e-127,7.099408e-20,5.033876e-24
10298,3.133722e-160,9.711991e-26,5.992850e-79,4.386728e-47,4.981836e-33,1.314217e-19,3.912076e-10,2.964485e-51,2.853720e-48,7.958138e-21,...,4.546594e-26,1.072961e-38,5.003678e-62,3.098888e-56,1.569083e-32,6.453888e-39,8.490266e-62,8.370524e-19,8.066651e-26,4.563405e-57
10299,3.651692e-26,3.477776e-81,1.690379e-17,2.694632e-162,2.042629e-64,2.033597e-59,9.933454e-141,9.934824e-164,9.206673e-37,5.894979e-18,...,2.949193e-155,2.213899e-214,3.119193e-223,1.060564e-33,9.568110e-45,4.839137e-66,6.462892e-21,1.922728e-150,1.418329e-40,5.281190e-18
10300,3.036928e-160,1.153261e-14,1.210904e-66,2.472318e-41,2.754896e-13,8.336627e-46,7.291387e-32,2.264343e-59,1.834562e-30,8.678093e-33,...,2.074524e-12,2.538809e-33,1.860350e-34,1.009080e-40,1.109654e-38,2.389050e-19,6.973661e-46,1.427303e-31,9.836499e-30,2.994202e-50


In [14]:
# Display the raw array of transformed inputs for the training features.
# NOTE(review): this calls a private method (leading underscore) of
# RBFClassifier and recomputes the transform from scratch.
rbf_classifier._transformed_inputs(numerical_x_train)

array([[1.16429448e-084, 2.02427625e-084, 2.97821830e-023, ...,
        5.68518584e-188, 2.09684481e-064, 1.43594727e-035],
       [6.22937824e-052, 1.62839774e-041, 1.41196868e-031, ...,
        4.49828887e-080, 3.22094013e-025, 2.04355061e-018],
       [2.70944541e-069, 7.10612572e-020, 8.50008923e-025, ...,
        7.10091150e-065, 1.13986531e-026, 6.55299203e-015],
       ...,
       [3.65169158e-026, 3.47777568e-081, 1.69037932e-017, ...,
        1.92272807e-150, 1.41832912e-040, 5.28119039e-018],
       [3.03692824e-160, 1.15326115e-014, 1.21090378e-066, ...,
        1.42730297e-031, 9.83649885e-030, 2.99420226e-050],
       [0.00000000e+000, 1.43232895e-088, 3.63328808e-301, ...,
        7.86763820e-043, 2.86622379e-098, 2.08102125e-236]])