#### Importing and preprocessing the data

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pyfume.Clustering import Clusterer
from pyfume.EstimateAntecendentSet import AntecedentEstimator
from pyfume.EstimateConsequentParameters import ConsequentEstimator
from pyfume.SimpfulModelBuilder import SugenoFISBuilder
from pyfume.Tester import SugenoFISTester
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score
from numpy import clip, column_stack, argmax

# Column labels applied to the raw file; the last one is the class label.
column_names = [
    'LThick', 'UCellSize', 'UCellShape', 'MAdhesion', 'SECS',
    'BlandC', 'NormNuc', 'Mitoses', 'Unknown', 'Target',
]

# NOTE(review): read_csv treats the first line of the file as a header by
# default. If 'wbco.csv' has no header row, the first sample is being consumed
# as column names before they are overwritten below -- confirm.
data = pd.read_csv('wbco.csv')
data.columns = column_names

# Feature names only (everything except the label); passed to the model
# builder and tester as the input variable names.
indexes = [name for name in column_names if name != 'Target']

In [25]:
# Preview the first five rows to sanity-check the column labels.
data.head(n=5)

Unnamed: 0,LThick,UCellSize,UCellShape,MAdhesion,SECS,BlandC,NormNuc,Mitoses,Unknown,Target
0,5,4,4,5,7,10,3,2,1,0
1,3,1,1,1,2,2,3,1,1,0
2,6,8,8,1,3,4,3,7,1,0
3,4,1,1,3,2,1,3,1,1,0
4,8,10,10,8,7,10,9,7,1,1


In [26]:
# Summary statistics for the numeric columns (quartiles spelled out explicitly).
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,LThick,UCellSize,UCellShape,MAdhesion,SECS,NormNuc,Mitoses,Unknown,Target
count,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0
mean,4.416905,3.137536,3.210602,2.809456,3.217765,3.438395,2.869628,1.590258,0.345272
std,2.817673,3.052575,2.972867,2.856606,2.215408,2.440056,3.055004,1.716162,0.475798
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0
50%,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,0.0
75%,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,1.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


In [27]:
# Number of NaN/None entries per column.
data.isna().sum()

LThick        0
UCellSize     0
UCellShape    0
MAdhesion     0
SECS          0
BlandC        0
NormNuc       0
Mitoses       0
Unknown       0
Target        0
dtype: int64

In [28]:
# Flag which columns hold integer data. Comparing a dtype against the builtin
# `int` depends on what NumPy maps `int` to on the current platform (it can be
# a 32-bit integer), so an int64 column may compare unequal; the pandas dtype
# predicate checks "is this any integer dtype" regardless of width.
data.dtypes.map(pd.api.types.is_integer_dtype)

LThick         True
UCellSize      True
UCellShape     True
MAdhesion      True
SECS           True
BlandC        False
NormNuc        True
Mitoses        True
Unknown        True
Target         True
dtype: bool

The 'BlandC' variable is currently stored as strings because some of its entries are '?'. How should this be resolved? Is the number of missing entries significant?

In [29]:
# Percentage of 'BlandC' entries that are the '?' placeholder.
missing_count = data.BlandC.eq('?').sum()
na_perc = missing_count / data.BlandC.count() * 100
print(na_perc, '% percent of entries are null.', sep='')

2.292263610315186% percent of entries are null.


Selected approach: fill missing values with the column's average.

In [30]:
# Replace the '?' placeholders in 'BlandC' with the column average and store
# the column as integers.
#
# The placeholders are first turned into NaN so the remaining entries can be
# parsed as numbers; this also keeps the cell safe to re-run after the column
# has already been converted.
blandc_numeric = data.BlandC.mask(data.BlandC == '?').astype(float)
avg = blandc_numeric.mean()

# The column is cast to int afterwards, so round the average to the nearest
# integer first; a bare astype(int) would truncate it toward zero instead.
# NOTE(review): the average is computed over the full dataset, before the
# train/test split performed later, so test rows influence the imputed value.
data.BlandC = blandc_numeric.fillna(round(avg)).astype(int)

Creating a train-test split

In [31]:
# Separate the label column from the predictors.
data_y = data['Target']
data_x = data.drop(columns='Target')

# Hold out 20% of the rows for testing; random_state is fixed so the same
# partition is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=21,
)

Formatting the data as NumPy arrays

In [32]:
# Convert the pandas objects from the split into plain NumPy arrays.
# `Series.array` returns a pandas ExtensionArray wrapper rather than an
# ndarray, so the labels use `to_numpy()` as well, matching the features.
# Note: the names are rebound in place, so this cell must be preceded by the
# split cell on every run (an ndarray has no `to_numpy` method).
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

#### Building the model

In [33]:
# --- Clustering -------------------------------------------------------------
# Cluster the training data into 4 groups with the 'fcm' method; only the
# centers and the partition matrix are kept, the third return value is unused.
# NOTE(review): no random seed is set in this cell; if the clustering is
# randomly initialised, the metrics below may vary between runs -- confirm.
cl = Clusterer(nr_clus=4, x_train=x_train, y_train=y_train)
clust_centers, part_matrix, _ = cl.cluster(method='fcm')

# --- Rule antecedents and consequents ---------------------------------------
# Both estimators work from the training data and the partition matrix.
ae = AntecedentEstimator(x_train, part_matrix)
antecedent_params = ae.determineMF()

ce = ConsequentEstimator(x_train, y_train, part_matrix)
conseq_params = ce.suglms()

# --- Assemble the Sugeno model ----------------------------------------------
# `indexes` supplies the input variable names.
modbuilder = SugenoFISBuilder(
    antecedent_params,
    conseq_params,
    indexes,
    save_simpful_code=False,
)
model = modbuilder.get_model()

# --- Predict on the held-out set --------------------------------------------
modtester = SugenoFISTester(model, x_test, indexes)

# The first element returned by predict() is taken as the raw model output and
# clipped into [0, 1] so it can be read as the probability of class 1.
raw_output = modtester.predict()[0]
positive_prob = clip(raw_output, 0, 1)

# Two-column layout [P(class 0), P(class 1)]; argmax picks the larger column
# (ties resolve to class 0).
y_pred_probs = column_stack((1 - positive_prob, positive_prob))
y_pred = argmax(y_pred_probs, axis=1)

# --- Evaluate -----------------------------------------------------------------
acc_score = accuracy_score(y_test, y_pred)
rec_score = recall_score(y_test, y_pred)
prec_score = precision_score(y_test, y_pred)
F1_score = f1_score(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)

# Report each metric with three decimals, in the same order as computed.
metric_report = (
    ("Accuracy: {:.3f}", acc_score),
    ("Recall: {:.3f}", rec_score),
    ("Precision Score: {:.3f}", prec_score),
    ("F1-Score: {:.3f}", F1_score),
    ("Kappa Score: {:.3f}", kappa),
)
for template, value in metric_report:
    print(template.format(value))


 * Detected 4 rules / clusters
 * Detected Sugeno model type
Accuracy: 0.971
Recall: 0.976
Precision Score: 0.930
F1-Score: 0.952
Kappa Score: 0.932
