#### Importing and preprocessing the data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pyfume.Clustering import Clusterer
from pyfume.EstimateAntecendentSet import AntecedentEstimator
from pyfume.EstimateConsequentParameters import ConsequentEstimator
from pyfume.SimpfulModelBuilder import SugenoFISBuilder
from pyfume.Tester import SugenoFISTester
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score
from numpy import clip, column_stack, argmax

data = pd.read_csv('wbco.csv', header=None)
indexes = ['LThick', 'UCellSize', 'UCellShape', 'MAdhesion', 'SECS', 'BlandC', 'NormNuc', 'Mitoses', 'Unknown', 'Target']
data.columns = indexes

indexes = indexes[:-1]

In [2]:
data.head()

Unnamed: 0,LThick,UCellSize,UCellShape,MAdhesion,SECS,BlandC,NormNuc,Mitoses,Unknown,Target
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0


In [3]:
data.describe()

Unnamed: 0,LThick,UCellSize,UCellShape,MAdhesion,SECS,NormNuc,Mitoses,Unknown,Target
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,0.344778
std,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.475636
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0
50%,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,0.0
75%,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,1.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


In [4]:
data.isnull().sum()

LThick        0
UCellSize     0
UCellShape    0
MAdhesion     0
SECS          0
BlandC        0
NormNuc       0
Mitoses       0
Unknown       0
Target        0
dtype: int64

In [5]:
data.dtypes == int

LThick        False
UCellSize     False
UCellShape    False
MAdhesion     False
SECS          False
BlandC        False
NormNuc       False
Mitoses       False
Unknown       False
Target        False
dtype: bool

A variável 'BlandC' está atualmente definida como string pois possui algumas entradas com '?'. Como resolver este problema? O número de entradas sem valor é significativo?

In [6]:
na_perc = (data.BlandC[data.BlandC == '?']).count() / data.BlandC.count() * 100
print(str(na_perc) + '% percent of entries are null.')

2.28898426323319% percent of entries are null.


Selected aproach: fill missing values with the column's average.

In [7]:
avg = data.BlandC[data.BlandC != '?'].astype(float).mean()
data.loc[data.BlandC == '?', 'BlandC'] = avg
data.BlandC = data.BlandC.astype(int)

Creating a train-test split

In [8]:
data_x = data.drop('Target', axis = 1)
data_y = data.Target

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.2, random_state=21)

Formating the data to numpy

In [9]:
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
y_train = y_train.array
y_test = y_test.array

#### Building the model

In [10]:
cl = Clusterer(x_train=x_train, y_train=y_train, nr_clus=4)
clust_centers, part_matrix, _ = cl.cluster(method='fcm')

ae = AntecedentEstimator(x_train, part_matrix)
antecedent_params = ae.determineMF()

ce = ConsequentEstimator(x_train, y_train, part_matrix)
conseq_params = ce.suglms()

modbuilder = SugenoFISBuilder(antecedent_params, conseq_params, indexes, save_simpful_code=False)
model = modbuilder.get_model()

modtester = SugenoFISTester(model, x_test, indexes)
y_pred_probs = clip(modtester.predict()[0], 0, 1)
y_pred_probs = column_stack((1 - y_pred_probs, y_pred_probs))
y_pred = argmax(y_pred_probs,axis=1)

acc_score = accuracy_score(y_test, y_pred)
print("Accuracy: {:.3f}".format(acc_score))
rec_score = recall_score(y_test, y_pred)
print("Recall: {:.3f}".format(rec_score))
prec_score = precision_score(y_test, y_pred)
print("Precision Score: {:.3f}".format(prec_score))
F1_score = f1_score(y_test, y_pred)
print("F1-Score: {:.3f}".format(F1_score))
kappa = cohen_kappa_score(y_test, y_pred)
print("Kappa Score: {:.3f}".format(kappa))


 * Detected 4 rules / clusters
 * Detected Sugeno model type
Accuracy: 0.957
Recall: 0.925
Precision Score: 0.961
F1-Score: 0.942
Kappa Score: 0.908


In [11]:
best_metrics = {
    'nr_clus': None,
    'accuracy': {'value': 0, 'nr_clus': None},
    'recall': {'value': 0, 'nr_clus': None},
    'precision': {'value': 0, 'nr_clus': None},
    'f1': {'value': 0, 'nr_clus': None},
    'kappa': {'value': 0, 'nr_clus': None},
}

for nr_clus in range(2, 20):
    print(f"Clustering with {nr_clus} clusters")

    cl = Clusterer(x_train=x_train, y_train=y_train, nr_clus=nr_clus)
    clust_centers, part_matrix, _ = cl.cluster(method='fcm')

    ae = AntecedentEstimator(x_train, part_matrix)
    antecedent_params = ae.determineMF()

    ce = ConsequentEstimator(x_train, y_train, part_matrix)
    conseq_params = ce.suglms()

    modbuilder = SugenoFISBuilder(antecedent_params, conseq_params, indexes, save_simpful_code=False)
    model = modbuilder.get_model()

    modtester = SugenoFISTester(model, x_test, indexes)
    y_pred_probs = clip(modtester.predict()[0], 0, 1)
    y_pred_probs = column_stack((1 - y_pred_probs, y_pred_probs))
    y_pred = argmax(y_pred_probs, axis=1)

    acc_score = accuracy_score(y_test, y_pred)
    rec_score = recall_score(y_test, y_pred)
    prec_score = precision_score(y_test, y_pred)
    F1_score = f1_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)

    if acc_score > best_metrics['accuracy']['value']:
        best_metrics['accuracy']['value'] = acc_score
        best_metrics['accuracy']['nr_clus'] = nr_clus
    
    if rec_score > best_metrics['recall']['value']:
        best_metrics['recall']['value'] = rec_score
        best_metrics['recall']['nr_clus'] = nr_clus

    if prec_score > best_metrics['precision']['value']:
        best_metrics['precision']['value'] = prec_score
        best_metrics['precision']['nr_clus'] = nr_clus

    if F1_score > best_metrics['f1']['value']:
        best_metrics['f1']['value'] = F1_score
        best_metrics['f1']['nr_clus'] = nr_clus

    if kappa > best_metrics['kappa']['value']:
        best_metrics['kappa']['value'] = kappa
        best_metrics['kappa']['nr_clus'] = nr_clus

print("Best Metrics:")
print("Accuracy: {:.3f} (Number of Clusters: {})".format(best_metrics['accuracy']['value'], best_metrics['accuracy']['nr_clus']))
print("Recall: {:.3f} (Number of Clusters: {})".format(best_metrics['recall']['value'], best_metrics['recall']['nr_clus']))
print("Precision: {:.3f} (Number of Clusters: {})".format(best_metrics['precision']['value'], best_metrics['precision']['nr_clus']))
print("F1 Score: {:.3f} (Number of Clusters: {})".format(best_metrics['f1']['value'], best_metrics['f1']['nr_clus']))
print("Kappa Score: {:.3f} (Number of Clusters: {})".format(best_metrics['kappa']['value'], best_metrics['kappa']['nr_clus']))


Clustering with 2 clusters
 * Detected 2 rules / clusters
 * Detected Sugeno model type
Clustering with 3 clusters
 * Detected 3 rules / clusters
 * Detected Sugeno model type
Clustering with 4 clusters
 * Detected 4 rules / clusters
 * Detected Sugeno model type
Clustering with 5 clusters
 * Detected 5 rules / clusters
 * Detected Sugeno model type
Clustering with 6 clusters
 * Detected 6 rules / clusters
 * Detected Sugeno model type
Clustering with 7 clusters
 * Detected 7 rules / clusters
 * Detected Sugeno model type
Clustering with 8 clusters
 * Detected 8 rules / clusters
 * Detected Sugeno model type
Clustering with 9 clusters
 * Detected 9 rules / clusters
 * Detected Sugeno model type
Clustering with 10 clusters
 * Detected 10 rules / clusters
 * Detected Sugeno model type
Clustering with 11 clusters
 * Detected 11 rules / clusters
 * Detected Sugeno model type
Clustering with 12 clusters
 * Detected 12 rules / clusters
 * Detected Sugeno model type
Clustering with 13 cluster

In [12]:
best_metrics = {
    'nr_clus': None,
    'accuracy': {'value': 0, 'nr_clus': None},
    'recall': {'value': 0, 'nr_clus': None},
    'precision': {'value': 0, 'nr_clus': None},
    'f1': {'value': 0, 'nr_clus': None},
    'kappa': {'value': 0, 'nr_clus': None},
}

for nr_clus in range(2, 20):
    print(f"Clustering with {nr_clus} clusters")

    cl = Clusterer(x_train=x_train, y_train=y_train, nr_clus=nr_clus)
    clust_centers, part_matrix, _ = cl.cluster(method='gk')

    ae = AntecedentEstimator(x_train, part_matrix)
    antecedent_params = ae.determineMF()

    ce = ConsequentEstimator(x_train, y_train, part_matrix)
    conseq_params = ce.suglms()

    modbuilder = SugenoFISBuilder(antecedent_params, conseq_params, indexes, save_simpful_code=False)
    model = modbuilder.get_model()

    modtester = SugenoFISTester(model, x_test, indexes)
    y_pred_probs = clip(modtester.predict()[0], 0, 1)
    y_pred_probs = column_stack((1 - y_pred_probs, y_pred_probs))
    y_pred = argmax(y_pred_probs, axis=1)

    acc_score = accuracy_score(y_test, y_pred)
    rec_score = recall_score(y_test, y_pred)
    prec_score = precision_score(y_test, y_pred)
    F1_score = f1_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)

    if acc_score > best_metrics['accuracy']['value']:
        best_metrics['accuracy']['value'] = acc_score
        best_metrics['accuracy']['nr_clus'] = nr_clus
    
    if rec_score > best_metrics['recall']['value']:
        best_metrics['recall']['value'] = rec_score
        best_metrics['recall']['nr_clus'] = nr_clus

    if prec_score > best_metrics['precision']['value']:
        best_metrics['precision']['value'] = prec_score
        best_metrics['precision']['nr_clus'] = nr_clus

    if F1_score > best_metrics['f1']['value']:
        best_metrics['f1']['value'] = F1_score
        best_metrics['f1']['nr_clus'] = nr_clus

    if kappa > best_metrics['kappa']['value']:
        best_metrics['kappa']['value'] = kappa
        best_metrics['kappa']['nr_clus'] = nr_clus

print("Best Metrics:")
print("Accuracy: {:.3f} (Number of Clusters: {})".format(best_metrics['accuracy']['value'], best_metrics['accuracy']['nr_clus']))
print("Recall: {:.3f} (Number of Clusters: {})".format(best_metrics['recall']['value'], best_metrics['recall']['nr_clus']))
print("Precision: {:.3f} (Number of Clusters: {})".format(best_metrics['precision']['value'], best_metrics['precision']['nr_clus']))
print("F1 Score: {:.3f} (Number of Clusters: {})".format(best_metrics['f1']['value'], best_metrics['f1']['nr_clus']))
print("Kappa Score: {:.3f} (Number of Clusters: {})".format(best_metrics['kappa']['value'], best_metrics['kappa']['nr_clus']))


Clustering with 2 clusters
 * Detected 2 rules / clusters
 * Detected Sugeno model type
Clustering with 3 clusters
 * Detected 3 rules / clusters
 * Detected Sugeno model type
Clustering with 4 clusters
 * Detected 4 rules / clusters
 * Detected Sugeno model type


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Clustering with 5 clusters
 * Detected 5 rules / clusters
 * Detected Sugeno model type
Clustering with 6 clusters
 * Detected 6 rules / clusters
 * Detected Sugeno model type
Clustering with 7 clusters
 * Detected 7 rules / clusters
 * Detected Sugeno model type
Clustering with 8 clusters
 * Detected 8 rules / clusters
 * Detected Sugeno model type
Clustering with 9 clusters
 * Detected 9 rules / clusters
 * Detected Sugeno model type
Clustering with 10 clusters
 * Detected 10 rules / clusters
 * Detected Sugeno model type
Clustering with 11 clusters
 * Detected 11 rules / clusters
 * Detected Sugeno model type
Clustering with 12 clusters
 * Detected 12 rules / clusters
 * Detected Sugeno model type
Clustering with 13 clusters
 * Detected 13 rules / clusters
 * Detected Sugeno model type
Clustering with 14 clusters
 * Detected 14 rules / clusters
 * Detected Sugeno model type
Clustering with 15 clusters
 * Detected 15 rules / clusters
 * Detected Sugeno model type
Clustering with 16 c