#### Importing and preprocessing the data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pyfume.Clustering import Clusterer
from pyfume.EstimateAntecendentSet import AntecedentEstimator
from pyfume.EstimateConsequentParameters import ConsequentEstimator
from pyfume.SimpfulModelBuilder import SugenoFISBuilder
from pyfume.Tester import SugenoFISTester
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score
from numpy import clip, column_stack, argmax
from sklearn.neural_network import MLPClassifier

data = pd.read_csv('wbco.csv', header=None)
indexes = ['LThick', 'UCellSize', 'UCellShape', 'MAdhesion', 'SECS', 'BlandC', 'NormNuc', 'Mitoses', 'Unknown', 'Target']
data.columns = indexes

indexes = indexes[:-1]

In [4]:
data.head()

Unnamed: 0,LThick,UCellSize,UCellShape,MAdhesion,SECS,BlandC,NormNuc,Mitoses,Unknown,Target
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0


In [5]:
data.describe()

Unnamed: 0,LThick,UCellSize,UCellShape,MAdhesion,SECS,NormNuc,Mitoses,Unknown,Target
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,0.344778
std,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.475636
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0
50%,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,0.0
75%,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,1.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


In [6]:
data.isnull().sum()

LThick        0
UCellSize     0
UCellShape    0
MAdhesion     0
SECS          0
BlandC        0
NormNuc       0
Mitoses       0
Unknown       0
Target        0
dtype: int64

In [7]:
data.dtypes == int

LThick        False
UCellSize     False
UCellShape    False
MAdhesion     False
SECS          False
BlandC        False
NormNuc       False
Mitoses       False
Unknown       False
Target        False
dtype: bool

A variável 'BlandC' está atualmente definida como string pois possui algumas entradas com '?'. Como resolver este problema? O número de entradas sem valor é significativo?

In [8]:
na_perc = (data.BlandC[data.BlandC == '?']).count() / data.BlandC.count() * 100
print(str(na_perc) + '% percent of entries are null.')

2.28898426323319% percent of entries are null.


Selected aproach: fill missing values with the column's average.

In [9]:
avg = data.BlandC[data.BlandC != '?'].astype(float).mean()
data.loc[data.BlandC == '?', 'BlandC'] = avg
data.BlandC = data.BlandC.astype(int)

Creating a train-test split

In [10]:
data_x = data.drop('Target', axis = 1)
data_y = data.Target

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.2, random_state=21)

Formating the data to numpy

In [11]:
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
y_train = y_train.array
y_test = y_test.array

#### Building the model

Neural network model

In [12]:
regr = MLPClassifier(hidden_layer_sizes=(31,31,31),random_state=42, max_iter=500)
regr.fit(x_train, y_train)

# %% Get model predictions
y_pred = regr.predict(x_test)

# %% Compute classification metrics
acc_score = accuracy_score(y_test, y_pred)
print("Accuracy: {:.3f}".format(acc_score))
rec_score = recall_score(y_test, y_pred)
print("Recall: {:.3f}".format(rec_score))
prec_score = precision_score(y_test, y_pred)
print("Precision Score: {:.3f}".format(prec_score))
F1_score = f1_score(y_test, y_pred)
print("F1-Score: {:.3f}".format(F1_score))
kappa = cohen_kappa_score(y_test, y_pred)
print("Kappa Score: {:.3f}".format(kappa))


Accuracy: 0.950
Recall: 0.925
Precision Score: 0.942
F1-Score: 0.933
Kappa Score: 0.893
