In [1]:
from sklearn.datasets import load_iris, load_digits, load_breast_cancer

# Load the digits dataset once and take the feature matrix and label vector
# from the returned Bunch, instead of invoking the loader a second time with
# return_X_y=True.
data = load_digits()
X, Y = data.data, data.target

print('Target names:', data.target_names)

# NumPy abbreviates large arrays when printing, so only the corners are shown.
print('X', X)
print('Y', Y)

Target names: [0 1 2 3 4 5 6 7 8 9]
X [[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
Y [0 1 2 ... 8 9 8]


## Training and test set

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the seed is fixed at 42.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.30, random_state=42
)

# Size of the full data set.
print('X', X.shape)
print('Y', len(Y))

# Size of each partition produced by the split.
for label, size in (
    ('Xtrain', Xtrain.shape),
    ('ytrain', len(ytrain)),
    ('Xtest', Xtest.shape),
    ('ytest', len(ytest)),
):
    print(label, size)

X (1797, 64)
Y 1797
Xtrain (1257, 64)
ytrain 1257
Xtest (540, 64)
ytest 540


## Model

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import accuracy_score
import numpy as np

# Multi-layer perceptron with library defaults; only the seed is set.
# fit() returns the estimator itself, so construction and training are chained.
model = MLPClassifier(random_state=42).fit(Xtrain, ytrain)

# Predict labels for the held-out split and score them against the truth.
ypred = model.predict(Xtest)
print('accuracy_score', accuracy_score(ytest, ypred))

# The same result expressed as raw counts: samples evaluated and exact matches.
n_samples = len(ytest)
n_correct = np.sum(ytest == ypred)
print('Número de muestras:', n_samples)
print('Número de predicciones correctas:', n_correct)

accuracy_score 0.9666666666666667
Número de muestras: 540
Número de predicciones correctas: 522


In [19]:
from sklearn.metrics import confusion_matrix
import pandas
# Confusion matrix for (ytest, ypred), cast to float so the counts can be
# turned into fractions. scikit-learn puts true labels on the rows and
# predicted labels on the columns.
m = confusion_matrix(ytest, ypred).astype(float)

# Row sums: how many test samples truly belong to each class.
total = np.sum(m, axis=1)
print('total:', total)

# Divide every row by its own total in a single broadcast operation (replaces
# the manual per-row loop), then round to two decimals for display.
m = np.round(m / total[:, np.newaxis], 2)

# Label rows and columns with the class names and show the table.
df = pandas.DataFrame(m, columns=data.target_names, index=data.target_names)
df

total: [53. 50. 47. 54. 60. 66. 53. 55. 43. 59.]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.98,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.02,0.93,0.0,0.02,0.0,0.0,0.04,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.92,0.02,0.02,0.0,0.05
6,0.02,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.02
8,0.0,0.02,0.02,0.0,0.0,0.02,0.0,0.0,0.93,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.98


## Breast cancer problem

In [26]:
from sklearn.datasets import load_breast_cancer
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np
import pandas

In [25]:
# Load the breast-cancer dataset once; features and labels are read from the
# returned Bunch rather than calling the loader a second time.
data = load_breast_cancer()
X, Y = data.data, data.target

print(data.feature_names)
# NOTE(review): target_names are presumably indexed by label value, i.e. the
# first name belongs to label 0 and the second to label 1 — confirm.
print(data.target_names)
print('X', X.shape)
print('Y', len(Y))

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
['malignant' 'benign']
X (569, 30)
Y 569


In [27]:
# Hold out 30% of the samples for testing; the seed is fixed at 42.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [28]:
# Train a default multi-layer perceptron (seed fixed) on the training split.
# NOTE(review): the features are passed in unscaled; consider whether
# standardising them first is wanted for this model.
model = MLPClassifier(random_state=42)
model.fit(Xtrain, ytrain)

# Score the predictions on the held-out split.
ypred = model.predict(Xtest)
test_accuracy = accuracy_score(ytest, ypred)
print('accuracy_score', test_accuracy)

accuracy_score 0.9590643274853801




In [29]:
# Confusion matrix for (ytest, ypred) as floats; rows are true labels and
# columns are predicted labels (scikit-learn convention).
m = confusion_matrix(ytest, ypred).astype(float)

# Row sums: number of test samples that truly belong to each class.
total = np.sum(m, axis=1)

# Convert counts to per-row fractions with one broadcast division (replaces
# the manual per-row loop), rounded to two decimals for display.
m = np.round(m / total[:, np.newaxis], 2)

df = pandas.DataFrame(m, columns=data.target_names, index=data.target_names)
df

Unnamed: 0,malignant,benign
malignant,0.94,0.06
benign,0.03,0.97


In [33]:
# Raw confusion counts for the binary problem, with label 1 as the "positive"
# class and label 0 as the "negative" class.
# NOTE(review): data.target_names was printed as ['malignant' 'benign'], which
# suggests label 1 is 'benign' — confirm that this is the intended positive class.
actual_pos, actual_neg = (ytest == 1), (ytest == 0)
pred_pos, pred_neg = (ypred == 1), (ypred == 0)

tp = np.sum(actual_pos & pred_pos)  # true positives
fp = np.sum(actual_neg & pred_pos)  # false positives
tn = np.sum(actual_neg & pred_neg)  # true negatives
fn = np.sum(actual_pos & pred_neg)  # false negatives

print('total:', len(ytest))
for name, count in (('tp:', tp), ('fp:', fp), ('tn:', tn), ('fn:', fn)):
    print(name, count)

total: 171
tp: 105
fp: 4
tn: 59
fn: 3


In [38]:
# Each metric is computed by hand from tp/fp/tn/fn and then with scikit-learn
# as a cross-check. "Positive" means label 1 throughout this cell.
# NOTE(review): data.target_names was printed as ['malignant' 'benign'], so
# label 1 appears to be 'benign'; the "truly sick" wording below would then
# describe the opposite class — confirm which class should be the positive one.

# If the test comes back positive, how trustworthy is that result?
# Precision = tp / (tp + fp): truly positive / everything predicted positive.
manual_precision = tp / (tp + fp)
print('precision:', manual_precision)
print('precision:', precision_score(ytest, ypred))

# Of everyone who is truly positive ("truly sick" in the original note), how
# many does the test detect?
# Recall = tp / (tp + fn): truly positive and detected / everything truly positive.
manual_recall = tp / (tp + fn)
print('recall:', manual_recall)
print('recall:', recall_score(ytest, ypred))

# If the test comes back negative, how trustworthy is that result?
# tn / (tn + fn): truly negative / everything predicted negative.
negative_predictive_value = tn / (tn + fn)
print(negative_predictive_value)

print('f1:', f1_score(ytest, ypred))

precision: 0.963302752293578
precision: 0.963302752293578
recall: 0.9722222222222222
recall: 0.9722222222222222
0.9516129032258065
f1: 0.9677419354838711


## Digits


In [39]:
from sklearn.datasets import load_breast_cancer
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np
import pandas

In [44]:
# Load the digits dataset once; features and labels come from the returned
# Bunch rather than from a second call to the loader.
data = load_digits()
X, Y = data.data, data.target

print(data.target_names)
print('X', X.shape)
print('Y', len(Y))

[0 1 2 3 4 5 6 7 8 9]
X (1797, 64)
Y 1797


In [45]:
# Hold out 30% of the digit samples for testing; the seed is fixed at 42.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [46]:
# Default multi-layer perceptron (seed fixed), trained on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = MLPClassifier(random_state=42).fit(Xtrain, ytrain)

# Predict on the held-out split and report plain accuracy.
ypred = model.predict(Xtest)
test_accuracy = accuracy_score(ytest, ypred)
print('accuracy_score', test_accuracy)

accuracy_score 0.9666666666666667


In [47]:
# Confusion matrix for (ytest, ypred) as floats; rows are true labels and
# columns are predicted labels (scikit-learn convention).
m = confusion_matrix(ytest, ypred).astype(float)

# Row sums: number of test samples that truly belong to each digit.
total = np.sum(m, axis=1)

# Convert counts to per-row fractions with one broadcast division (replaces
# the manual per-row loop), rounded to two decimals for display.
m = np.round(m / total[:, np.newaxis], 2)

df = pandas.DataFrame(m, columns=data.target_names, index=data.target_names)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.98,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.02,0.93,0.0,0.02,0.0,0.0,0.04,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.92,0.02,0.02,0.0,0.05
6,0.02,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.02
8,0.0,0.02,0.02,0.0,0.0,0.02,0.0,0.0,0.93,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.98


In [50]:
# Macro-averaged precision, recall and F1 for the multi-class predictions;
# all three scorers are called with the same arguments.
for name, metric in (
    ('macro-precision:', precision_score),
    ('macro-recall:', recall_score),
    ('macro-f1:', f1_score),
):
    print(name, metric(ytest, ypred, average='macro'))

macro-precision: 0.9656300897606871
macro-recall: 0.9667534088527091
macro-f1: 0.9657894958095312


In [57]:
# Check how balanced the data set is: list each distinct label together with
# the number of samples that carry it.
print('Y', Y)
labels_and_counts = np.unique(Y, return_counts=True)
print(labels_and_counts)

Y [0 1 2 ... 8 9 8]
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([178, 182, 177, 183, 181, 182, 181, 179, 174, 180], dtype=int64))


# Imbalanced dataset

In [64]:
# Imbalanced toy example: nine samples of class 1 and a single sample of class 0.
ytest = [1,1,1,1,1,1,1,1,1,0]
ypred = [1,1,1,1,1,1,1,1,1,1]  # the model always answers 1, whatever the input
print('accuracy:', accuracy_score(ytest, ypred))
print('f1:', f1_score(ytest, ypred))
print('macro f1:', f1_score(ytest, ypred, average='macro'))

# Row-normalised confusion matrix (rows: true label, columns: predicted label).
m = confusion_matrix(ytest, ypred).astype(float)
total = np.sum(m, axis=1)
# One broadcast division replaces the manual per-row loop.
m = np.round(m / total[:, np.newaxis], 3)
df = pandas.DataFrame(m, columns=['0','1'], index=['0','1'])
print(df)

print(' ')
# Mirror case: nine samples of class 0 and a single sample of class 1.
ytest = [0,0,0,0,0,0,0,0,0,1]
ypred = [0,0,0,0,0,0,0,0,0,0]  # the model always answers 0, whatever the input
print('accuracy:', accuracy_score(ytest, ypred))
print('f1:', f1_score(ytest, ypred))
print('macro f1:', f1_score(ytest, ypred, average='macro'))

accuracy: 0.9
f1: 0.9473684210526316
macro f1: 0.4736842105263158
     0    1
0  0.0  1.0
1  0.0  1.0
 
accuracy: 0.9
f1: 0.0
macro f1: 0.4736842105263158
