In [1]:
from warnings import simplefilter

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Notebook-wide settings: silence ConvergenceWarning messages and make every
# figure default to a 16 x 10 size.
simplefilter(action="ignore", category=ConvergenceWarning)
plt.rcParams["figure.figsize"] = [16, 10]

In [3]:
# Load the Dry Bean dataset from its Excel workbook; the relative path is
# resolved against the kernel's working directory.
dataset = pd.read_excel(io="Dry_Bean_Dataset.xlsx")

In [4]:
# Preview the first five rows of the raw frame.
dataset.head(n=5)

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


In [5]:
# Re-type the target column as a pandas categorical, then store each row's
# integer category code in a new "label" column and preview the result.
dataset["Class"] = pd.Categorical(dataset["Class"])
dataset["label"] = dataset["Class"].cat.codes
dataset.head(n=5)

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class,label
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER,5
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER,5
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER,5
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER,5
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER,5


In [6]:
# Feature columns = every column except the target ("Class") and its integer
# encoding ("label"). Selecting by name instead of "all but the last two"
# keeps this correct if the column order changes, and raises a KeyError (rather
# than silently dropping a real feature) if "label" has not been created yet.
ft_columns = dataset.columns.drop(["Class", "label"])

In [9]:
# X: the feature matrix (columns listed in ft_columns).
# Y: the target vector (integer class codes from the "label" column).
X = dataset[ft_columns]
Y = dataset.loc[:, "label"]

In [10]:
# Two-stage split with a fixed seed:
#   1. hold out 25% of all rows as the test set;
#   2. hold out 25% of the remaining rows as the validation set.
# Net effect: 56.25% train / 18.75% validation / 25% test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=10,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=10,
)

In [11]:
# Display the full feature matrix and target vector as a pair.
(X, Y)

(        Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRation  \
 0      28395    610.291       208.178117       173.888747      1.197191   
 1      28734    638.018       200.524796       182.734419      1.097356   
 2      29380    624.110       212.826130       175.931143      1.209713   
 3      30008    645.884       210.557999       182.516516      1.153638   
 4      30140    620.134       201.847882       190.279279      1.060798   
 ...      ...        ...              ...              ...           ...   
 13606  42097    759.696       288.721612       185.944705      1.552728   
 13607  42101    757.499       281.576392       190.713136      1.476439   
 13608  42139    759.321       281.539928       191.187979      1.472582   
 13609  42147    763.779       283.382636       190.275731      1.489326   
 13610  42159    772.237       295.142741       182.204716      1.619841   
 
        Eccentricity  ConvexArea  EquivDiameter    Extent  Solidity  roundness  \
 0  

In [12]:
# Sweep the width of a single hidden layer (1..99 units). For each width record:
#   - k_scores_train:      mean accuracy over 5-fold cross-validation on the training split
#   - k_scores_train_full: accuracy on the training split after fitting on all of it
#   - k_scores_valid:      accuracy on the validation split for that same fit
hidden_sizes = range(1, 100)

k_scores_train = []
k_scores_train_full = []
k_scores_valid = []

for k in hidden_sizes:
    # The features span very different magnitudes (Area is in the tens of
    # thousands, the ShapeFactor columns are well below 1) and MLPs are
    # sensitive to feature scale, so standardise inside a pipeline. Because the
    # scaler is part of the estimator, it is re-fitted on the training portion
    # of every CV fold and never sees held-out rows.
    # Note: clf is a Pipeline; the fitted network itself is clf[-1].
    clf = make_pipeline(
        StandardScaler(),
        MLPClassifier(hidden_layer_sizes=(k,), random_state=10,
                      early_stopping=True, n_iter_no_change=5),
    )
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    k_scores_train.append(scores.mean())

    # cross_val_score fits clones of clf, so fit explicitly before scoring.
    clf.fit(X_train, y_train)
    k_scores_train_full.append(clf.score(X_train, y_train))
    k_scores_valid.append(clf.score(X_valid, y_valid))

# Plot the three accuracy curves against hidden-layer width. Legend entries
# follow plot order: CV mean on train, full-train accuracy, validation accuracy.
fig, ax = plt.subplots()
ax.plot(hidden_sizes, k_scores_train)
ax.plot(hidden_sizes, k_scores_train_full)
ax.plot(hidden_sizes, k_scores_valid)
ax.legend(('Trainamento', 'Trainamento Full', 'Validacao'),
          loc='upper center', shadow=True)
ax.set_xlabel('Hidden nodes in a single layer')
# Only the first curve is cross-validated, so label the axis generically.
ax.set_ylabel('Accuracy')
plt.show()