In [26]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import umap.umap_ as umap

In [3]:
df_data = pd.read_csv('../data/train.csv')
df_data.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


## Génération des données d'entrainement et de test

In [24]:
X = df_data.copy().drop(['id', 'species'],axis='columns').values

species = df_data['species'].unique()

# Création des vecteurs one-hot pour la matrice de targets
t = np.zeros((len(X),species.size)) 
for i in range(species.size):
    t[df_data['species'] == species[i], i] = 1

In [32]:
np.sum(t, axis=0)

array([10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10.])

Nous séparons en deux et de manièere aléatoire le jeu de données d'origine avec : 
- 70% qui constituerons les données d'entrainement;
- 30% qui constitueront les données de test;

De plus, nous avons dans notre dataset 99 espaces représentées chacune par 10 individus.
Si nous appliquons un ration 70% pour l'entrainement, 30% pour le test, nous aurons en moyenne :
- 7 individus par espèce pour les données d'entrainement
- 3 individus par espèce pour les données de test

In [33]:
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.3, random_state=0)

Nous obtenons ainsi les résultat suivant :

In [34]:
print("size X_train: "+str(X_train.shape))
print("size t_train: "+str(t_train.shape))
print()
print("size X_test: "+str(X_test.shape))
print("size t_test: "+str(t_test.shape))

size X_train: (693, 192)
size t_train: (693, 99)

size X_test: (297, 192)
size t_test: (297, 99)


## Entrainement du modèle

In [39]:
model = MLPClassifier(random_state=1, max_iter=10000)
model.fit(X_train, t_train)
model.score(X_train,t_train)

1.0

## Test du model

In [40]:
model.score(X_test, t_test)

0.7946127946127947