In [89]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [90]:
np.random.seed(0) #Para que los resultados sean similares siempre

## Convertir los datos

In [91]:
iris = load_iris() 
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [92]:
df = pd.DataFrame(iris.data , columns = iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [93]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [94]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [95]:
#Uso las categorías y nombres de iris, para hacer una columna con las categorias
df['species'] = pd.Categorical.from_codes(iris.target,iris.target_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## Separar los datos en prueba y entrenamiento  

In [96]:
train = df.sample(frac=0.8,random_state=0)
len(train)

120

In [97]:
test = df.sample(frac=0.2,random_state=0)
len(test)

30

## Variables

In [98]:
features = df.columns[:4]
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [99]:
x_p = test[features]
x_p.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
114,5.8,2.8,5.1,2.4
62,6.0,2.2,4.0,1.0
33,5.5,4.2,1.4,0.2
107,7.3,2.9,6.3,1.8
7,5.0,3.4,1.5,0.2


In [100]:
x = train[features]
x.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
114,5.8,2.8,5.1,2.4
62,6.0,2.2,4.0,1.0
33,5.5,4.2,1.4,0.2
107,7.3,2.9,6.3,1.8
7,5.0,3.4,1.5,0.2


In [101]:
y = pd.factorize(train.species)[0]
y

array([0, 1, 2, 0, 2, 0, 2, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 2, 2, 0, 1,
       2, 2, 0, 2, 2, 1, 1, 2, 0, 1, 2, 0, 0, 1, 2, 1, 1, 1, 0, 2, 0, 2,
       2, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 2, 0, 1, 1, 1,
       1, 0, 2, 2, 0, 1, 2, 2, 1, 2, 0, 1, 2, 1, 0, 1, 2, 0, 0, 0, 0, 2,
       2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 1, 0, 0, 2, 2, 2, 1, 1,
       2, 2, 1, 2, 0, 1, 0, 1, 2, 0])

In [102]:
y_p = pd.factorize(test.species)[0]

## Modelo

In [103]:
clf = RandomForestClassifier(random_state=0)
reg = clf.fit(x,y)

In [104]:
#Utilizo la que del df test para ver si el modelo precide bien
pred=reg.predict(x_p)
pred

array([0, 1, 2, 0, 2, 0, 2, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 2, 2, 0, 1,
       2, 2, 0, 2, 2, 1, 1, 2])

### Prueba la accurancy del modelo

In [105]:
accurancy = accuracy_score(pred,y_p)
accurancy

1.0

In [106]:
iris.target_names[pred]

array(['setosa', 'versicolor', 'virginica', 'setosa', 'virginica',
       'setosa', 'virginica', 'versicolor', 'versicolor', 'versicolor',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'setosa', 'versicolor', 'virginica', 'virginica', 'setosa',
       'virginica', 'virginica', 'versicolor', 'versicolor', 'virginica'],
      dtype='<U10')

In [107]:
pd.crosstab(index=iris.target_names[pred],columns=test['species'])

species,setosa,versicolor,virginica
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,0,0,6
versicolor,0,13,0
virginica,11,0,0


In [117]:
#Ver los pesos de cada variable
clf.feature_importances_

array([0.08736339, 0.01946283, 0.45809032, 0.43508347])

In [115]:
list(zip(train[features],clf.feature_importances_))

[('sepal length (cm)', 0.08736338713897077),
 ('sepal width (cm)', 0.01946282782024386),
 ('petal length (cm)', 0.45809031929424343),
 ('petal width (cm)', 0.435083465746542)]