In [1]:
from sklearn.datasets import load_iris 

from sklearn.ensemble import RandomForestClassifier

import pandas as pd 

import numpy as np 


In [2]:
np.random.seed(0)

In [3]:
iris = load_iris()

In [4]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [5]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
df['species']=pd.Categorical.from_codes(iris.target, iris.target_names)

In [7]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
#un indice equivalente al random del dataset, con 80%
trainRandom = np.random.rand(len(df))<0.8

In [9]:
#sacamos ese 80% para entrenar
train = df[trainRandom]

In [10]:
#y el 20% restante para probar, importante que el simbolo ~ significa un 'no esta en'
test = df[~trainRandom]

In [11]:
print('Numero de obs en el set de entrenamiento =', len(train))

Numero de obs en el set de entrenamiento = 123


In [12]:
print('Numero de obs en el set de prueba =', len(test))

Numero de obs en el set de prueba = 27


In [13]:
#Tomamos las columnas que usaremos para entrenar el modelo
features = df.columns[:4]

In [14]:
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [15]:
df.species

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [16]:
#reemplazamos los valores de texto a numericos factorizando 
y = pd.factorize(train['species'])[0]

In [17]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [18]:
clf = RandomForestClassifier(n_jobs=2, random_state=0)

In [19]:
#el mensaje que sale, es solo un warning  que no afecta, solo nos da info del algoritmo
clf.fit(train[features],y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [20]:
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2], dtype=int64)

In [21]:
preds = iris.target_names[clf.predict(test[features])]

In [22]:
#muestro las 1ras 15
preds[0:15]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'virginica', 'versicolor', 'versicolor', 'versicolor'],
      dtype='<U10')

In [23]:
test['species'].head()

7     setosa
8     setosa
13    setosa
17    setosa
19    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [24]:
pd.crosstab(test['species'], preds, rownames=['Actual species'],colnames=['predicted species'])

predicted species,setosa,versicolor,virginica
Actual species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,8,0,0
versicolor,0,6,1
virginica,0,0,12


In [25]:
list(zip(train[features], clf.feature_importances_))

[('sepal length (cm)', 0.12304913856975667),
 ('sepal width (cm)', 0.025037236898720915),
 ('petal length (cm)', 0.3542535406803057),
 ('petal width (cm)', 0.49766008385121674)]