## Apress - Industrialized Machine Learning Examples

Andreas Francois Vermeulen
2019

### This example is an add-on to the book and must be treated as covered by the book's copyright.

# Chapter 005 Example 012

## Part A - Load Libraries

In [1]:
import datetime
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Fix the seed of NumPy's global random generator so the random draws below are repeatable.
np.random.seed(seed=0)

In [2]:
# Raw feature column names as they appear in the data file: F01 .. F04.
sfeature = ['F%02d' % n for n in range(1, 5)]

# Readable class names; later cells index this array with the numeric class code.
rosenames = np.array([
    'Rosaceae (Blue)',
    'Floribunda (White)',
    'Rosa kordesii (Red)',
])

## Part B - Load the Roses dataset

In [3]:
# Location of the roses data file, relative to the current working directory.
fileName = '../../Data/Roses05.csv'

# Resolve it to an absolute path and echo it, so it is clear which file gets loaded.
fileFullName = os.path.abspath(fileName)
print(fileFullName)

C:\Users\AndreVermeulen\Documents\My Book\apress\Industrial Machine Learning\book\GitHub\Upload\industrial-machine-learning\Data\Roses05.csv


In [4]:
# Read the CSV; the first row of the file supplies the column names.
datadf = pd.read_csv(fileFullName, header=0)

# Quick sanity check: table dimensions first, then the column labels.
for summary in (datadf.shape, datadf.columns):
    print(summary)

(600, 6)
Index(['F01', 'F02', 'F03', 'F04', 'T01', 'T02'], dtype='object')


In [5]:
# Independent (deep) copy of just the feature columns, so later changes leave datadf untouched.
data_X = datadf.loc[:, sfeature].copy()

In [6]:
# Take the 'T02' column as the target and name it 'T'.
# A Series has no `columns` axis: assigning to `data_y.columns` only sets a stray
# attribute and renames nothing, so the Series is renamed explicitly instead.
data_y = datadf['T02'].copy(deep=True).rename('T')

# Compact integer class codes; a later cell uses them to look up class names in rosenames.
yc = np.array(data_y, dtype=np.int8)

In [7]:
# Work on a separate copy of the features and give the columns readable labels.
readable_labels = ['Leaf Length (mm)','Leaf Width (mm)', 'Stem length (mm)', 'Stem width (mm)']
df = data_X.copy()
df.columns = readable_labels

# Preview the first five rows
print(df.head())

   Leaf Length (mm)  Leaf Width (mm)  Stem length (mm)  Stem width (mm)
0             4.209            2.995             1.043            0.052
1             4.264            3.045             1.011            0.173
2             4.300            3.000             1.100            0.100
3             4.309            2.895             1.343            0.152
4             4.309            2.995             1.243            0.152


In [8]:
# Attach the readable class name of every row (looked up from its numeric code);
# this is the label the model will be trained to predict.
df['Flower Color'] = pd.Categorical.from_codes(codes=yc, categories=rosenames)

## Part C - Split into Training and Test Data Sets with np.random.uniform

In [9]:
# Draw one uniform random number between 0 and 1 per row. A row is flagged True
# (training data) when its draw is at most .75 and False (test data) otherwise,
# which gives a quick random assignment of rows to the two sets.
draws = np.random.uniform(low=0, high=1, size=len(df))
df['is_train'] = draws <= .75

# Preview the first five rows
print(df.head())

   Leaf Length (mm)  Leaf Width (mm)  Stem length (mm)  Stem width (mm)  \
0             4.209            2.995             1.043            0.052   
1             4.264            3.045             1.011            0.173   
2             4.300            3.000             1.100            0.100   
3             4.309            2.895             1.343            0.152   
4             4.309            2.995             1.243            0.152   

      Flower Color  is_train  
0  Rosaceae (Blue)      True  
1  Rosaceae (Blue)      True  
2  Rosaceae (Blue)      True  
3  Rosaceae (Blue)      True  
4  Rosaceae (Blue)      True  


In [10]:
# Split the rows into two dataframes using the boolean flag:
# flagged rows form the training set, the remaining rows form the test set.
train_mask = df['is_train']
train = df[train_mask]
test = df[~train_mask]

In [11]:
# Report how many observations landed in each of the two splits
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 457
Number of observations in the test data: 143


In [12]:
# The first four columns of df are the model inputs; keep their labels as an array
feature_labels = df.columns[0:4]
features = np.array(feature_labels)

# Show the selected feature labels
print(features)

['Leaf Length (mm)' 'Leaf Width (mm)' 'Stem length (mm)' 'Stem width (mm)']


## Part D - Apply factorize to Training Data

In [13]:
# train['Flower Color'] holds the class names. The classifier needs integer
# targets, so each name is converted to its code 0, 1 or 2.
# The column is categorical with rosenames as its categories, so its category
# codes are exactly the positions in rosenames. pd.factorize() would instead
# number the classes in order of first appearance in the training rows, which
# is not guaranteed to match rosenames and would mislabel the predictions when
# they are mapped back to names with rosenames[...].
y = np.asarray(train['Flower Color'].cat.codes)

# View target
print(y.reshape(1,-1))

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 1 0 0 0 0 0 0 1 2 0 0 0 0 1 1 0 0 0 0 0 0
  0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
  0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1
  0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 0 1 1 1 0 1 0 0 0 1 1 1 0 1
  1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 0 0 1 1 2 1 1 1 1 1 0 0 2 1 1 1 0 0 2 1 1
  1 0 0 1 1 2 2 2 0 1 1 2 2 2 1 1 0 0 1 1 1 2 2 0 1 2 1 1 2 1 1 1 2 2 2 0
  1 2 1 1 2 1 2 2 1 1 2 1 1 2 1 2 1 1 2 2 1 1 1 1 2 2 1 1 1 1 2 1 2 1 1 1
  1 2 1 2 1 2 2 1 2 1 2 1 2 1 2 2 2 1 2 2 1 2 2 1 2 2 1 2 1 2 1 1 2 2 2 2
  2 2 2 2 2 2 2 2 2 1 2 1 2 1 1 2 2 1 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 2
  1 2 1 1 2 2 2 2 1 1 2 1 1 2 2 2 1 2 1 1 2 2 2 1 2 1 1 2 2 2 1 2 2 1 2 2
  2 2 1 1 2 2 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 1 1 2 2 2 2 1 2 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]]


## Part E - Apply RandomForestClassifier

In [14]:
# Set up the random forest classifier (clf is the conventional short name for a
# classifier): 150 trees, a fixed random_state for repeatable results, and
# n_jobs=-1 for parallel execution.
clf = RandomForestClassifier(n_estimators=150, random_state=0, n_jobs=-1)

In [15]:
# Fit the classifier: learn how the training feature columns relate to the
# integer class codes held in y
clf.fit(X=train[features], y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

## Part F - Predict Test Results

In [16]:
# Use the trained classifier to predict a class code for every row of the test data
test_inputs = test[features]
y_pred = clf.predict(test_inputs)

In [17]:
# Per-class predicted probabilities, shown for the first 10 test observations
test_probabilities = clf.predict_proba(test[features])
test_probabilities[:10]

array([[1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.97333333, 0.02      , 0.00666667],
       [1.        , 0.        , 0.        ],
       [0.97333333, 0.02      , 0.00666667],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ]])

## Part G - Display the predicted Test Results

In [18]:
# Translate each predicted class code into its readable class name.
# y_pred already holds the predictions for test[features], so it is reused here
# instead of running the forest over the test set a second time.
preds = rosenames[y_pred]

In [19]:
divider = '----------------------------------'

# Show the PREDICTED class names for the first 20 test observations, one per line
print(divider)
print('Predicted Species')
print(divider)
print(preds[:20].reshape(-1, 1))
print(divider)
# Show the ACTUAL class names for the first 20 test observations
print('Actual Species')
print(divider)
print(test['Flower Color'].head(20))

----------------------------------
Predicted Species
----------------------------------
[['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosa kordesii (Red)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']
 ['Rosaceae (Blue)']]
----------------------------------
Actual Species
----------------------------------
7         Rosaceae (Blue)
8         Rosaceae (Blue)
10        Rosaceae (Blue)
13        Rosaceae (Blue)
17        Rosaceae (Blue)
18        Rosaceae (Blue)
19        Rosaceae (Blue)
20        Rosaceae (Blue)
21        Rosaceae (Blue)
23        Rosaceae (Blue)
27        Rosaceae (Blue)
31        Rosaceae (Blue)
38        Rosaceae (Blue)
52        Rosaceae (Blue)
66    Rosa kordesii (Red)
68        Ro

In [20]:
# Confusion matrix: one row per actual class, one column per predicted class
pd.crosstab(
    index=test['Flower Color'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,Floribunda (White),Rosa kordesii (Red),Rosaceae (Blue)
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rosaceae (Blue),0,0,41
Floribunda (White),53,4,0
Rosa kordesii (Red),0,45,0


In [21]:
# Print every feature name next to its importance score, scaled by 100 to read as a percentage
for feature_name, importance in zip(train[features], clf.feature_importances_):
    print(' %20s : %7.4f %%' % (feature_name, importance * 100))

     Leaf Length (mm) :  9.3581 %
      Leaf Width (mm) :  3.2167 %
     Stem length (mm) : 43.2882 %
      Stem width (mm) : 44.1370 %


## Done

In [22]:
# Report the completion time of the run (datetime is imported in the top import cell).
now = datetime.datetime.now()
print('Done!', str(now))

Done! 2019-10-19 17:54:19.095458
