# Random Forest Practice Lab

In [23]:
import pandas as pd
import pylab as plt
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
%matplotlib inline 
import matplotlib.pyplot as plt
from sklearn import metrics
import random

In this lab we'll look at Random Forest models using a customer churn dataset. The goal is to predict which customers will stay and which will leave, based on some subscription and usage metrics.

In [24]:
# Helper methods for accuracy and correlations
def get_accuracy(X_train, X_test, y_train, y_test, model):
    return  {
        "test Accuracy": metrics.accuracy_score(y_test, model.predict(X_test)),
        "train Accuracy": metrics.accuracy_score(y_train, model.predict(X_train))
        }
    
def get_correlation(X_test, y_test,models):
    #This function calculates the average correlation between predictors  
    n_estimators=len(models.estimators_)
    prediction=np.zeros((y_test.shape[0],n_estimators))
    predictions=pd.DataFrame({'estimator '+str(n+1):[] for n in range(n_estimators)})
    
    for key,model in zip(predictions.keys(),models.estimators_):
        predictions[key]=model.predict(X_test.to_numpy())
    
    corr=predictions.corr()
    print("Average correlation between predictors: ", corr.mean().mean()-1/n_estimators)
    return corr

In [25]:
df = pd.read_csv("./data/ChurnData.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 28 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tenure    200 non-null    float64
 1   age       200 non-null    float64
 2   address   200 non-null    float64
 3   income    200 non-null    float64
 4   ed        200 non-null    float64
 5   employ    200 non-null    float64
 6   equip     200 non-null    float64
 7   callcard  200 non-null    float64
 8   wireless  200 non-null    float64
 9   longmon   200 non-null    float64
 10  tollmon   200 non-null    float64
 11  equipmon  200 non-null    float64
 12  cardmon   200 non-null    float64
 13  wiremon   200 non-null    float64
 14  longten   200 non-null    float64
 15  tollten   200 non-null    float64
 16  cardten   200 non-null    float64
 17  voice     200 non-null    float64
 18  pager     200 non-null    float64
 19  internet  200 non-null    float64
 20  callwait  200 non-null    float6

In [26]:
features = ['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip', 'callcard', 'wireless']
target = "churn"

churn_df = pd.concat((df[features], df[target]), axis=1)

To illustrate how the bootstrapping phase of Bagging (Bootstrap AGGregating) works we first have to look at resampling. This takes a subset of the data and resamples it, with replacement (meaning they can be chosen multiple times). This can be seen below, where some rows are repeated.

In [27]:
from sklearn.utils import resample

resample(churn_df[0:5])

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,churn
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,0.0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,0.0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,0.0
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,1.0
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,1.0


Another key feature of Bagging is that only a subset of features is used each iteration, this can be illustrated by randomly sampling the columns of our dataframe. We will choose $m$ out of $M$ variables randomly. 

The final line combines both resampling with features randomization to get the full Bagging experience.

In [28]:
M = churn_df.shape[1]
m = 3
feature_index = range(M)
selected_features = random.sample(feature_index, m)

resample(churn_df).iloc[:, selected_features]

Unnamed: 0,age,wireless,employ
85,30.0,0.0,4.0
150,43.0,0.0,17.0
41,57.0,0.0,7.0
74,27.0,0.0,1.0
163,37.0,1.0,8.0
...,...,...,...
91,22.0,1.0,0.0
131,43.0,0.0,4.0
55,35.0,0.0,8.0
44,33.0,1.0,7.0


## Train/Test data splitting and sklearn bagging

In [29]:
from sklearn.model_selection import train_test_split

X = churn_df[features]
y = churn_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

Next we'll build a BaggingClassifier with a basic DecisionTreeClassifier as the base estimator. Since Decision Trees tend to overfit, bagging counteracts this.

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

n_estimators = 20
bagging_classifier = BaggingClassifier(
    estimator=DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=2),
    n_estimators=n_estimators,
    random_state=0,
    bootstrap=True
)

bagging_classifier.fit(X_train, y_train)

get_accuracy(X_train, X_test, y_train, y_test, bagging_classifier)

{'test Accuracy': 0.7666666666666667, 'train Accuracy': 0.9214285714285714}

In [35]:
get_correlation(X_test, y_test, bagging_classifier).style.background_gradient(cmap='coolwarm')

Average correlation between predictors:  0.21390351918105088


Unnamed: 0,estimator 1,estimator 2,estimator 3,estimator 4,estimator 5,estimator 6,estimator 7,estimator 8,estimator 9,estimator 10,estimator 11,estimator 12,estimator 13,estimator 14,estimator 15,estimator 16,estimator 17,estimator 18,estimator 19,estimator 20
estimator 1,1.0,-0.040079,0.187491,0.251011,0.092462,0.156777,0.30757,0.450236,0.256928,0.297765,-0.060684,0.443785,0.232262,0.231759,0.250759,0.141595,0.188113,0.188113,0.28319,-0.087274
estimator 2,-0.040079,1.0,-0.002979,0.335171,0.349647,-0.011512,-0.096314,0.065795,0.124341,0.180022,0.516877,-0.074605,-0.078409,-0.090909,0.325626,0.409194,0.04413,0.04413,0.215365,-0.059131
estimator 3,0.187491,-0.002979,1.0,0.395985,-0.010903,0.397467,0.402147,0.558495,0.279553,0.365851,-0.034586,0.51917,0.552099,0.010428,0.225528,0.224807,0.561502,0.47973,0.345857,0.006783
estimator 4,0.251011,0.335171,0.395985,1.0,0.456572,0.155747,0.383257,0.456305,0.257287,0.417131,0.119978,0.415618,0.340807,0.067182,0.273887,0.205677,0.375523,0.294475,0.445634,0.19496
estimator 5,0.092462,0.349647,-0.010903,0.456572,1.0,0.194834,-0.035245,0.154782,-0.045502,0.352707,0.413758,-0.045502,0.099322,0.023762,0.386847,0.21673,0.068323,0.161491,0.394055,-0.100465
estimator 6,0.156777,-0.011512,0.397467,0.155747,0.194834,1.0,0.388449,0.335347,0.231455,0.153218,0.116927,0.231455,0.355529,0.040291,0.512296,0.116927,0.589768,0.352808,0.400892,0.026207
estimator 7,0.30757,-0.096314,0.402147,0.383257,-0.035245,0.388449,1.0,0.39036,0.464758,0.41352,-0.111803,0.568038,0.688847,0.13484,0.080115,0.111803,0.599171,0.387699,0.089443,0.219265
estimator 8,0.450236,0.065795,0.558495,0.456305,0.154782,0.335347,0.39036,1.0,0.377964,0.395483,0.03637,0.545949,0.437978,0.180937,0.336194,0.127294,0.584731,0.584731,0.363696,-0.042796
estimator 9,0.256928,0.124341,0.279553,0.257287,-0.045502,0.231455,0.464758,0.377964,1.0,0.234895,-0.096225,0.377778,0.188639,0.087039,0.186171,0.096225,0.318511,0.045502,0.11547,-0.113228
estimator 10,0.297765,0.180022,0.365851,0.417131,0.352707,0.153218,0.41352,0.395483,0.234895,1.0,0.332877,0.405727,0.357359,-0.044607,0.128542,0.517809,0.352707,0.265259,0.40685,0.07979


## Random Forest

With Random forest bagging is combined with feature randomization; only a subset of available features is used with each split within the decision trees during training. This makes the correlations between trees smaller than in standard Bagging strategies.

In [37]:
from sklearn.ensemble import RandomForestClassifier

n_estimators = 20
M_features = X.shape[1]
# Commonly used m = sqrt(M)
max_features = round(np.sqrt(M_features))-1

model = RandomForestClassifier(
    max_features=max_features,
    n_estimators=n_estimators, 
    random_state=0
    )

model.fit(X_train,y_train)
get_accuracy(X_train, X_test, y_train, y_test, model)

{'test Accuracy': 0.8, 'train Accuracy': 1.0}

In [38]:
get_correlation(X_test, y_test,model).style.background_gradient(cmap='coolwarm')

Average correlation between predictors:  0.20726504096013737


Unnamed: 0,estimator 1,estimator 2,estimator 3,estimator 4,estimator 5,estimator 6,estimator 7,estimator 8,estimator 9,estimator 10,estimator 11,estimator 12,estimator 13,estimator 14,estimator 15,estimator 16,estimator 17,estimator 18,estimator 19,estimator 20
estimator 1,1.0,0.228852,0.250759,0.250759,0.09711,0.292623,0.207804,0.207804,0.118208,0.231759,0.273673,0.500636,0.188113,0.1635,0.256928,0.140275,0.048791,0.064474,0.414727,0.129927
estimator 2,0.228852,1.0,0.126674,0.582699,0.308607,0.14825,0.370625,0.2965,0.339993,0.213201,0.0,0.387829,0.278639,0.163299,0.408248,0.13325,0.213201,0.030457,0.293151,0.334664
estimator 3,0.250759,0.126674,1.0,0.383825,0.179825,0.101408,0.101408,-0.048826,0.208052,0.237661,0.336194,0.347579,0.217425,0.4344,0.351657,0.15664,0.15664,0.233031,0.399702,0.151404
estimator 4,0.250759,0.582699,0.383825,1.0,0.414379,0.176526,0.476996,0.251643,0.446583,0.075619,0.258009,0.494962,0.302136,0.4344,0.517143,0.15664,0.15664,0.140436,0.399702,0.369426
estimator 5,0.09711,0.308607,0.179825,0.414379,1.0,0.282131,0.282131,0.053376,0.234061,0.098693,0.126984,0.456305,0.068792,0.125988,0.20998,0.016449,-0.065795,0.347774,0.098693,0.110657
estimator 6,0.292623,0.14825,0.101408,0.176526,0.282131,1.0,0.120879,-0.098901,0.081422,0.268659,0.205879,0.212015,0.421344,0.302614,0.221917,0.110624,0.189642,0.55538,0.268659,0.372104
estimator 7,0.207804,0.370625,0.101408,0.476996,0.282131,0.120879,1.0,0.047619,0.158966,0.110624,-0.022875,0.499493,0.256111,0.221917,0.383311,0.347677,0.189642,0.194157,0.110624,0.23035
estimator 8,0.207804,0.2965,-0.048826,0.251643,0.053376,-0.098901,0.047619,1.0,0.236511,0.189642,0.205879,0.068276,0.173494,-0.020174,0.060523,-0.04741,-0.126428,-0.07676,0.189642,0.159473
estimator 9,0.118208,0.339993,0.208052,0.446583,0.234061,0.081422,0.158966,0.236511,1.0,0.039031,0.153351,0.264986,0.352707,0.064062,0.320311,0.039031,0.039031,0.084435,0.206308,0.293836
estimator 10,0.231759,0.213201,0.237661,0.075619,0.098693,0.268659,0.110624,0.189642,0.039031,1.0,0.427669,0.299735,0.29109,0.261116,0.087039,-0.107955,0.147727,0.006494,0.147727,0.407718


## Random Forest With cancer cell data

In [40]:
df = pd.read_csv("./data/cell_samples.csv")
df= df[pd.to_numeric(df['BareNuc'], errors='coerce').notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           683 non-null    int64 
 1   Clump        683 non-null    int64 
 2   UnifSize     683 non-null    int64 
 3   UnifShape    683 non-null    int64 
 4   MargAdh      683 non-null    int64 
 5   SingEpiSize  683 non-null    int64 
 6   BareNuc      683 non-null    object
 7   BlandChrom   683 non-null    int64 
 8   NormNucl     683 non-null    int64 
 9   Mit          683 non-null    int64 
 10  Class        683 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 64.0+ KB


In [43]:
feature_cols = ['Clump', 'UnifSize', 'UnifShape', 'MargAdh', 'SingEpiSize', 'BareNuc', 'BlandChrom', 'NormNucl', 'Mit']
target = "Class"

X = df[feature_cols]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)

As usual, gridsearch also works fine with a randomforest classifier. Some of the main parameters are:
- number of estimators
- max depth of trees
- max features to use during feature randomisation

In [45]:
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier()

param_grid = {
    'n_estimators': [2*n+1 for n in range(20)],
    'max_depth' : [2*n+1 for n in range(10) ],
    'max_features':["auto", "sqrt", "log2"]
    }

search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy')
search.fit(X_train, y_train)

1000 fits failed out of a total of 3000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1000 fits failed with the following error:
Traceback (most recent call last):
  File "p:\02_Poetry\Cache\virtualenvs\ibm-ml-sD2suF2h-py3.12\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "p:\02_Poetry\Cache\virtualenvs\ibm-ml-sD2suF2h-py3.12\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
  File "p:\02_Poetry\Cache\virtualenvs\ibm-ml-sD2suF2h-py3.12\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "p:\02_Poetry\Cache\virtualenvs\ibm-ml-sD2suF2h-py3.12\Li

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'max_depth': [1, 3, ...], 'max_features': ['auto', 'sqrt', ...], 'n_estimators': [1, 3, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,35
,criterion,'gini'
,max_depth,11
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
get_accuracy(X_train, X_test, y_train, y_test, search.best_estimator_)

{'test Accuracy': 0.9708029197080292, 'train Accuracy': 1.0}

Which clearly pays off! The test accuracy is 97% now, and a perfect accuracy on the training set.

## Random Forest on Drug Data



In [47]:
df = pd.read_csv("./data/drug200.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


First get the desired features, then encode the categorical features using a LabelEncoder. Then we can split the data in train/test and start fitting the model.

In [49]:
from sklearn import preprocessing

X = df[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
y = df["Drug"]

le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F','M'])
X[:,1] = le_sex.transform(X[:,1]) 

le_BP = preprocessing.LabelEncoder()
le_BP.fit([ 'LOW', 'NORMAL', 'HIGH'])
X[:,2] = le_BP.transform(X[:,2])

le_Chol = preprocessing.LabelEncoder()
le_Chol.fit([ 'NORMAL', 'HIGH'])
X[:,3] = le_Chol.transform(X[:,3]) 

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)

To fit the model, we once again use gridsearch to find the best hyperparameters.

In [51]:
param_grid = {
    'n_estimators': [2*n+1 for n in range(20)],
    'max_depth' : [2*n+1 for n in range(10) ],
    'max_features':["auto", "sqrt", "log2"]
    }

model = RandomForestClassifier()

search = GridSearchCV(model, param_grid=param_grid, scoring='accuracy', cv=3)
search.fit(X_train, y_train)
search.best_params_

600 fits failed out of a total of 1800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
600 fits failed with the following error:
Traceback (most recent call last):
  File "p:\02_Poetry\Cache\virtualenvs\ibm-ml-sD2suF2h-py3.12\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "p:\02_Poetry\Cache\virtualenvs\ibm-ml-sD2suF2h-py3.12\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
  File "p:\02_Poetry\Cache\virtualenvs\ibm-ml-sD2suF2h-py3.12\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "p:\02_Poetry\Cache\virtualenvs\ibm-ml-sD2suF2h-py3.12\Lib\

{'max_depth': 5, 'max_features': 'log2', 'n_estimators': 21}

In [52]:
get_accuracy(X_train, X_test, y_train, y_test, search.best_estimator_)

{'test Accuracy': 0.95, 'train Accuracy': 1.0}