# Importing packages 

In [1]:
# Data wrangling 
import pandas as pd 

# Array math
import numpy as np 

# Ploting 
import seaborn as sns
import matplotlib.pyplot as plt

# List iteration tracking
from tqdm import tqdm

# Importing the custom written class 
from DecisionTree import Node 

# Importing the custom regression tree 
from RandomForest import RandomForestClassifier, RandomForestTree

# Time tracking
import time

# Precision metrics 
from sklearn.metrics import precision_score, recall_score

# Reading data 

The data regards telecom churn. 

The objective is to create a model that predicts whether a customer will quit using the features available.

In [2]:
d = pd.read_csv('data/random_forest/telecom_churn.csv')

In [3]:
print(f"Data shape: {d.shape}")

Data shape: (3333, 11)


In [4]:
d.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [5]:
d.dtypes

Churn                int64
AccountWeeks         int64
ContractRenewal      int64
DataPlan             int64
DataUsage          float64
CustServCalls        int64
DayMins            float64
DayCalls             int64
MonthlyCharge      float64
OverageFee         float64
RoamMins           float64
dtype: object

In [6]:
# Distribution of churn in data 
d.groupby('Churn').size()

Churn
0    2850
1     483
dtype: int64

# Random forest - quick theory review

The classifier which will be created is a random forest classifier. 

Lets denote it as **rf()**.  

Given a set of input matrix $\mathbb{X}_{nxp}$ the classifier **rf()** outputs either 1 or 0.

$$rf: \mathbb{X} \rightarrow \{1, 0\}$$

The algorithm of the random forest grows **k** decision trees. 

The final prediction of the **rf()** classifier is a majority vote: the input matrix $\mathbb{X}$ is used with each of the **k** trees, and then the class with the most outputs wins. 

In the notebook about decision trees it is clear that with the same input and the same hyperparameters, the same output and the same rules will be learnt by a decision tree. So why grow **k** of them? 

## Data bootstrapping

The random in the random forest starts at the data sample creation for each of the decision trees. The technique used in creating **k** datasamples is bootstrapping

Given a dataset of n rows and p features: we sample the rows from the original dataset with replacement. For every new decision tree *i*, a new bootsrapped dataset is created: $\mathbb{X_{b}^{i}}$.

For example, lets assume that the whole dataset has 10 rows of data:

In [7]:
# Lets imagine this the whole dataset
dsubset = d.sample(10).copy()[['Churn', 'DataPlan', 'DayMins', 'OverageFee']]
dsubset.reset_index(inplace=True, drop=True)

print(dsubset)

   Churn  DataPlan  DayMins  OverageFee
0      0         0    156.2        4.50
1      0         0    259.3        8.76
2      0         0    247.4        8.80
3      0         0    149.7       10.63
4      0         1    155.9        8.12
5      0         0    191.0       15.94
6      0         0    146.0        5.49
7      0         0    203.7       10.82
8      0         0    162.7       14.60
9      0         0    219.4       11.29


To create 3 more random bootsrapped samples we use the pandas function **sample(replace=True)**. The key concept is that the sampling is done *with replacement*: the same rows might appear several times in our sample. 

In [8]:
for i, _ in enumerate(range(3)):
    print("----- \n")
    print(f"Boostrapped sample: {i + 1} \n")
    print(dsubset.sample(frac=1.0, replace=True))
    print("----- \n")

----- 

Boostrapped sample: 1 

   Churn  DataPlan  DayMins  OverageFee
4      0         1    155.9        8.12
1      0         0    259.3        8.76
1      0         0    259.3        8.76
4      0         1    155.9        8.12
4      0         1    155.9        8.12
5      0         0    191.0       15.94
5      0         0    191.0       15.94
5      0         0    191.0       15.94
0      0         0    156.2        4.50
4      0         1    155.9        8.12
----- 

----- 

Boostrapped sample: 2 

   Churn  DataPlan  DayMins  OverageFee
7      0         0    203.7       10.82
8      0         0    162.7       14.60
4      0         1    155.9        8.12
2      0         0    247.4        8.80
1      0         0    259.3        8.76
4      0         1    155.9        8.12
0      0         0    156.2        4.50
1      0         0    259.3        8.76
9      0         0    219.4       11.29
4      0         1    155.9        8.12
----- 

----- 

Boostrapped sample: 3 

   Churn

For each of the **k** trees grown in random forest, we create **k** bootstrapped data samples. 

## Feature selection at each split 

Now that we have a dataset $\mathbb{X_{b}^{i}}$ for each of the **k** trees the final part is to determine the splitting criterion for the creation of the nodes. 

In the classification case, the gini gain criterion is the same as in the simple decision tree case. The difference is that at each node splitting, a random subsample of collumns are select to find the "best split". 

For example, if we have 10 collumns as features and we select the hyperparameter of **X_features_fraction = 0.8** then at each node where the best split is beeing calculated, we would select 8 random features (10 * 0.8 = 8).  

# Features to use 

The bellow feature list will be used in the creation of the random forest. 

In [9]:
# Defining the feature list used in the growth of the tree
features = [
    'AccountWeeks',
    'DataUsage',
    'DayMins',
    'DayCalls',
    'MonthlyCharge',
    'OverageFee',
    'RoamMins'
]

In [10]:
d[features + ['Churn']].sample(10)

Unnamed: 0,AccountWeeks,DataUsage,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins,Churn
977,93,3.19,134.2,105,68.9,8.13,11.8,1
3104,63,2.78,214.2,61,79.8,9.06,10.3,0
355,31,0.28,166.1,105,37.8,3.97,12.7,0
845,144,2.7,283.9,98,92.0,9.6,10.0,0
1943,125,0.0,168.6,99,44.0,8.78,10.9,0
1219,36,0.0,178.6,83,49.0,10.66,10.9,0
94,90,0.0,179.1,71,47.0,9.53,10.6,0
203,105,0.0,140.6,109,40.0,8.93,6.8,0
1423,127,2.81,95.9,117,58.1,7.98,10.4,0
1370,96,0.0,179.5,125,45.0,8.12,6.6,0


# Creating the train and test sets 

In [11]:
# Fraction of rows in the training set 
train_share = 0.75

# Creating the train and test sets
train = d.sample(frac=train_share)
test = d[~d.index.isin(train.index)].copy()

print(f"Total rows in the dataset: {d.shape[0]}")
print(f"Rows in training set: {train.shape[0]}")
print(f"Rows in test set: {test.shape[0]}")

Total rows in the dataset: 3333
Rows in training set: 2500
Rows in test set: 833


# Training the random forest 

In [12]:
# Initiating the random forest object 
rf = RandomForestClassifier(
    Y=train['Churn'], 
    X=train[features],
    min_samples_split=5,
    max_depth=3,
    n_trees=30, # Number of trees grown
    X_features_fraction=0.75
    )

# Growing the random forest 
rf.grow_random_forest()

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [02:34<00:00,  5.14s/it]


In [13]:
# Printing out the trees
if rf.n_trees < 10:
    rf.print_trees()

# Predictions

In [14]:
yhat = rf.predict(test[features])
test['yhat'] = yhat

print(f"Total churns in test set: {test['Churn'].sum()}")
print(f"Total predicted churns in test set: {test['yhat'].sum()}")

print(f"Precision: {round(precision_score(test['Churn'], test['yhat']), 2) * 100} %")
print(f"Recall: {round(recall_score(test['Churn'], test['yhat']), 2) * 100} %")

Total churns in test set: 116
Total predicted churns in test set: 44
Precision: 75.0 %
Recall: 28.000000000000004 %


# Sklearn implementation 

We can compare the custom implementation of RF to that of skicit learn. 

In [15]:
# Skicit learn implementation
from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierScikit

# Initiating
rf_scikit = RandomForestClassifierScikit(n_estimators=30, max_features=0.75, max_depth=3, min_samples_split=5)

# Fitting 
start = time.time()
rf_scikit.fit(X=train[features], y=train['Churn'])
print(f"Time took for scikit learn: {round(time.time() - start, 2)} seconds")

# Forecasting 
yhatsc = rf_scikit.predict(test[features])
test['yhatsc'] = yhatsc

print(f"Total churns in test set: {test['Churn'].sum()}")
print(f"Total predicted churns in test set: {test['yhat'].sum()}")

print(f"Precision: {round(precision_score(test['Churn'], test['yhatsc']), 2) * 100} %")
print(f"Recall: {round(recall_score(test['Churn'], test['yhatsc']), 2) * 100} %")

Time took for scikit learn: 0.05 seconds
Total churns in test set: 116
Total predicted churns in test set: 44
Precision: 79.0 %
Recall: 26.0 %
