In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from sklearn.datasets import load_digits, load_boston
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error
from RandomForest import RandomForest

In [3]:
digits = load_digits()

In [4]:
# Hold out 30% of the digits samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.3,
    random_state=17,
)

#### Sklearn randomforest classifier results for reference

In [5]:
from sklearn.ensemble import RandomForestClassifier

Using gini criterion

In [6]:
# Reference run: scikit-learn forest with no criterion passed (the gini case).
rfc = RandomForestClassifier(n_estimators=10, random_state=17)
# fit() hands back the fitted estimator, so predict() can be chained onto it.
preds = rfc.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, preds)

0.9407407407407408

Using entropy criterion

In [7]:
# Reference run: same scikit-learn forest, splitting on entropy instead.
rfc = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=17,
)
# fit() hands back the fitted estimator, so predict() can be chained onto it.
preds = rfc.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, preds)

0.9444444444444444

#### Using Random Forest Implementation 

**Note:** Here the 'max_features' candidate features are selected using a simple implementation of the Fisher–Yates shuffle. In addition to this, sklearn avoids using features that are constant (and so cannot improve the impurity), which might account for its 1-2% higher accuracy.<br>
<br>
Also, the dataset subsets (bootstrap samples) are generated here with a simple NumPy random choice with replacement.

In [8]:
rfc = RandomForest(n_estimators=10, random_seed=17)

In [9]:
rfc.fit(X_train, y_train)

In [10]:
preds= rfc.predict(X_test)

In [11]:
accuracy_score(y_test, preds)

0.9259259259259259

Let's now use entropy.

In [12]:
rfc = RandomForest(n_estimators=10, random_seed=17, criterion='entropy')

In [13]:
rfc.fit(X_train, y_train)

In [14]:
preds = rfc.predict(X_test)

In [15]:
accuracy_score(y_test, preds)

0.912962962962963

## Gridsearch

We tried different 'max_depth' values, but the test accuracy was lower than when max_depth is not set.

In [16]:
rfc = RandomForest(n_estimators=10, random_seed=17)

In [17]:
# Candidate forest sizes and split criteria; every combination is scored
# by accuracy under 5-fold cross-validation.
rf_params = dict(
    n_estimators=[10, 20, 40, 50, 100],
    criterion=['gini', 'entropy'],
)
rf_grid = GridSearchCV(
    estimator=rfc,
    param_grid=rf_params,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

In [18]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 14.3min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForest(criterion='gini', max_depth=inf,
                                    max_features=0, min_samples_split=2,
                                    n_estimators=10, random_seed=17),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'n_estimators': [10, 20, 40, 50, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [19]:
rf_grid.best_params_, rf_grid.best_score_

({'criterion': 'gini', 'n_estimators': 10}, 0.913283374438753)

In [20]:
preds = rf_grid.predict(X_test)

In [21]:
accuracy_score(y_test, preds)

0.9277777777777778

# Regression Random Forest

We'll use the Boston house pricing dataset from sklearn for the regression problem. Here the criteria we use are variance ('mse') and mad_median ('mae'), and for scoring we'll use mean_squared_error/neg_mean_squared_error.

In [22]:
ds = load_boston()

In [23]:
X_train, X_test, y_train, y_test = train_test_split(ds.data, ds.target, test_size=0.3, random_state=17)

### sklearn randomforestregressor results for reference

Using mse criterion

In [24]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
# Reference run: scikit-learn regressor splitting on the 'mse' criterion.
rfc_reg = RandomForestRegressor(
    n_estimators=10,
    criterion='mse',
    random_state=17,
)
# fit() hands back the fitted estimator, so predict() can be chained onto it.
pred_reg = rfc_reg.fit(X_train, y_train).predict(X_test)
mean_squared_error(y_test, pred_reg)

6.269859868421054

Using mae criterion

In [26]:
# Reference run: scikit-learn regressor splitting on the 'mae' criterion.
# The test score is still reported as mean squared error so runs are comparable.
rfc_reg = RandomForestRegressor(
    n_estimators=10,
    criterion='mae',
    random_state=17,
)
pred_reg = rfc_reg.fit(X_train, y_train).predict(X_test)
mean_squared_error(y_test, pred_reg)

6.821703782894737

###  Using Random Forest Implementation

**Note:** Although it is generally recommended to use 'max_features = total_features/3' for a regression random forest, sklearn uses all the features, so we also tried with all the features.

In [27]:
rfc_reg = RandomForest(n_estimators=10, criterion='mse', random_seed=17)
rfc_reg.fit(X_train, y_train)
pred_reg = rfc_reg.predict(X_test)
mean_squared_error(y_test, pred_reg)

7.292013651315789

In [28]:
rfc_reg = RandomForest(n_estimators=10, criterion='mae', random_seed=17)
rfc_reg.fit(X_train, y_train)
pred_reg = rfc_reg.predict(X_test)
mean_squared_error(y_test, pred_reg)

9.246545394736847

## Gridsearch

In [29]:
rfc_reg = RandomForest(n_estimators=10, random_seed=17, criterion='mse')

In [30]:
# Candidate forest sizes and split criteria for the regressor; every
# combination is scored by negated MSE under 5-fold cross-validation.
rf_params = dict(
    n_estimators=[10, 20, 40, 50],
    criterion=['mse', 'mae'],
)
rf_grid = GridSearchCV(
    estimator=rfc_reg,
    param_grid=rf_params,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)

In [31]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 10.6min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForest(criterion='mse', max_depth=inf,
                                    max_features=0, min_samples_split=2,
                                    n_estimators=10, random_seed=17),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['mse', 'mae'],
                         'n_estimators': [10, 20, 40, 50]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=1)

In [32]:
rf_grid.best_params_, rf_grid.best_score_

({'criterion': 'mse', 'n_estimators': 10}, -15.625966314889336)

In [33]:
preds = rf_grid.predict(X_test)

In [34]:
mean_squared_error(y_test, preds)

7.292013651315789

# Using pandas with another dataset.

#### Problem

Predict the presence or absence of cardiovascular disease (CVD) using the patient examination results.

#### Data description

There are 3 types of input features:

- *Objective*: factual information;
- *Examination*: results of medical examination;
- *Subjective*: information given by the patient.

| Feature | Variable Type | Variable      | Value Type |
|---------|--------------|---------------|------------|
| Age | Objective Feature | age | int (days) |
| Height | Objective Feature | height | int (cm) |
| Weight | Objective Feature | weight | float (kg) |
| Gender | Objective Feature | gender | categorical code |
| Systolic blood pressure | Examination Feature | ap_hi | int |
| Diastolic blood pressure | Examination Feature | ap_lo | int |
| Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
| Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
| Smoking | Subjective Feature | smoke | binary |
| Alcohol intake | Subjective Feature | alco | binary |
| Physical activity | Subjective Feature | active | binary |
| Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

All of the dataset values were collected at the moment of medical examination.

In [35]:
import pandas as pd
import numpy as np

In [36]:
df = pd.read_csv('../DecisionTree/train.csv', index_col='id', sep=';')

In [37]:
df['age_in_years'] = np.floor(df['age']/365.25)

In [38]:
labels = df['cardio']

In [39]:
# Shift the gender code down by one. Plain Series arithmetic is vectorized,
# so no per-element .apply(lambda ...) is needed.
# NOTE: this cell overwrites its own input; re-running it without reloading
# `df` subtracts one again.
df['gender'] = df['gender'] - 1

In [40]:
# One 0/1 indicator column per age band. Each band is closed on the left and
# open on the right (low <= age < high). The comparisons are evaluated on the
# whole column at once instead of calling a Python lambda per row; missing
# ages compare as False and therefore map to 0, exactly as with the lambda.
df['Age_40-50'] = (df['age_in_years'].ge(40) & df['age_in_years'].lt(50)).astype('int64')
df['Age_50-55'] = (df['age_in_years'].ge(50) & df['age_in_years'].lt(55)).astype('int64')
df['Age_55-60'] = (df['age_in_years'].ge(55) & df['age_in_years'].lt(60)).astype('int64')
df['Age_60-65'] = (df['age_in_years'].ge(60) & df['age_in_years'].lt(65)).astype('int64')

In [41]:
# One 0/1 indicator column per systolic-pressure (ap_hi) band. Each band is
# closed on the left and open on the right (low <= ap_hi < high). The
# comparisons run on the whole column at once rather than through a
# per-element Python lambda.
df['aphi_120-140'] = (df['ap_hi'].ge(120) & df['ap_hi'].lt(140)).astype('int64')
df['aphi_140-160'] = (df['ap_hi'].ge(140) & df['ap_hi'].lt(160)).astype('int64')
df['aphi_160-180'] = (df['ap_hi'].ge(160) & df['ap_hi'].lt(180)).astype('int64')

In [42]:
df =pd.get_dummies(df, prefix=['cholesterol'], columns=['cholesterol'])

In [43]:
def f(a, b):
    """Return the body-mass index for height ``a`` (cm) and weight ``b`` (kg).

    The height is converted from centimetres to metres before it is squared,
    so the result is ``weight / height_in_metres ** 2``.
    """
    height_m = a / 100
    return b / height_m ** 2

In [44]:
# Body-mass index from height (cm) and weight (kg). `f` uses only arithmetic
# operators, so it can be applied to the two columns directly; this avoids a
# row-by-row df.apply(..., axis=1), which builds a Series object for every row.
df['bmi'] = f(df['height'], df['weight'])

In [45]:
# Remove the target column together with the raw columns that are dropped
# from the model inputs, leaving only the engineered features.
df = df.drop(
    columns=[
        'gluc', 'ap_lo', 'alco', 'age', 'cardio',
        'age_in_years', 'ap_hi', 'height', 'weight',
    ]
)

In [46]:
df['bmi'] = np.floor(df['bmi'])

In [47]:
df.head()

Unnamed: 0_level_0,gender,smoke,active,Age_40-50,Age_50-55,Age_55-60,Age_60-65,aphi_120-140,aphi_140-160,aphi_160-180,cholesterol_1,cholesterol_2,cholesterol_3,bmi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1,0,1,0,1,0,0,0,0,0,1,0,0,21.0
1,0,0,1,0,0,1,0,0,1,0,0,0,1,34.0
2,0,0,0,0,1,0,0,1,0,0,0,0,1,23.0
3,1,0,1,1,0,0,0,0,1,0,1,0,0,28.0
4,0,0,0,1,0,0,0,0,0,0,1,0,0,23.0


In [48]:
X_train, X_valid, y_train, y_valid = train_test_split(df, labels, test_size=0.3, random_state=17)

In [73]:
rf_pd = RandomForest(n_estimators=10, random_seed=17)

In [74]:
rf_pd.fit(X_train, y_train)

In [75]:
rf_pred = rf_pd.predict(X_valid)

In [76]:
accuracy_score(y_valid, rf_pred)

0.7000952380952381