Theory : https://towardsdatascience.com/understanding-random-forest-58381e0602d2

Examples : https://towardsdatascience.com/an-implementation-and-explanation-of-the-random-forest-in-python-77bf308a9b76

# Forests of randomized trees

In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve

import matplotlib.pyplot as plt

from sklearn.utils import resample

### Set random seed to ensure reproducible runs

In [51]:
RSEED = 50

### Importing data and data cleaning

In [89]:
df_main = pd.read_csv('/home/colili/Documents/PhD/project_john/DataFrame_produced/df_main.csv', index_col=0)
df_main.columns

Index(['Name', 'surge_reduced', 'thick_w0', 'thick_w1', 'thick_w2', 'thick_w3',
       'thick_w4', 'thick_w5', 'thick_w6', 'thick_w7',
       ...
       'Z_mean2', 'Z_mean3', 'Z_mean4', 'Z_mean5', 'Z_mean6', 'Z_mean7',
       'Z_mean8', 'Z_mean9', 'Z_mean10', 'Z_mean11'],
      dtype='object', length=134)

In [53]:
df_main.drop(df_main[df_main['Surge']==9].index, inplace = True)
df_main.drop(df_main[df_main['Surge']==1].index, inplace = True)
df_main.drop(df_main[df_main['Surge']==2].index, inplace = True)
df = df_main.drop(columns = ['rgiid', 'Name', 'x', 'y', 'ELA', 'beta'])
df.dropna(how='any', inplace=True)

In [6]:
df

Unnamed: 0,Surge,Slope,bed,thickness,surface_elevation,cmb,width_centerline
0,0,16.7,401.83295,4.167050,406.00000,0.004318,86.307950
1,0,16.7,401.83295,4.167050,406.00000,0.004320,52.001931
2,0,16.7,401.83295,4.167050,406.00000,0.004321,63.412109
3,0,16.7,401.83295,4.167050,406.00000,0.004323,74.822291
4,0,16.7,401.83295,4.167050,406.00000,0.004324,112.223246
...,...,...,...,...,...,...,...
130984,3,11.7,237.64468,50.380424,288.02512,0.004612,278.553621
130985,3,11.7,233.07228,46.717087,279.78937,0.004614,256.755923
130986,3,11.7,233.07228,46.717087,279.78937,0.004618,238.477978
130987,3,11.7,233.18828,47.159560,280.34784,0.004620,226.342325


In [7]:
df_main.groupby('Surge').count()

Unnamed: 0_level_0,rgiid,x,y,Slope,Name,bed,thickness,surface_elevation,ELA,beta,cmb,width_centerline
Surge,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,104245,104245,104245,104245,60242,99044,99044,99044,104079,104079,104079,104097
3,11431,11431,11431,11431,11350,11082,11082,11082,11421,11421,11421,11429


In [54]:
df['Surge'] = df['Surge'].replace({3: 1})
df = df.loc[df['Surge'].isin([0, 1])].copy()
df = df.rename(columns = {'Surge': 'label'})
df['label'].value_counts()

0    98777
1    11073
Name: label, dtype: int64

In [9]:
df['label']

0         0
1         0
2         0
3         0
4         0
         ..
130984    1
130985    1
130986    1
130987    1
130988    1
Name: label, Length: 109850, dtype: int64

In [55]:
# Separate majority and minority classes
df_majority = df[df['label']==0]
df_minority = df[df['label']==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=11073,     # to match minority class
                                 random_state=RSEED) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_downsampled['label'].value_counts()
df = df_downsampled

In [56]:
df_left = pd.concat([df_majority,df_majority_downsampled]).drop_duplicates(keep=False)
df_left.columns

Index(['label', 'Slope', 'bed', 'thickness', 'surface_elevation', 'cmb',
       'width_centerline'],
      dtype='object')

In [57]:
test_left_y = df_left['label']
test_left_x = df_left.drop(columns='label')
test_left_x

Unnamed: 0,Slope,bed,thickness,surface_elevation,cmb,width_centerline
0,16.7,401.83295,4.167050,406.00000,0.004318,86.307950
1,16.7,401.83295,4.167050,406.00000,0.004320,52.001931
3,16.7,401.83295,4.167050,406.00000,0.004323,74.822291
4,16.7,401.83295,4.167050,406.00000,0.004324,112.223246
6,16.7,366.41766,8.582350,375.00000,0.004328,203.181864
...,...,...,...,...,...,...
130865,10.6,232.02019,15.078909,247.09909,0.004145,554.550270
130867,10.6,232.02019,15.078909,247.09909,0.004180,521.463923
130868,10.6,218.39961,19.980356,238.37997,0.004197,498.860288
130869,10.6,218.39961,19.980356,238.37997,0.004218,474.359352


In [43]:
test_left_y

0         0
1         0
3         0
4         0
6         0
         ..
130865    0
130867    0
130868    0
130869    0
130871    0
Name: label, Length: 87704, dtype: int64

In [58]:
df['label'].value_counts()

1    11073
0    11073
Name: label, dtype: int64

In [10]:
df.columns

Index(['label', 'Slope', 'bed', 'thickness', 'surface_elevation', 'cmb',
       'width_centerline'],
      dtype='object')

In [59]:
# Extract the labels
labels = np.array(df.pop('label'))

# 30% examples in test data
train, test, train_labels, test_labels = train_test_split(df, labels, 
                                                          stratify = labels,
                                                          test_size = 0.3, 
                                                          random_state = RSEED)

In [61]:
train.shape

(15502, 6)

In [34]:
test.shape

(6644, 6)

In [62]:
test_left_x.shape

(87704, 6)

In [22]:
# Features for feature importances
features = list(train.columns)
features

['Slope', 'bed', 'thickness', 'surface_elevation', 'cmb', 'width_centerline']

In [17]:
train.shape

(15502, 6)

In [18]:
train_labels.shape

(15502,)

In [19]:
test.shape

(6644, 6)

### Evaluate the decision tree

In [65]:
def evaluate_model(predictions, probs, train_predictions, train_probs, test_labels):
    """Compare machine learning model to baseline performance.
    Computes statistics and shows ROC curve."""
    
    baseline = {}
    
    baseline['recall'] = recall_score(test_labels, [1 for _ in range(len(test_labels))])
    baseline['precision'] = precision_score(test_labels, [1 for _ in range(len(test_labels))])
    baseline['roc'] = 0.5
    
    results = {}
    
    results['recall'] = recall_score(test_labels, predictions)
    results['precision'] = precision_score(test_labels, predictions)
    results['roc'] = roc_auc_score(test_labels, probs)
    
    train_results = {}
    train_results['recall'] = recall_score(train_labels, train_predictions)
    train_results['precision'] = precision_score(train_labels, train_predictions)
    train_results['roc'] = roc_auc_score(train_labels, train_probs)
    
    for metric in ['recall', 'precision', 'roc']:
        print(f'{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}')
    
    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(test_labels, [1 for _ in range(len(test_labels))])
    model_fpr, model_tpr, _ = roc_curve(test_labels, probs)

    plt.figure(figsize = (8, 6))
    plt.rcParams['font.size'] = 16
    
    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend();
    plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate'); plt.title('ROC Curves');

### Random forest

In [22]:
df.columns

Index(['Slope', 'bed', 'thickness', 'surface_elevation', 'cmb',
       'width_centerline'],
      dtype='object')

In [66]:
from sklearn.ensemble import RandomForestClassifier

# Create the model with 100 trees
model = RandomForestClassifier(n_estimators=1000, 
                               random_state=RSEED, 
                               max_features = 'sqrt',
                               max_depth = 2,
                               n_jobs=-1, verbose = 1)

# Fit on training data
model.fit(train, train_labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.7s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=50, verbose=1,
                       warm_start=False)

In [25]:
n_nodes = []
max_depths = []

for ind_tree in model.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 7
Average maximum depth 2


In [28]:
test.shape

(6644, 6)

In [67]:
train_rf_predictions = model.predict(train)
train_rf_probs = model.predict_proba(train)[:, 1]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.4s finished


In [68]:
rf_predictions = model.predict(test_left_x)
rf_probs = model.predict_proba(test_left_x)[:, 1]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    1.7s finished


In [75]:
1-rf_predictions.sum()/rf_predictions.shape

array([0.77104807])

# Gradient boosting

In [79]:
import xgboost
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

In [80]:
dmatrix_train = xgboost.DMatrix(train, label=train_labels)
dmatrix_test = xgboost.DMatrix(test_left_x, label=test_left_y)

In [81]:
params = {'objective':'reg:logistic', 'num_boost_round':20000, 'max_depth':1, 'scale_pos_weight':0.3}

xgb = xgboost.train(params, dmatrix_train, num_boost_round=20000)

Parameters: { num_boost_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [82]:
predictions_xgboost = xgb.predict(dmatrix_test)
pd.DataFrame({'true':test_left_y, 'predictions':predictions_xgboost})
# pd.DataFrame({'true':y_test, 'predictions':predictions}).to_csv('/home/colili/Documents/PhD/project_john/DataFrame_produced/df_data_roc.csv')

Unnamed: 0,true,predictions
0,0,0.001066
1,0,0.001066
3,0,0.001066
4,0,0.001556
6,0,0.001595
...,...,...
130865,0,0.081075
130867,0,0.063867
130868,0,0.048131
130869,0,0.053363


In [84]:
1-predictions_xgboost.sum()/predictions_xgboost.shape

array([0.87653114])

Theory: https://towardsdatascience.com/introduction-to-logistic-regression-66248243c148

Example: https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

# Logistic regression

In [85]:
from sklearn.linear_model import LogisticRegression

In [86]:
logreg = LogisticRegression()
logreg.fit(train, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [87]:
predictions_LR = logreg.predict(test_left_x)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(test_left_x, test_left_y)))

Accuracy of logistic regression classifier on test set: 0.73


In [88]:
1-predictions_LR.sum()/predictions_LR.shape

array([0.73496078])