In [None]:
# Source: https://github.com/WillKoehrsen/Machine-Learning-Projects/blob/master/Random%20Forest%20Tutorial.ipynb

# Import statements
import itertools

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

# Single seed reused by the train/test split, both random forests and the
# randomized search so that repeated runs produce the same results.
RSEED = 50

In [None]:
# Load the dataset (point the filename at your own CSV) and keep only the
# numeric columns, in one chained expression.
df = pd.read_csv('ft_data.csv').select_dtypes(include='number')

# Show how many rows fall into each class of the label column.
# NOTE(review): assumes 'isFinTech' has a numeric dtype; otherwise the filter
# above would have dropped it -- confirm against the CSV.
df['isFinTech'].value_counts()

In [None]:
# Optional: If the data is not clean

# Names of the columns to discard (e.g. columns that contain missing values).
# The list is empty by default, which makes the drop below a no-op, so the
# notebook still runs top to bottom before any names are filled in.
columns_to_drop = []

# Remove the listed columns (returns a new frame)
df = df.drop(columns=columns_to_drop)

In [None]:
# Extract the labels without mutating `df`, so this cell can be re-run safely.
# (Popping the column in place made a second run fail with a KeyError because
# 'isFinTech' was already gone.) Note that `df` itself keeps the label column;
# use `features` when only the predictors are needed.
labels = np.array(df['isFinTech'])
features = df.drop(columns=['isFinTech'])

# 30% examples in test data; stratify so both splits keep the class balance
train, test, train_labels, test_labels = train_test_split(features, labels,
                                                          stratify=labels,
                                                          test_size=0.3,
                                                          random_state=RSEED)

In [None]:
# Random forest of 100 trees, seeded with RSEED, considering sqrt(n_features)
# candidate features per split, with parallel jobs enabled and progress logged.
model = RandomForestClassifier(
    n_estimators=100,
    max_features='sqrt',
    random_state=RSEED,
    n_jobs=-1,
    verbose=1,
)

# Train on the training split (the fitted estimator is this cell's output)
model.fit(train, train_labels)

In [None]:
# Results
# Predicted classes plus column 1 of predict_proba (the second class) for the
# training split...
train_rf_predictions, train_rf_probs = (
    model.predict(train),
    model.predict_proba(train)[:, 1],
)

# ...and the same two outputs for the held-out test split.
rf_predictions, rf_probs = (
    model.predict(test),
    model.predict_proba(test)[:, 1],
)

In [None]:
"""Compare machine learning model to baseline performance.
    Computes statistics and shows ROC curve."""


def evaluate_model(predictions, probs, train_predictions, train_probs):
    """Compare the model against a naive baseline and plot ROC curves.

    Prints recall, precision and ROC AUC for three cases -- the baseline, the
    test split and the training split -- each rounded to two decimals, then
    draws the baseline and model ROC curves on a new 8x6 figure.

    The true labels are not passed in: the function reads the notebook-level
    ``test_labels`` and ``train_labels``, so the cell that creates them must
    have been run first.

    Parameters
    ----------
    predictions : array-like
        Predicted class labels for the test split.
    probs : array-like
        Scores for the test split, used for the ROC AUC and the ROC curve.
    train_predictions : array-like
        Predicted class labels for the training split.
    train_probs : array-like
        Scores for the training split, used for the training ROC AUC.

    Returns
    -------
    None
        Output is the printed table and the matplotlib figure.
    """
    # Naive baseline: predict class 1 for every test example. Built once and
    # reused for recall, precision and the baseline ROC curve.
    all_ones = [1] * len(test_labels)

    baseline = {
        'recall': recall_score(test_labels, all_ones),
        'precision': precision_score(test_labels, all_ones),
        'roc': 0.5,  # fixed reference value rather than a computed score
    }

    results = {
        'recall': recall_score(test_labels, predictions),
        'precision': precision_score(test_labels, predictions),
        'roc': roc_auc_score(test_labels, probs),
    }

    train_results = {
        'recall': recall_score(train_labels, train_predictions),
        'precision': precision_score(train_labels, train_predictions),
        'roc': roc_auc_score(train_labels, train_probs),
    }

    for metric in ['recall', 'precision', 'roc']:
        print(f'{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}')

    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(test_labels, all_ones)
    model_fpr, model_tpr, _ = roc_curve(test_labels, probs)

    plt.figure(figsize=(8, 6))
    # NOTE: rcParams is global, so this font size also applies to figures
    # created after this call.
    plt.rcParams['font.size'] = 16

    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label='baseline')
    plt.plot(model_fpr, model_tpr, 'r', label='model')
    plt.legend()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')

In [None]:
# ROC curve: print baseline/test/train metrics and plot the curves
evaluate_model(
    predictions=rf_predictions,
    probs=rf_probs,
    train_predictions=train_rf_predictions,
    train_probs=train_rf_probs,
)

In [None]:
"""
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
"""


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges):
    """Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix; rows are plotted as the true label and
        columns as the predicted label.
    classes : sequence of str
        Tick labels, one per class, in the same order as the rows/columns.
    normalize : bool, default False
        When True each row is divided by its row total, so cells show
        per-true-class fractions (formatted with two decimals) instead of
        raw counts.
    title : str, default 'Confusion matrix'
        Title drawn above the plot.
    cmap : matplotlib colormap, default ``plt.cm.Oranges``
        Colormap used for the heat map.

    Returns
    -------
    None
        Output is the printed matrix and a new 10x10 matplotlib figure.
    """
    if normalize:
        # Divide every row by its own sum (the column vector broadcasts)
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.figure(figsize=(10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size=24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size=14)
    plt.yticks(tick_marks, classes, size=14)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.

    # Labeling the plot: text takes (x, y) = (column, row)
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize=20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Switch the grid off explicitly. grid(None) *toggles* visibility, which
    # turns grid lines ON under styles where the grid is off by default.
    plt.grid(False)
    plt.tight_layout()
    plt.ylabel('True label', size=18)
    plt.xlabel('Predicted label', size=18)

In [None]:
# Confusion Matrix for the test-split predictions of the first model
cm = confusion_matrix(y_true=test_labels, y_pred=rf_predictions)
plot_confusion_matrix(
    cm,
    title='FinTech Confusion Matrix',
    classes=['Not FinTech', 'Is FinTech'],
)

In [None]:
# Hyperparameter tuning

# Hyperparameter grid: each entry lists the candidate values the random
# search samples from.
param_grid = {
    'n_estimators': np.linspace(10, 200).astype(int),
    'max_depth': [None] + list(np.linspace(3, 20).astype(int)),
    # 'auto' is intentionally not listed: for classifiers it was an alias of
    # 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3, and
    # candidates that sampled it would fail to fit on current versions.
    'max_features': ['sqrt', None] + list(np.arange(0.5, 1, 0.1)),
    # 500 evenly spaced points truncated to int, so the integers from 10 to 50
    # appear with repeats
    'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]
}

# Estimator for use in random search
estimator = RandomForestClassifier(random_state = RSEED)

# Create the random search model: 10 sampled candidates, each scored by
# ROC AUC with 5-fold cross-validation
rs = RandomizedSearchCV(estimator, param_grid, n_jobs = -1,
                        scoring = 'roc_auc', cv = 5,
                        n_iter = 10, verbose = 1, random_state=RSEED)

# Fit
rs.fit(train, train_labels)

In [None]:
# Show the hyperparameter combination that the randomized search ranked best
# (the search `rs` was configured to score candidates by ROC AUC).
rs.best_params_

In [None]:
# Keep a handle on the estimator selected by the randomized search; the
# following cells evaluate it the same way as the first model.
best_model = rs.best_estimator_

In [None]:
# Predict with the best model (no training happens in this cell).
# Classes plus column 1 of predict_proba (the second class), training split:
train_rf_predictions, train_rf_probs = (
    best_model.predict(train),
    best_model.predict_proba(train)[:, 1],
)

# Same two outputs for the held-out test split:
rf_predictions, rf_probs = (
    best_model.predict(test),
    best_model.predict_proba(test)[:, 1],
)

In [None]:
# ROC curve of best model, with baseline/test/train metrics printed first
evaluate_model(
    predictions=rf_predictions,
    probs=rf_probs,
    train_predictions=train_rf_predictions,
    train_probs=train_rf_probs,
)

In [None]:
# Confusion Matrix of best model, computed on the test split
cm = confusion_matrix(y_true=test_labels, y_pred=rf_predictions)
plot_confusion_matrix(
    cm,
    title='FinTech Confusion Matrix',
    classes=['Not FinTech', 'Is FinTech'],
)

In [None]:
# Feature Selection (Find most important features)

# Pair every training column with the importance the best model assigned to
# it, then rank from most to least important.
fi = (
    pd.DataFrame({
        'feature': list(train.columns),
        'importance': best_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)

# Display
fi