In [17]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import random
import copy
from sklearn.model_selection import RandomizedSearchCV
import pprint

In [7]:
df = pd.read_csv("success_encoded_data.csv")
success_df = df[df['target_variable'] == 1]
failure_df = df[df['target_variable'] == 0].sample(n = success_df.shape[0], replace = False)
balanced_df = pd.concat([success_df,failure_df])
balanced_df = balanced_df.sample(frac=1).reset_index(drop=True)
balanced_df["target_variable"] = balanced_df["target_variable"].astype(int)
balanced_df.drop('company_name', axis=1, inplace=True)
y = balanced_df['target_variable']
balanced_df.drop('target_variable', axis=1, inplace=True)
balanced_df.drop(['acquired', 'closed','ipo','operating'], axis=1,inplace=True)
X = copy.deepcopy(balanced_df)

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
# random_grid = {'n_estimators': n_estimators,
              #  'max_features': max_features,
              #  'max_depth': max_depth,
              #  'min_samples_split': min_samples_split,
              #  'min_samples_leaf': min_samples_leaf,
              #  'bootstrap': bootstrap}
random_grid = {
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = 1)
# Fit the random search model
rf_random.fit(X, y)

{'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5; total time=  12.3s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5; total time=  11.5s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5; total time=  11.8s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2; total time=   4.5s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2; total time=   4.5s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2; total time=   4.4s
[CV] END bootstrap=True, m

In [35]:
print(rf_random.best_params_)

{'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 80, 'bootstrap': True}


In [27]:
print("Random Search Predict")
accuracies = []
precisions = []
recalls = []
rs = random.randint(100,200)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
y_pred = pd.Series(rf_random.predict(X_test))
y_test = y_test.reset_index(drop=True)
z = pd.concat([y_test, y_pred], axis=1)
z.columns = ['True', 'Prediction']
z.head()
acc = metrics.accuracy_score(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred)
rec = metrics.recall_score(y_test, y_pred)
accuracies.append(acc)
precisions.append(prec)
recalls.append(rec)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print(accuracies)
print(precisions)
print(recalls)

Random Search Predict
Accuracy: 0.7506132461161079
Precision: 0.760705289672544
Recall: 0.7359870024370431
[0.7506132461161079]
[0.760705289672544]
[0.7359870024370431]


In [32]:

############ Starting loops ############
accuracies = []
precisions = []
recalls = []
rs = random.randint(100,200)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
model = RandomForestClassifier(n_estimators=2000, 
                             bootstrap = True,
                             max_features = 'sqrt',verbose=0,max_depth=50)


In [34]:

model.fit(X_train, y_train)
print("done 2")
y_pred = pd.Series(model.predict(X_test))
y_test = y_test.reset_index(drop=True)
z = pd.concat([y_test, y_pred], axis=1)
z.columns = ['True', 'Prediction']
z.head()
acc = metrics.accuracy_score(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred)
rec = metrics.recall_score(y_test, y_pred)
accuracies.append(acc)
precisions.append(prec)
recalls.append(rec)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print(accuracies)
print(precisions)
print(recalls)
print(metrics.balanced_accuracy_score(y_test, y_pred))

done 2
Accuracy: 0.7005314799672936
Precision: 0.7069249793899423
Recall: 0.6946132037262049
[0.7021668029435814, 0.7005314799672936]
[0.7090909090909091, 0.7069249793899423]
[0.6950182260024301, 0.6946132037262049]
0.70058765840458


In [15]:
accuracies = []
precisions = []
recalls = []
for i in range(5):
  print("Iteration : ",i)
  X = copy.deepcopy(balanced_df)
  rs = random.randint(100,200)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
  # model = LogisticRegressionCV(cv=5,random_state=0,verbose=0)
  model = RandomForestClassifier(n_estimators=100, 
                             bootstrap = True,
                             max_features = 'sqrt',verbose=0,max_depth=60)
  model.fit(X_train, y_train)
  print("done 2")
  y_pred = pd.Series(model.predict(X_test))
  y_test = y_test.reset_index(drop=True)
  z = pd.concat([y_test, y_pred], axis=1)
  z.columns = ['True', 'Prediction']
  z.head()
  acc = metrics.accuracy_score(y_test, y_pred)
  prec = metrics.precision_score(y_test, y_pred)
  rec = metrics.recall_score(y_test, y_pred)
  accuracies.append(acc)
  precisions.append(prec)
  recalls.append(rec)
  print("Accuracy:", acc)
  print("Precision:", prec)
  print("Recall:", rec)
  print(classification_report(y_test, y_pred))
  print(metrics.balanced_accuracy_score(y_test, y_pred))
print(accuracies)
print(precisions)
print(recalls)


Iteration :  0
done 2
Accuracy: 0.6937857726901063
Precision: 0.6980033277870217
Recall: 0.6848979591836735
              precision    recall  f1-score   support

           0       0.69      0.70      0.70      2442
           1       0.70      0.68      0.69      2450

    accuracy                           0.69      4892
   macro avg       0.69      0.69      0.69      4892
weighted avg       0.69      0.69      0.69      4892

0.6938003309431882
Iteration :  1
done 2
Accuracy: 0.6997138184791496
Precision: 0.6877022653721683
Recall: 0.7092198581560284
              precision    recall  f1-score   support

           0       0.71      0.69      0.70      2495
           1       0.69      0.71      0.70      2397

    accuracy                           0.70      4892
   macro avg       0.70      0.70      0.70      4892
weighted avg       0.70      0.70      0.70      4892

0.6999005102403388
Iteration :  2
done 2
Accuracy: 0.7064595257563369
Precision: 0.7130977130977131
Recall: 0.6