# Initial Configuration

In [174]:
import os
from IPython import display
import yfinance as yf
import datetime as dt
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
from pandas import read_csv
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

# Import the data from the CSV into a dataframe

In [175]:
df = pd.read_csv("e1_positive.csv")
df

Unnamed: 0,GABRG2,CELF4,SRRM4,SLC1A3,ATP1A3,RBFOX3,GABRA4,NHSL1,GRAMD3,SEZ6L2,...,FERMT1,CSPG4,GJA1,LAMA1,YAP1,LINC00639.2,SMOC1,LINC00498,GFRA1,Label
0,35.038262,161.176004,68.074337,58.063405,20.021864,269.294069,188.205520,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,1.001093,0.0,0.0,1
1,95.324867,75.256474,87.297510,0.000000,18.061554,342.166102,683.328784,0.000000,0.0,1.003420,...,0.0,0.0,0.0,0.0,0.0,0.000000,1.003420,0.0,0.0,1
2,220.143867,187.976727,42.219372,106.553653,0.000000,187.976727,299.556496,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,7.036562,0.000000,0.0,0.0,1
3,166.010840,26.159284,61.373704,0.000000,30.183789,254.549955,446.720079,0.000000,0.0,25.153158,...,0.0,0.0,0.0,0.0,0.0,0.000000,1.006126,0.0,0.0,1
4,188.426220,71.160966,119.269788,57.129226,16.036274,265.600789,287.650666,24.054411,0.0,1.002267,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,665.606813,101.002551,4.040102,0.000000,95.952424,10.100255,79.792016,0.000000,0.0,192.914873,...,0.0,0.0,0.0,0.0,0.0,0.000000,1.010026,0.0,0.0,0
867,21.157688,45.337903,18.135161,1.007509,290.162582,14.105125,0.000000,3.022527,0.0,64.480574,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0
868,4.111466,81.201453,1704.202638,2.055733,9.250798,75.034254,238.465025,1.027866,0.0,1.027866,...,0.0,0.0,0.0,0.0,0.0,1.027866,0.000000,0.0,0.0,0
869,37.480576,438.624037,0.000000,0.000000,69.896209,3.038966,66.857244,0.000000,0.0,130.675522,...,0.0,0.0,0.0,0.0,0.0,0.000000,252.234146,0.0,0.0,0


In [176]:
# Dataset Characteristics

In [177]:
print('Number of 0 labels: ', len(df[df.Label==0]))
print('Number of 1 labels: ', len(df[df.Label==1]))

Number of 0 labels:  572
Number of 1 labels:  299


# splitting data into features and labels
### Dropping the labels for our feature matrix

In [178]:
y = df.iloc[:, x.shape[1]].values
x = df.drop(['Label'], axis=1)

# Split the data into trainning set and testing set
### Supervised Learning

In [179]:
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    #random_state=0
    shuffle=True
)

# Creating the Random Forest Classifier

In [180]:
model = RandomForestClassifier (
    n_estimators=500,
    max_features=6,
    bootstrap=True,
    random_state=42,
    #max_samples=100,
    #min_samples_split=2,
    #min_samples_leaf=1,
    #max_depth=10,
)

# Hyperparameter tuning

The hyperparameters in the random forest model are either used to increase the predictive power of the model or to make the model faster. 

# Cross Validation

| n_estimators      | number of trees in the forest 
|-------------------|:------------------------------
| max_depth         | maximum depth in a tree
| min_samples_split | minimum number of samples to allow a split in an internal node
| min_samples_leaf  | specifies the minimum number of samples required to be at a leaf node
| bootstrap         |Bootstrap=True (default): samples are drawn with replacement Bootstrap=False : samples are drawn without replacement
| random_state  | generated random numbers for the random forest.

In [181]:
grid_ranges = {
    'n_estimators': [500, 1000],
    'max_features': np.arange(1, 15, 1),
    'bootstrap': [True],
    #'max_samples': [100] # will take n samples
    #'max_depth': np.arange(1, 15, 1),
    #'min_samples_split': [2, 10, 9],
    #'min_samples_leaf': np.arange(1, 15, 2, dtype=int),
    #'random_state': [1, 2, 30, 42],
}
gscv = GridSearchCV(
    estimator=model,
    param_grid=grid_ranges,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
gscv_fit = gscv.fit(x_train, y_train)
best_parameters = gscv_fit.best_params_
print(best_parameters)

Fitting 3 folds for each of 28 candidates, totalling 84 fits
{'bootstrap': True, 'max_features': 5, 'n_estimators': 500}


# Recreate the Random Forest Model with the best found hyperparameters

In [182]:
model = RandomForestClassifier(
    n_estimators=best_parameters['n_estimators'],
    bootstrap=best_parameters['bootstrap'],
    #max_depth=best_parameters['max_depth'],
    #max_samples=100
)
model.fit(x_train, y_train)

# Now running the model on the test set with threshold/cutoff of 0.5%

In [183]:
predict = model.predict(x_test)

# Confusion Matrix with threshold of 0.5%

In [184]:
M = confusion_matrix(y_test, predict)
print('true negatives, false negatives')
M

true negatives, false negatives


array([[143,   1],
       [  0,  74]])

# Setting threshold to 0.7% yields better results?

In [185]:
threshold = 0.7
y_pred_threshold = (model.predict_proba(x_test)[:, 1] > threshold).astype('float')
confusion_matrix(y_test, y_pred_threshold)

array([[144,   0],
       [  0,  74]])

# Metrics

In [186]:
ac = accuracy_score(predict, y_test)
print('Accuracy is:', ac)

Accuracy is: 0.9954128440366973
