In [1]:
'''
Using scikit-learn library to generate random forest evaluator
w/ custom k-fold (bagged samples) cross-validation
'''

#import libraries
import sklearn.ensemble as ensemble
import sklearn.datasets as datset
import pandas as pd
import numpy as np

#load up CSV into dataframe
dataMatrix = pd.read_csv('../../brafsplit2.csv', index_col=0)
dataMatrix.loc[dataMatrix['class'] == 2.0, 'class'] = -1.0

#iris = datset.load_iris()
#make a pandas dataframe w/ (sample,feature) format
#dataMatrix = pd.DataFrame(iris.data, columns=iris.feature_names)
#view top 5 rows
len(dataMatrix.index)

32

In [2]:
#perform a k-fold split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from math import sqrt
#from sklearn.model_selection import cross_val_predict

array = dataMatrix.values

numcols = len(array[0]) - 1

x = array[:,0:numcols]
y = array[:,numcols]

#set params for RF
#seed = 7
numtrees = 1000
numfeats = len(dataMatrix.columns) - 1
numsamps = len(dataMatrix)
maxfeats = int(sqrt(numfeats))+1 #randomly select features of size sqrt(num features)

kf = KFold(n_splits=10)
#kf.get_n_splits(x)

y

array([-1., -1., -1.,  1., -1., -1., -1.,  1.,  1., -1., -1., -1.,  1.,
        1.,  1., -1.,  1., -1., -1.,  1.,  1., -1.,  1., -1., -1., -1.,
        1.,  1.,  1.,  1.,  1.,  1.])

In [3]:
import random

#make our own custom cross validation w 2d matrix
indices = []

#get random sampling of training indices...
for trainIndex, testIndex in kf.split(x):
    newTrainIndex = []
    for j in range(0,len(trainIndex)):
        chosenCol = random.randint(0,len(trainIndex)-1) #pick random trainIndex index
        newTrainIndex.append(trainIndex[chosenCol])
    trainIndex = newTrainIndex
    myTuple = (trainIndex,testIndex)
    indices.append(myTuple)
    #print("TRAIN:", trainIndex, "TEST:", testIndex)
    
#note, this is before bagging
#notice how in the testing sets, each sample is seen exactly once
indices

[([24,
   28,
   31,
   19,
   31,
   31,
   11,
   10,
   17,
   29,
   30,
   16,
   29,
   19,
   10,
   17,
   30,
   13,
   11,
   17,
   27,
   27,
   7,
   11,
   18,
   8,
   15,
   6],
  array([0, 1, 2, 3])),
 ([11,
   20,
   31,
   18,
   14,
   16,
   14,
   19,
   14,
   26,
   19,
   20,
   20,
   10,
   21,
   31,
   17,
   14,
   17,
   3,
   21,
   17,
   2,
   10,
   14,
   24,
   3,
   21],
  array([4, 5, 6, 7])),
 ([26,
   11,
   22,
   18,
   7,
   26,
   13,
   2,
   19,
   2,
   17,
   16,
   2,
   24,
   27,
   5,
   17,
   7,
   7,
   17,
   25,
   26,
   19,
   28,
   12,
   6,
   16,
   18,
   4],
  array([ 8,  9, 10])),
 ([28,
   31,
   0,
   1,
   23,
   26,
   2,
   18,
   21,
   24,
   3,
   29,
   0,
   2,
   1,
   0,
   15,
   27,
   3,
   25,
   7,
   23,
   28,
   25,
   26,
   17,
   26,
   3,
   15],
  array([11, 12, 13])),
 ([20,
   22,
   5,
   27,
   17,
   4,
   31,
   4,
   21,
   8,
   8,
   25,
   10,
   3,
   13,
   28,
   0,
   7,
   24,
   

In [4]:
from sklearn.datasets import make_classification

#need to fit the model, so describe the classification task first
J,k = make_classification(
    n_samples = numsamps,
    n_features = numfeats,
    n_redundant = 0,
    n_repeated = 0,
    n_classes = 2,
    random_state = 0,
    shuffle = False
    )

#create/train our model
RFmodel = ensemble.RandomForestClassifier(n_estimators = numtrees, max_features = maxfeats, criterion = 'gini', n_jobs = -1)

RFmodel.fit(J,k)

importances = RFmodel.feature_importances_

results = cross_val_score(RFmodel, x, y, cv = indices)

for z in range (1, len(results)+1):
    print("Accuracy of fold " + str(z) + ": " + str(results[z-1]))

print("\n\nAvg accuracy across folds: " + str(results.mean()))

Accuracy of fold 1: 0.25
Accuracy of fold 2: 0.0
Accuracy of fold 3: 0.666666666667
Accuracy of fold 4: 0.666666666667
Accuracy of fold 5: 0.666666666667
Accuracy of fold 6: 1.0
Accuracy of fold 7: 0.666666666667
Accuracy of fold 8: 0.666666666667
Accuracy of fold 9: 1.0
Accuracy of fold 10: 0.0


Avg accuracy across folds: 0.558333333333


In [5]:
#let's see what our top features are...
#taken from http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
std = np.std([tree.feature_importances_ for tree in RFmodel.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(J.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, dataMatrix.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1. feature GRB14_GRCh37_2:165365288-165365296_In-Frame-Del-DEL-TTTTTTTTT---- (0.145093)
2. feature ACVR2A_GRCh37_2:148657041-148657048_Frame-Shift-Del-DEL-AAAAAAAA---- (0.030030)
3. feature PIK3CA_GRCh37_3:178952085-178952085_Missense-Mutation-SNP-A-G-G (0.029626)
4. feature MSH6_GRCh37_2:48030640-48030641_Frame-Shift-Ins-INS-----C (0.026363)
5. feature CRB1_GRCh37_1:197390428-197390428_Silent-SNP-C-T-T (0.026104)
6. feature RP9_GRCh37_7:33136124-33136124_Nonsense-Mutation-SNP-G-A-A (0.024774)
7. feature LZTS2_GRCh37_10:102762593-102762593_Frame-Shift-Del-DEL-C---- (0.023170)
8. feature USP11_GRCh37_X:47100031-47100031_Missense-Mutation-SNP-G-A-A (0.023078)
9. feature CRTC1_GRCh37_19:18887993-18887993_Frame-Shift-Del-DEL-C---- (0.021183)
10. feature PPP1R12B_GRCh37_1:202407190-202407190_Intron-DEL-T---- (0.020343)
11. feature APC_GRCh37_5:112173704-112173704_Nonsense-Mutation-SNP-C-T-T (0.020065)
12. feature UGT2A1_GRCh37_4:70513073-70513073_Frame-Shift-Del-DEL-T---- (

In [6]:
import matplotlib.pyplot as plt
#Plot the feature importances of the forest
#taken from http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
plt.figure(figsize=(20,10))
'''
plt.title("Feature importances")
plt.bar(range(J.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(J.shape[1]), indices)
plt.xlim([-1, J.shape[1]])
plt.show()


plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()
'''

'\nplt.title("Feature importances")\nplt.bar(range(J.shape[1]), importances[indices],\n       color="r", yerr=std[indices], align="center")\nplt.xticks(range(J.shape[1]), indices)\nplt.xlim([-1, J.shape[1]])\nplt.show()\n\n\nplt.title(\'Feature Importances\')\nplt.barh(range(len(indices)), importances[indices], color=\'b\', align=\'center\')\nplt.yticks(range(len(indices)), features[indices])\nplt.xlabel(\'Relative Importance\')\nplt.show()\n'

In [7]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y,predicted)


NameError: name 'predicted' is not defined