In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from time import ctime
from time import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
import numpy as np
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Load the dataset. Every field is read as text so that the label column can
# stay a string array while the feature columns are converted to float below.
data = np.loadtxt('covtype.data.csv', delimiter=',', dtype='str')
training_data = data[:, 0:54]   # columns 0..53 are used as features
labels = data[:, 54]            # column 54 is used as the class label

# Min-max normalisation of every feature column to the range [0, 1].
# NOTE(review): the min/max statistics are taken over all rows, i.e. also over
# rows that later end up in a test fold.
T = training_data.astype(float)
max_arrayT = np.max(T, axis=0)
min_arrayT = np.min(T, axis=0)
range_arrayT = max_arrayT - min_arrayT
# A constant column has max == min; dividing by that zero range would turn
# the whole column into NaN (0/0). Dividing by 1 instead maps it to 0.0 and
# leaves every non-constant column exactly as before.
range_arrayT[range_arrayT == 0] = 1.0
T = (T - min_arrayT) / range_arrayT

# Number of cross-validation folds. The per-fold score arrays and the splitter
# below are all sized from this single value so they cannot drift apart
# (a mismatch would either overflow the arrays or leave unused zero entries
# that drag the final averages down).
n_folds = 10

# Per-fold scores, one slot per fold; averaged over all folds after the loop.
accuracyArr = np.zeros(n_folds)
precisionArr = np.zeros(n_folds)
recallArr = np.zeros(n_folds)
f1Arr = np.zeros(n_folds)

# Accumulated runtime of all folds (differences of time() values, in seconds).
operaTime = 0
# Index of the fold currently being processed.
k = 0

# Random forest classifier: 20 trees, sqrt(n_features) candidate features per
# split, fixed seed for repeatable runs, n_jobs=-1 to use all processors.
clf = RandomForestClassifier(max_features='sqrt', n_estimators=20, random_state=0, n_jobs=-1)

# Stratified k-fold splitter producing n_folds train/test partitions.
# NOTE(review): shuffle is left at its default (False), so folds follow the
# row order of the input file. If that order is not arbitrary, consider
# shuffle=True together with a fixed random_state -- confirm before changing,
# since it alters every reported score.
skf = StratifiedKFold(n_splits=n_folds)

# Run the cross-validation: fit on each training partition, evaluate on the
# matching test partition, and record the scores and elapsed time per fold.
for train, test in skf.split(T, labels):
    startTime = time()
    print('start ', k, 'fold at ', ctime(startTime))

    # Bind the held-out rows and their true labels once; they are reused by
    # every prediction and metric call below.
    X_test = T[test]
    y_test = labels[test]

    # Fit the forest on this fold's training rows.
    clf.fit(T[train], labels[train])

    # Hard class predictions and per-class probabilities for the test rows.
    # NOTE(review): predicted_prob is not read anywhere in this cell.
    predicted = clf.predict(X_test)
    predicted_prob = clf.predict_proba(X_test)

    # Overall accuracy plus per-class precision / recall / F-score / support.
    accuracy = accuracy_score(y_test, predicted)
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, predicted)

    # Show the confusion matrix and the per-class report for this fold.
    print(confusion_matrix(y_test, predicted))
    print(classification_report(y_test, predicted))

    # Store this fold's scores. Precision, recall and F-score are reduced to
    # a single number by an unweighted mean over the classes.
    accuracyArr[k] = accuracy
    precisionArr[k] = np.mean(precision)
    recallArr[k] = np.mean(recall)
    f1Arr[k] = np.mean(fscore)

    endTime = time()
    print('end ', k, 'fold at', ctime(endTime))
    print('\n')

    # Add this fold's duration to the running total and move to the next slot.
    interval = endTime - startTime
    operaTime += interval
    k += 1

# Report each metric averaged over all folds, then the total runtime.
print('-------Average Score---------')
for title, per_fold_scores in (
    ('accuracy: ', accuracyArr),
    ('precision: ', precisionArr),
    ('recall: ', recallArr),
    ('fscore: ', f1Arr),
):
    print(title, np.average(per_fold_scores))
print('operaTime:', operaTime)



start  0 fold at  Tue May 29 01:41:21 2018
[[ 8865 12261     0     0    52     0     6]
 [ 1075 24618     4     0  2631     3     0]
 [    0   131  3094    10     2   339     0]
 [    0     0    94   171     0    10     0]
 [   25   167     0     0   754     4     0]
 [    1    64   118    10     1  1543     0]
 [  102     9     0     0     0     0  1940]]
             precision    recall  f1-score   support

          1       0.88      0.42      0.57     21184
          2       0.66      0.87      0.75     28331
          3       0.93      0.87      0.90      3576
          4       0.90      0.62      0.73       275
          5       0.22      0.79      0.34       950
          6       0.81      0.89      0.85      1737
          7       1.00      0.95      0.97      2051

avg / total       0.77      0.71      0.70     58104

end  0 fold at Tue May 29 01:41:37 2018
start  1 fold at  Tue May 29 01:41:37 2018
[[ 7066 14065     0     0    13     0    40]
 [ 6701 21546     0     0    83  