## Classifier Setups (scikit-learn Library)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import tree

In [None]:
def run_model(clf, features):
    """Fit ``clf`` on a random split of ``features`` and evaluate it.

    Args:
        clf: Classifier exposing ``fit``, ``predict``, ``predict_proba`` and
            ``score`` (scikit-learn estimator interface).
        features: Table with an 'is_expert' column holding the class labels;
            all remaining columns are used as predictors.
            NOTE(review): presumably a pandas DataFrame (``.drop`` / ``.values``
            are used) -- confirm against callers.

    Returns:
        dict with the keys 'Model_Score', 'Predictions', 'Actual',
        'Prediction_Probabilities', 'Total_Tested', 'Num_Experts_Predicted',
        'Num_Experts_Actual' and 'Num_Experts_Training'.
    """
    # Split the table that was passed in; the previous code split the global
    # ``X`` here, so the ``features`` argument was silently ignored.
    train, test = train_test_split(features)

    # Separate the labels from the predictor columns for both partitions.
    train_labels = train['is_expert'].values
    test_labels = test['is_expert'].values
    train = train.drop('is_expert', axis=1)
    test = test.drop('is_expert', axis=1)

    clf.fit(train, train_labels)
    predictions = clf.predict(test)

    # The 'Num_Experts_*' counts are plain sums of the label values.
    # NOTE(review): this assumes experts are encoded as 1 and others as 0.
    return {
        'Model_Score': clf.score(test, test_labels),
        'Predictions': predictions,
        'Actual': test_labels,
        'Prediction_Probabilities': clf.predict_proba(test),
        'Total_Tested': len(predictions),
        'Num_Experts_Predicted': sum(predictions),
        'Num_Experts_Actual': sum(test_labels),
        'Num_Experts_Training': sum(train_labels),
    }

def bootstrap_model(clf, features, n):
    """Run ``run_model`` ``n`` times and collect the results.

    Args:
        clf: Classifier passed through to ``run_model`` on every run.
        features: Feature table passed through to ``run_model`` on every run.
        n: Number of runs to perform.

    Returns:
        list of the ``n`` result dictionaries returned by ``run_model``, in
        run order. Empty when ``n`` is zero or negative.
    """
    # range(n) gives exactly n runs; the previous range(1, n) produced n - 1.
    return [run_model(clf, features) for _ in range(n)]

def boot_statistics(models):
    """Summarise a list of per-run result dictionaries.

    Args:
        models: List of dictionaries, each providing the keys 'Model_Score',
            'Num_Experts_Predicted', 'Num_Experts_Actual' and 'Total_Tested'.

    Returns:
        dict with:
            'Mean_Wrong_Predictions': rounded value of
                (1 - mean score) * 'Total_Tested' of the first run.
            'Mean_Model_Score': mean of 'Model_Score' over all runs.
            'Expert_Prediction_Percentage': per-run list of
                predicted-experts / actual-experts ratios.
    """
    mean_score = np.mean([run['Model_Score'] for run in models])

    expert_ratios = []
    for run in models:
        expert_ratios.append(
            run['Num_Experts_Predicted'] / run['Num_Experts_Actual']
        )

    # The test-set size is read from the first run only.
    tested = models[0]['Total_Tested']
    mean_wrong = (1.0 - mean_score) * tested

    return {
        'Mean_Wrong_Predictions': round(mean_wrong),
        'Mean_Model_Score': mean_score,
        'Expert_Prediction_Percentage': expert_ratios,
    }

def graph_from_statistics(stats):
    """Stub: ignores ``stats`` and always returns 0."""
    placeholder_result = 0
    return placeholder_result

## Run a random forest classifier

In [None]:
# Random forest with trees capped at depth 3.
clf_RF = RandomForestClassifier(max_depth=3)

# One fit/evaluate pass per feature table, shown in the cell output.
for feature_table in (X, X2):
    display(run_model(clf_RF, feature_table))

# Repeated runs per feature table, kept for the statistics step.
RF_bootstrap, RF2_bootstrap = [
    bootstrap_model(clf_RF, feature_table, 10) for feature_table in (X, X2)
]

## Run a Gaussian naive Bayes classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
clf_NB = GaussianNB()

# One fit/evaluate pass per feature table, shown in the cell output.
for feature_table in (X, X2):
    display(run_model(clf_NB, feature_table))

# Repeated runs per feature table, kept for the statistics step.
NB_bootstrap, NB2_bootstrap = [
    bootstrap_model(clf_NB, feature_table, 10) for feature_table in (X, X2)
]

## Run a decision tree classifier

In [None]:
# Decision tree with default settings.
clf_DT = tree.DecisionTreeClassifier()

# One fit/evaluate pass per feature table, shown in the cell output.
for feature_table in (X, X2):
    display(run_model(clf_DT, feature_table))

# Repeated runs per feature table, kept for the statistics step.
DT_bootstrap, DT2_bootstrap = [
    bootstrap_model(clf_DT, feature_table, 10) for feature_table in (X, X2)
]

## Statistics

In [None]:
# Summaries of the repeated runs on the first feature table (X).
DT_stats = boot_statistics(DT_bootstrap)
NB_stats = boot_statistics(NB_bootstrap)
RF_stats = boot_statistics(RF_bootstrap)

# Summaries of the repeated runs on the second feature table (X2).
DT2_stats = boot_statistics(DT2_bootstrap)
NB2_stats = boot_statistics(NB2_bootstrap)
RF2_stats = boot_statistics(RF2_bootstrap)

# Show each classifier's X and X2 summaries together.
display(DT_stats, DT2_stats)
# Pair naive Bayes with its own X2 summary; this line previously displayed
# DT2_stats by mistake.
display(NB_stats, NB2_stats)
display(RF_stats, RF2_stats)
# Bar heights: mean model score on the first feature table, in the order
# decision tree, naive Bayes, random forest.
y = [DT_stats['Mean_Model_Score'],NB_stats['Mean_Model_Score'],RF_stats['Mean_Model_Score']]
N = len(y)
# Bars are positioned at 0, 1, ..., N-1.
x = range(N)
# Bar width of 2/3, which leaves a gap between bars placed one unit apart.
width = 1/1.5
# NOTE(review): `plt` is not imported in the visible cells -- presumably
# matplotlib.pyplot imported earlier in the notebook; confirm.
plt.bar(x, y, width)

# Keep a handle on the current figure.
fig = plt.gcf()