### Dataset 1

In [9]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedShuffleSplit 
from sklearn import preprocessing, metrics
from sklearn.metrics import roc_curve, auc
import timeit
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')

In [10]:
## Reading the file
def read_file(trainF,testF, Directory):
    """Load one train/test split from CSV files.

    Parameters
    ----------
    trainF : str
        File name of the training CSV inside ``Directory``.
    testF : str
        File name of the test CSV inside ``Directory``.
    Directory : str
        Folder holding both files, e.g. ``"./Data Set 1/splits/"``.
        A trailing path separator is optional.

    Returns
    -------
    tuple
        ``(train, test)`` as returned by ``pd.read_csv`` for each file.
    """
    # Local import keeps this cell self-contained.
    import os

    # os.path.join inserts the separator only when it is missing, so both
    # "splits/" and "splits" resolve to the same file.
    train = pd.read_csv(os.path.join(Directory, trainF))
    test = pd.read_csv(os.path.join(Directory, testF))
    return train, test

In [11]:
## SVM classifier
def svm_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    """Fit an SVM on ``train`` and evaluate it on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the label column ``Target_col``; every other
        column is used as a feature.
    accuracy, roc_auc : list
        Running result lists. One value is appended to each; the lists are
        modified in place and also returned.
    Target_col : str
        Name of the label column.
    pos_label : optional
        Class treated as positive for the ROC curve. When ``None`` the
        probability of the second fitted class is scored and ``roc_curve``
        applies its own default positive label.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, elapsed)``; ``elapsed`` is the wall-clock time
        of this call in minutes.

    Raises
    ------
    ValueError
        If ``pos_label`` is given but is not one of the fitted classes.
    """
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    # NOTE(review): features are passed to SVC unscaled; confirm that is intended.
    # probability=True relies on a randomised internal calibration, so the
    # estimator is seeded (same seed as tree_classifier) for repeatable runs.
    clf = svm.SVC(probability=True, random_state=0)
    clf.fit(X,y)
    # predict_proba columns are ordered like clf.classes_, so the score fed
    # to roc_curve must come from the column that belongs to pos_label.
    if pos_label is None:
        pos_index = 1
    else:
        fitted_classes = list(clf.classes_)
        if pos_label not in fitted_classes:
            raise ValueError('pos_label %r is not among the fitted classes %r'
                             % (pos_label, fitted_classes))
        pos_index = fitted_classes.index(pos_label)
    y_pred = clf.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = clf.predict_proba(test_X)
    # Compute the ROC curve and the area under it
    false_positive_rate, true_positive_rate, thresholds = roc_curve(
        test_labels, probas_[:, pos_index], pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    # Seconds -> minutes
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [12]:
## RF classifier
def RF_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    """Fit a 10-tree random forest on ``train`` and evaluate it on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the label column ``Target_col``; every other
        column is used as a feature.
    accuracy, roc_auc : list
        Running result lists. One value is appended to each; the lists are
        modified in place and also returned.
    Target_col : str
        Name of the label column.
    pos_label : optional
        Class treated as positive for the ROC curve. When ``None`` the
        probability of the second fitted class is scored and ``roc_curve``
        applies its own default positive label.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, elapsed)``; ``elapsed`` is the wall-clock time
        of this call in minutes.

    Raises
    ------
    ValueError
        If ``pos_label`` is given but is not one of the fitted classes.
    """
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    # Seeded (same seed as tree_classifier) so repeated runs give the same forest.
    RF = RandomForestClassifier(n_estimators=10, random_state=0)
    RF.fit(X,y)
    # predict_proba columns are ordered like RF.classes_, so the score fed
    # to roc_curve must come from the column that belongs to pos_label.
    if pos_label is None:
        pos_index = 1
    else:
        fitted_classes = list(RF.classes_)
        if pos_label not in fitted_classes:
            raise ValueError('pos_label %r is not among the fitted classes %r'
                             % (pos_label, fitted_classes))
        pos_index = fitted_classes.index(pos_label)
    y_pred = RF.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = RF.predict_proba(test_X)
    # Compute the ROC curve and the area under it
    false_positive_rate, true_positive_rate, thresholds = roc_curve(
        test_labels, probas_[:, pos_index], pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    # Seconds -> minutes
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [13]:
## Logistic Regression
def log_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    """Fit a logistic regression on ``train`` and evaluate it on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the label column ``Target_col``; every other
        column is used as a feature.
    accuracy, roc_auc : list
        Running result lists. One value is appended to each; the lists are
        modified in place and also returned.
    Target_col : str
        Name of the label column.
    pos_label : optional
        Class treated as positive for the ROC curve. When ``None`` the
        probability of the second fitted class is scored and ``roc_curve``
        applies its own default positive label.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, elapsed)``; ``elapsed`` is the wall-clock time
        of this call in minutes.

    Raises
    ------
    ValueError
        If ``pos_label`` is given but is not one of the fitted classes.
    """
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    logreg = linear_model.LogisticRegression()
    logreg.fit(X,y)
    # predict_proba columns are ordered like logreg.classes_, so the score
    # fed to roc_curve must come from the column that belongs to pos_label.
    if pos_label is None:
        pos_index = 1
    else:
        fitted_classes = list(logreg.classes_)
        if pos_label not in fitted_classes:
            raise ValueError('pos_label %r is not among the fitted classes %r'
                             % (pos_label, fitted_classes))
        pos_index = fitted_classes.index(pos_label)
    y_pred = logreg.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = logreg.predict_proba(test_X)
    # Compute the ROC curve and the area under it
    false_positive_rate, true_positive_rate, thresholds = roc_curve(
        test_labels, probas_[:, pos_index], pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    # Seconds -> minutes
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [14]:
## Decision Tree
def tree_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    """Fit a decision tree on ``train`` and evaluate it on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the label column ``Target_col``; every other
        column is used as a feature.
    accuracy, roc_auc : list
        Running result lists. One value is appended to each; the lists are
        modified in place and also returned.
    Target_col : str
        Name of the label column.
    pos_label : optional
        Class treated as positive for the ROC curve. When ``None`` the
        probability of the second fitted class is scored and ``roc_curve``
        applies its own default positive label.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, elapsed)``; ``elapsed`` is the wall-clock time
        of this call in minutes.

    Raises
    ------
    ValueError
        If ``pos_label`` is given but is not one of the fitted classes.
    """
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X,y)
    # predict_proba columns are ordered like tree.classes_, so the score fed
    # to roc_curve must come from the column that belongs to pos_label.
    if pos_label is None:
        pos_index = 1
    else:
        fitted_classes = list(tree.classes_)
        if pos_label not in fitted_classes:
            raise ValueError('pos_label %r is not among the fitted classes %r'
                             % (pos_label, fitted_classes))
        pos_index = fitted_classes.index(pos_label)
    y_pred = tree.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = tree.predict_proba(test_X)
    # Compute the ROC curve and the area under it
    false_positive_rate, true_positive_rate, thresholds = roc_curve(
        test_labels, probas_[:, pos_index], pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    # Seconds -> minutes
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [15]:
def _print_model_report(banner, accuracy_label, label, accuracy, elapsed, roc_auc):
    """Print the per-split values and the mean/std summary for one classifier."""
    accuracy = np.array(accuracy)
    elapsed = np.array(elapsed)
    roc_auc = np.array(roc_auc)
    print(banner)
    print('Individual file accuracy for ' + accuracy_label)
    print(accuracy)
    print('Individual time taken for ' + label)
    print(elapsed)
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(accuracy.mean(), accuracy.std())
    print('Individual file AUC for ' + label)
    print(roc_auc)
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(roc_auc.mean(), roc_auc.std())
    print()


def model_build(filenum,Target_column, df_train, df_test, Directory,pos_label=None, n_splits=10):
    """Run the four classifiers over every train/test split and print a report.

    For each split index ``i`` in ``1..n_splits`` the files
    ``df_train + str(i) + '.csv'`` and ``df_test + str(i) + '.csv'`` are
    loaded through ``read_file`` and passed, in this order, to
    ``svm_classifier``, ``RF_classifier``, ``log_classifier`` and
    ``tree_classifier``.

    Parameters
    ----------
    filenum : int
        Data set number; only used in the printed header.
    Target_column : str
        Name of the label column in the split files.
    df_train, df_test : str
        File-name prefixes of the train and test splits.
    Directory : str
        Folder that holds the split files.
    pos_label : optional
        Positive class forwarded to every classifier function.
    n_splits : int, optional
        Number of numbered splits to read (default 10, the value that was
        previously hard-coded).

    Returns
    -------
    None
        Results are printed: per-split accuracy, elapsed time (as returned
        by the classifier functions) and AUC, plus mean and standard
        deviation of accuracy and AUC for each classifier.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError('n_splits must be at least 1, got %r' % (n_splits,))

    # (banner, label in the accuracy header, label in the time/AUC headers,
    #  classifier function). The two labels differ only for the decision
    # tree, whose original headers mix "Tree" and "tree".
    models = [
        ('********** SVM classifier ***********', 'SVM', 'SVM', svm_classifier),
        ('********** RF classifier ************', 'RF', 'RF', RF_classifier),
        ('********** Logistic regression ******', 'log', 'log', log_classifier),
        ('****** Decision Tree classifier *****', 'Tree', 'tree', tree_classifier),
    ]
    results = [{'accuracy': [], 'roc_auc': [], 'elapsed': []} for _ in models]

    for i in range(1, n_splits + 1):
        trainF = df_train + str(i) + '.csv'
        testF = df_test + str(i) + '.csv'
        train, test = read_file(trainF, testF, Directory)
        for (_, _, _, classifier), scores in zip(models, results):
            scores['accuracy'], scores['roc_auc'], elapsed = classifier(
                train, test, scores['accuracy'], scores['roc_auc'],
                Target_column, pos_label)
            scores['elapsed'].append(elapsed)

    print('Data set# ' + str(filenum))
    for (banner, accuracy_label, label, _), scores in zip(models, results):
        _print_model_report(banner, accuracy_label, label,
                            scores['accuracy'], scores['elapsed'], scores['roc_auc'])

In [16]:
# Evaluate all four classifiers on the splits of data set 1.
model_build(
    filenum=1,
    Target_column='Occupancy',
    df_train='d1_train',
    df_test='d1_test',
    Directory="./Data Set 1/splits/",
)

Data set# 1
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.8736428   0.8814401   0.87593867  0.86993769  0.88263208  0.87268232
  0.87398061  0.88678665  0.88199721  0.87618751]
Individual time taken for SVM
[ 0.29292304  0.32001171  0.3437535   0.29957787  0.34701644  0.32826098
  0.31966322  0.30593459  0.31304262  0.35576448]
Accuracy mean   Accuracy Stdev  
0.877522564983 0.00510046226089
Individual file AUC for SVM
[ 0.99547189  0.99488438  0.99553171  0.99626505  0.99611767  0.9963229
  0.99604615  0.99555787  0.99604224  0.99549293]
AUC mean        AUC      Stdev  
0.99577327906 0.00043285710704

********** RF classifier ************
Individual file accuracy for RF
[ 0.99260425  0.99472377  0.99389862  0.99361371  0.99351471  0.99521014
  0.99461456  0.99400092  0.99488293  0.9936147 ]
Individual time taken for RF
[ 0.00212322  0.00144653  0.00189488  0.00143485  0.00145849  0.00220889
  0.00160531  0.0014357   0.00148468  0.00172791]
Accuracy mean   

In [17]:
# Evaluate all four classifiers on the splits of data set 3 (positive class: 2).
model_build(
    filenum=3,
    Target_column='Class',
    df_train='d3_train',
    df_test='d3_test',
    Directory="./Data Set 3/splits/",
    pos_label=2,
)

Data set# 3
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.65435127  0.65266598  0.64939317  0.65668568  0.6514536   0.65125684
  0.64670517  0.64893702  0.65464289  0.65504061]
Individual time taken for SVM
[ 1.6998123   1.51761068  1.64018898  1.63628808  1.71953106  1.69860938
  1.68542427  1.94118177  1.71652121  1.95350786]
Accuracy mean   Accuracy Stdev  
0.652113220864 0.00298484648335
Individual file AUC for SVM
[ 0.62705539  0.61968957  0.62179789  0.62347767  0.64276567  0.62284399
  0.62641565  0.629252    0.63938789  0.62076662]
AUC mean        AUC      Stdev  
0.62734523341 0.00745170324

********** RF classifier ************
Individual file accuracy for RF
[ 0.8625      0.85598151  0.85691472  0.85852345  0.85527678  0.85909127
  0.85602405  0.86089657  0.85811405  0.8606466 ]
Individual time taken for RF
[ 0.01270434  0.01192294  0.01528391  0.01412157  0.0106735   0.01248976
  0.01200571  0.01787449  0.01131527  0.01322512]
Accuracy mean   Ac

In [18]:
# Evaluate all four classifiers on the splits of data set 5.
model_build(
    filenum=5,
    Target_column='y',
    df_train='d5_train',
    df_test='d5_test',
    Directory="./Data Set 5/splits/",
    pos_label=None,
)

Data set# 5
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.88200946  0.88379154  0.88233526  0.88332886  0.88207405  0.8854829
  0.88086147  0.88217422  0.88371859  0.88466821]
Individual time taken for SVM
[ 26.60743453  27.99805965  21.86088418  25.85922955  18.96124999
   9.81869742   8.24934053  11.24936642  14.88807869  10.01632473]
Accuracy mean   Accuracy Stdev  
0.883044455408 0.00133096376778
Individual file AUC for SVM
[ 0.64840417  0.63838293  0.64315401  0.64980309  0.64059433  0.64661025
  0.64341282  0.6451638   0.65032611  0.64345783]
AUC mean        AUC      Stdev  
0.644930932766 0.0037026777384

********** RF classifier ************
Individual file accuracy for RF
[ 0.89613745  0.8990391   0.89769881  0.89793794  0.89833395  0.89963858
  0.89620177  0.8978999   0.8992854   0.90021103]
Individual time taken for RF
[ 0.01463502  0.01453132  0.01486045  0.01334024  0.00553228  0.00553339
  0.00500691  0.00628879  0.0061493   0.00527114]
Accura

In [19]:
# Evaluate all four classifiers on the splits of data set 9.
model_build(
    filenum=9,
    Target_column='Income level',
    df_train='d9_train',
    df_test='d9_test',
    Directory="./Data Set 9/splits/",
    pos_label=None,
)

Data set# 9
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.81138371  0.81308733  0.81314879  0.81264717  0.81048567  0.81048567
  0.80903491  0.81437599  0.81320708  0.80897935]
Individual time taken for SVM
[ 0.61258469  0.6592093   0.60637513  0.74763967  0.77706164  0.70739482
  0.67781428  0.72216309  0.72685723  0.68023342]
Accuracy mean   Accuracy Stdev  
0.81168356612 0.00178617472735
Individual file AUC for SVM
[ 0.80556115  0.8050462   0.8011181   0.80566333  0.80115736  0.80115715
  0.79518382  0.8161446   0.81046508  0.80113771]
AUC mean        AUC      Stdev  
0.804263448802 0.00552519942799

********** RF classifier ************
Individual file accuracy for RF
[ 0.80534838  0.81476015  0.80726644  0.81259812  0.80617852  0.80885192
  0.80419478  0.80731635  0.80986793  0.80893007]
Individual time taken for RF
[ 0.00283786  0.00299946  0.00436031  0.00338842  0.00347152  0.0033285
  0.00440038  0.00365684  0.00378482  0.00351458]
Accuracy mean   

In [20]:
# Evaluate all four classifiers on the splits of data set 11 (positive class: 'AML').
model_build(
    filenum=11,
    Target_column='C7130',
    df_train='amlall_train',
    df_test='amlall_test',
    Directory="./data11_amlalll/",
    pos_label='AML',
)

Data set# 11
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.77272727  0.65384615  0.73529412  0.58333333  0.6         0.56521739
  0.68        0.55172414  0.62962963  0.62962963]
Individual time taken for SVM
[ 0.00370181  0.00255842  0.0019647   0.00241227  0.00255957  0.0023429
  0.00225786  0.00205327  0.00286584  0.00213713]
Accuracy mean   Accuracy Stdev  
0.640140166605 0.0684837819889
Individual file AUC for SVM
[ 0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5]
AUC mean        AUC      Stdev  
0.5 0.0

********** RF classifier ************
Individual file accuracy for RF
[ 0.86363636  0.84615385  0.91176471  0.79166667  0.9         0.95652174
  0.8         0.86206897  0.85185185  0.74074074]
Individual time taken for RF
[ 0.0004146   0.00046441  0.00101202  0.00046765  0.0004487   0.00040576
  0.00039437  0.00041237  0.0010299   0.00039276]
Accuracy mean   Accuracy Stdev  
0.852440487958 0.0598267466052
Individual file AUC for RF
[ 0.94705882  0.947

In [21]:
# Evaluate all four classifiers on the splits of data set 13 (positive class: 'Class1').
model_build(
    filenum=13,
    Target_column='C7130',
    df_train='central_train',
    df_test='central_test',
    Directory="./data13_central/",
    pos_label='Class1',
)

Data set# 13
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.76470588  0.80952381  0.65217391  0.61538462  0.58333333  0.47619048
  0.66666667  0.35294118  0.72222222  0.4375    ]
Individual time taken for SVM
[ 0.00184036  0.00163716  0.00156921  0.00211882  0.00151224  0.00174115
  0.00213966  0.00179447  0.00178706  0.00221335]
Accuracy mean   Accuracy Stdev  
0.608064209519 0.139917957238
Individual file AUC for SVM
[ 0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5]
AUC mean        AUC      Stdev  
0.5 0.0

********** RF classifier ************
Individual file accuracy for RF
[ 0.58823529  0.61904762  0.52173913  0.53846154  0.58333333  0.52380952
  0.58333333  0.29411765  0.77777778  0.5       ]
Individual time taken for RF
[ 0.00052604  0.00052254  0.00047519  0.00058135  0.0004707   0.00047889
  0.00050162  0.00078963  0.00048923  0.00056918]
Accuracy mean   Accuracy Stdev  
0.552985519737 0.113946851135
Individual file AUC for RF
[ 0.60576923  0.6470

In [25]:
# Evaluate all four classifiers on the splits of data set 16.
model_build(
    filenum=16,
    Target_column='TARGET',
    df_train='d16_original_train',
    df_test='d16_original_test',
    Directory="./Original Dataset_16_Prostrate/",
    pos_label=None,
)

Data set# 16
********** SVM classifier ***********
Individual file accuracy for SVM
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
Individual time taken for SVM
[ 0.00221821  0.00204537  0.00211864  0.0023119   0.00219617  0.00235398
  0.00207669  0.00238881  0.00316864  0.0022302 ]
Accuracy mean   Accuracy Stdev  
1.0 0.0
Individual file AUC for SVM
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
AUC mean        AUC      Stdev  
1.0 0.0

********** RF classifier ************
Individual file accuracy for RF
[ 0.76470588  0.71428571  0.79411765  0.92857143  0.82857143  0.8
  0.84210526  0.69387755  0.86486486  0.73684211]
Individual time taken for RF
[ 0.00055629  0.0005309   0.00050978  0.00051811  0.00051181  0.00053485
  0.00052556  0.00119328  0.0009652   0.00052738]
Accuracy mean   Accuracy Stdev  
0.796794188515 0.0686313192496
Individual file AUC for RF
[ 0.89583333  0.81610577  0.91145833  0.98717949  0.87666667  0.87333333
  0.91111111  0.91555184  0.89285714  0.83096591]
AUC mean        