### Dataset 1

In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedShuffleSplit 
from sklearn import preprocessing, metrics
from sklearn.metrics import roc_curve, auc
import timeit
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')

In [2]:
## Reading the file
def read_file(trainF,testF, Directory):
    """Load one train/test pair of CSV files from ``Directory``.

    Parameters
    ----------
    trainF, testF : str
        File names of the training and test CSV files.
    Directory : str
        Folder that holds both files. A trailing path separator is optional.

    Returns
    -------
    tuple
        ``(train, test)`` as returned by ``pd.read_csv`` for each file.
    """
    # Local import keeps this cell self-contained; os.path.join inserts the
    # separator only when Directory does not already end with one.
    import os

    train = pd.read_csv(os.path.join(Directory, trainF))
    test = pd.read_csv(os.path.join(Directory, testF))
    return train, test

In [3]:
## SVM classifier
def svm_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    """Fit an SVM on one training split and score it on the matching test split.

    Parameters
    ----------
    train, test : DataFrame-like
        Tables holding the feature columns plus the ``Target_col`` column.
    accuracy, roc_auc : list
        Running result lists. This split's test accuracy and ROC AUC are
        appended to them in place.
    Target_col : str
        Name of the label column.
    pos_label : optional
        Label treated as the positive class for the ROC curve. When None,
        the second probability column is scored and ``roc_curve`` applies its
        own default positive label.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, elapsed)`` where ``elapsed`` is the wall-clock
        duration of this call in minutes.

    Raises
    ------
    ValueError
        If ``pos_label`` is given but is not one of the classes seen in training.
    """
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    clf = svm.SVC(probability=True)
    clf.fit(X,y)
    y_pred = clf.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = clf.predict_proba(test_X)
    # predict_proba columns are ordered like clf.classes_, so score the column
    # that belongs to pos_label instead of always taking column 1 (which would
    # invert the ROC whenever pos_label is the first class).
    if pos_label is None:
        pos_col = 1
    else:
        classes = list(clf.classes_)
        if pos_label not in classes:
            raise ValueError('pos_label %r is not among the trained classes %r' % (pos_label, classes))
        pos_col = classes.index(pos_label)
    # Compute the ROC curve and the area under the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, pos_col],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    # Elapsed time is reported in minutes.
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [4]:
## RF classifier
def RF_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    """Fit a 10-tree random forest on one training split and score the test split.

    Parameters
    ----------
    train, test : DataFrame-like
        Tables holding the feature columns plus the ``Target_col`` column.
    accuracy, roc_auc : list
        Running result lists. This split's test accuracy and ROC AUC are
        appended to them in place.
    Target_col : str
        Name of the label column.
    pos_label : optional
        Label treated as the positive class for the ROC curve. When None,
        the second probability column is scored and ``roc_curve`` applies its
        own default positive label.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, elapsed)`` where ``elapsed`` is the wall-clock
        duration of this call in minutes.

    Raises
    ------
    ValueError
        If ``pos_label`` is given but is not one of the classes seen in training.
    """
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    # NOTE(review): no random_state is passed, so results presumably vary
    # between runs; consider seeding if reproducibility matters.
    RF = RandomForestClassifier(n_estimators=10)
    RF.fit(X,y)
    y_pred = RF.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = RF.predict_proba(test_X)
    # predict_proba columns are ordered like RF.classes_, so score the column
    # that belongs to pos_label instead of always taking column 1 (which would
    # invert the ROC whenever pos_label is the first class).
    if pos_label is None:
        pos_col = 1
    else:
        classes = list(RF.classes_)
        if pos_label not in classes:
            raise ValueError('pos_label %r is not among the trained classes %r' % (pos_label, classes))
        pos_col = classes.index(pos_label)
    # Compute the ROC curve and the area under the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, pos_col],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    # Elapsed time is reported in minutes.
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [5]:
## Logistic Regression
def log_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    """Fit a logistic regression on one training split and score the test split.

    Parameters
    ----------
    train, test : DataFrame-like
        Tables holding the feature columns plus the ``Target_col`` column.
    accuracy, roc_auc : list
        Running result lists. This split's test accuracy and ROC AUC are
        appended to them in place.
    Target_col : str
        Name of the label column.
    pos_label : optional
        Label treated as the positive class for the ROC curve. When None,
        the second probability column is scored and ``roc_curve`` applies its
        own default positive label.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, elapsed)`` where ``elapsed`` is the wall-clock
        duration of this call in minutes.

    Raises
    ------
    ValueError
        If ``pos_label`` is given but is not one of the classes seen in training.
    """
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    logreg = linear_model.LogisticRegression()
    logreg.fit(X,y)
    y_pred = logreg.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = logreg.predict_proba(test_X)
    # predict_proba columns are ordered like logreg.classes_, so score the
    # column that belongs to pos_label instead of always taking column 1
    # (which would invert the ROC whenever pos_label is the first class).
    if pos_label is None:
        pos_col = 1
    else:
        classes = list(logreg.classes_)
        if pos_label not in classes:
            raise ValueError('pos_label %r is not among the trained classes %r' % (pos_label, classes))
        pos_col = classes.index(pos_label)
    # Compute the ROC curve and the area under the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, pos_col],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    # Elapsed time is reported in minutes.
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [6]:
## Decision Tree
def tree_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    """Fit a decision tree on one training split and score the test split.

    Parameters
    ----------
    train, test : DataFrame-like
        Tables holding the feature columns plus the ``Target_col`` column.
    accuracy, roc_auc : list
        Running result lists. This split's test accuracy and ROC AUC are
        appended to them in place.
    Target_col : str
        Name of the label column.
    pos_label : optional
        Label treated as the positive class for the ROC curve. When None,
        the second probability column is scored and ``roc_curve`` applies its
        own default positive label.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, elapsed)`` where ``elapsed`` is the wall-clock
        duration of this call in minutes.

    Raises
    ------
    ValueError
        If ``pos_label`` is given but is not one of the classes seen in training.
    """
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X,y)
    y_pred = tree.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = tree.predict_proba(test_X)
    # predict_proba columns are ordered like tree.classes_, so score the column
    # that belongs to pos_label instead of always taking column 1 (which would
    # invert the ROC whenever pos_label is the first class).
    if pos_label is None:
        pos_col = 1
    else:
        classes = list(tree.classes_)
        if pos_label not in classes:
            raise ValueError('pos_label %r is not among the trained classes %r' % (pos_label, classes))
        pos_col = classes.index(pos_label)
    # Compute the ROC curve and the area under the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, pos_col],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    # Elapsed time is reported in minutes.
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [7]:
def _print_classifier_report(banner, accuracy_label, label, accuracy, elapsed_time, roc_auc):
    """Print the per-split values and the mean/stdev summary for one classifier."""
    accuracy = np.array(accuracy)
    roc_auc = np.array(roc_auc)
    print(banner)
    print('Individual file accuracy for ' + accuracy_label)
    print(accuracy)
    print('Individual time taken for ' + label)
    print(np.array(elapsed_time))
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(accuracy.mean(), accuracy.std())
    print('Individual file AUC for ' + label)
    print(roc_auc)
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(roc_auc.mean(), roc_auc.std())
    print()


def model_build(filenum,Target_column, df_train, df_test, Directory,pos_label=None, n_splits=10):
    """Run the four classifiers over every numbered split and print a report.

    For each ``i`` in ``1..n_splits`` the files ``<df_train><i>.csv`` and
    ``<df_test><i>.csv`` are loaded from ``Directory`` with ``read_file`` and
    passed, in this order, to ``svm_classifier``, ``RF_classifier``,
    ``log_classifier`` and ``tree_classifier``.

    Parameters
    ----------
    filenum : int
        Data set number; only used in the report header.
    Target_column : str
        Name of the label column in every split file.
    df_train, df_test : str
        File-name prefixes of the training and test splits.
    Directory : str
        Folder that holds the split files.
    pos_label : optional
        Positive class label forwarded to every classifier.
    n_splits : int, optional
        Number of numbered split files to evaluate (default 10).

    Returns
    -------
    None
        The results are printed, not returned.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError('n_splits must be at least 1')

    # (banner, suffix of the accuracy heading, suffix of the time/AUC headings,
    # classifier). The accuracy suffix is kept separate because the decision
    # tree headings differ in capitalisation ('Tree' vs 'tree').
    classifiers = [
        ('********** SVM classifier ***********', 'SVM', 'SVM', svm_classifier),
        ('********** RF classifier ************', 'RF', 'RF', RF_classifier),
        ('********** Logistic regression ******', 'log', 'log', log_classifier),
        ('****** Decision Tree classifier *****', 'Tree', 'tree', tree_classifier),
    ]
    results = [{'accuracy': [], 'roc_auc': [], 'elapsed': []} for _ in classifiers]

    for i in range(1, n_splits + 1):
        trainF = df_train + str(i) + '.csv'
        testF = df_test + str(i) + '.csv'
        train, test = read_file(trainF, testF, Directory)
        for (_, _, _, run_classifier), result in zip(classifiers, results):
            result['accuracy'], result['roc_auc'], elapsed = run_classifier(
                train, test, result['accuracy'], result['roc_auc'], Target_column, pos_label)
            result['elapsed'].append(elapsed)

    print('Data set# ' + str(filenum))
    for (banner, accuracy_label, label, _), result in zip(classifiers, results):
        _print_classifier_report(banner, accuracy_label, label,
                                 result['accuracy'], result['elapsed'], result['roc_auc'])

In [8]:
# Evaluate the four classifiers on the splits of data set 1 (label column 'Occupancy').
model_build(
    filenum=1,
    Target_column='Occupancy',
    df_train='d1_train',
    df_test='d1_test',
    Directory="./Data Set 1/splits/",
)

Data set# 1
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.8736428   0.8814401   0.87593867  0.86993769  0.88263208  0.87268232
  0.87398061  0.88678665  0.88199721  0.87618751]
Individual time taken for SVM
[ 0.40444969  0.37894931  0.38371351  0.32141091  0.34360043  0.37529198
  0.34400436  0.35164679  0.3696409   0.31161354]
Accuracy mean   Accuracy Stdev  
0.877522564983 0.00510046226089
Individual file AUC for SVM
[ 0.99548713  0.99440746  0.99552344  0.9962872   0.99610718  0.99636014
  0.99603169  0.99558922  0.99604224  0.99549293]
AUC mean        AUC      Stdev  
0.995732862468 0.000544519241086

********** RF classifier ************
Individual file accuracy for RF
[ 0.99370574  0.99379268  0.9928035   0.99361371  0.99209111  0.99412855
  0.99461456  0.99338563  0.99441774  0.99283601]
Individual time taken for RF
[ 0.00191378  0.0018322   0.00174384  0.00128218  0.00211838  0.00182318
  0.00189174  0.0021322   0.00182979  0.00187834]
Accuracy mean

In [9]:
# Evaluate the four classifiers on the splits of data set 3; label 2 is the positive class.
model_build(
    filenum=3,
    Target_column='Class',
    df_train='d3_train',
    df_test='d3_test',
    Directory="./Data Set 3/splits/",
    pos_label=2,
)

Data set# 3
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.65435127  0.65266598  0.64939317  0.65668568  0.6514536   0.65125684
  0.64670517  0.64893702  0.65464289  0.65504061]
Individual time taken for SVM
[ 1.67248871  1.82695883  1.56876782  1.37992199  1.35387393  1.44061384
  1.34838557  1.39113715  1.42259428  1.40798463]
Accuracy mean   Accuracy Stdev  
0.652113220864 0.00298484648335
Individual file AUC for SVM
[ 0.62255209  0.61704286  0.62276759  0.62773142  0.63220118  0.62284407
  0.63200137  0.62925024  0.63938794  0.62076711]
AUC mean        AUC      Stdev  
0.626654586771 0.00634740035672

********** RF classifier ************
Individual file accuracy for RF
[ 0.85973101  0.85924922  0.85587672  0.86264259  0.85774592  0.86099437
  0.85823906  0.85906521  0.86003359  0.86271699]
Individual time taken for RF
[ 0.0131448   0.01206045  0.01270086  0.01047699  0.00935513  0.00929294
  0.01161168  0.00957289  0.00945387  0.0104891 ]
Accuracy mean 

In [10]:
# Evaluate the four classifiers on the splits of data set 5 (label column 'y').
model_build(
    filenum=5,
    Target_column='y',
    df_train='d5_train',
    df_test='d5_test',
    Directory="./Data Set 5/splits/",
    pos_label=None,
)

Data set# 5
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.88200946  0.88379154  0.88233526  0.88332886  0.88207405  0.8854829
  0.88086147  0.88217422  0.88371859  0.88466821]
Individual time taken for SVM
[ 20.36221461  19.43170171  18.69411231  20.59852614  19.31411576
  20.23576438  17.15074151  23.24116465   8.21384572   8.64552167]
Accuracy mean   Accuracy Stdev  
0.883044455408 0.00133096376778
Individual file AUC for SVM
[ 0.64380815  0.63844698  0.64602014  0.64944703  0.64061219  0.64335687
  0.64814455  0.64258841  0.65032874  0.64345774]
AUC mean        AUC      Stdev  
0.644621079156 0.00363557669635

********** RF classifier ************
Individual file accuracy for RF
[ 0.8971442   0.89863873  0.89870078  0.8980051   0.89923542  0.89813266
  0.89553189  0.89753264  0.89901701  0.89890463]
Individual time taken for RF
[ 0.01552947  0.01266751  0.01232602  0.01079557  0.01061963  0.01058186
  0.01347     0.00485911  0.00485956  0.0056161 ]
Accur

In [11]:
# Evaluate the four classifiers on the splits of data set 9 (label column 'Income level').
model_build(
    filenum=9,
    Target_column='Income level',
    df_train='d9_train',
    df_test='d9_test',
    Directory="./Data Set 9/splits/",
    pos_label=None,
)

Data set# 9
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.81138371  0.81308733  0.81314879  0.81264717  0.81048567  0.81048567
  0.80903491  0.81437599  0.81320708  0.80897935]
Individual time taken for SVM
[ 0.55958044  0.64894236  0.6331141   0.59182204  0.73222483  0.679898
  0.5922849   0.70046557  0.64384485  0.65253357]
Accuracy mean   Accuracy Stdev  
0.81168356612 0.00178617472735
Individual file AUC for SVM
[ 0.80556321  0.80504605  0.8011173   0.8056653   0.80115635  0.8011582
  0.79518673  0.81614643  0.81046435  0.80113853]
AUC mean        AUC      Stdev  
0.804264246205 0.00552512687402

********** RF classifier ************
Individual file accuracy for RF
[ 0.80785083  0.81156212  0.8036085   0.81186224  0.80583197  0.80776276
  0.80492813  0.81062401  0.80892101  0.81208418]
Individual time taken for RF
[ 0.00295873  0.00292975  0.00337076  0.0030347   0.0030052   0.00291371
  0.00387893  0.00277873  0.00319638  0.00334646]
Accuracy mean   Ac

In [12]:
# Evaluate the four classifiers on the splits of data set 11; 'AML' is the positive class.
model_build(
    filenum=11,
    Target_column='C7130',
    df_train='amlall_train',
    df_test='amlall_test',
    Directory="./data11_amlalll/",
    pos_label='AML',
)

Data set# 11
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.77272727  0.65384615  0.73529412  0.58333333  0.6         0.56521739
  0.68        0.55172414  0.62962963  0.62962963]
Individual time taken for SVM
[ 0.00246156  0.00214738  0.00170425  0.00278237  0.00261109  0.00239096
  0.00353917  0.00202015  0.00213817  0.00209996]
Accuracy mean   Accuracy Stdev  
0.640140166605 0.0684837819889
Individual file AUC for SVM
[ 0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5]
AUC mean        AUC      Stdev  
0.5 0.0

********** RF classifier ************
Individual file accuracy for RF
[ 1.          0.88461538  0.91176471  0.70833333  0.9         0.86956522
  1.          0.65517241  0.77777778  0.85185185]
Individual time taken for RF
[ 0.00046276  0.00039272  0.00041273  0.00089715  0.00039511  0.00041037
  0.00040915  0.00039497  0.00039967  0.00037473]
Accuracy mean   Accuracy Stdev  
0.855908068465 0.10760741623
Individual file AUC for RF
[ 1.          0.9673

In [13]:
# Evaluate the four classifiers on the splits of data set 13; 'Class1' is the positive class.
model_build(
    filenum=13,
    Target_column='C7130',
    df_train='central_train',
    df_test='central_test',
    Directory="./data13_central/",
    pos_label='Class1',
)

Data set# 13
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.76470588  0.80952381  0.65217391  0.61538462  0.58333333  0.47619048
  0.66666667  0.35294118  0.72222222  0.4375    ]
Individual time taken for SVM
[ 0.00171499  0.00143571  0.00138675  0.00184539  0.00129607  0.00185683
  0.00204134  0.00294429  0.00162799  0.00168021]
Accuracy mean   Accuracy Stdev  
0.608064209519 0.139917957238
Individual file AUC for SVM
[ 0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5]
AUC mean        AUC      Stdev  
0.5 0.0

********** RF classifier ************
Individual file accuracy for RF
[ 0.52941176  0.57142857  0.65217391  0.53846154  0.58333333  0.52380952
  0.58333333  0.35294118  0.66666667  0.5625    ]
Individual time taken for RF
[ 0.00044749  0.00037762  0.00037931  0.00037902  0.0003532   0.00080367
  0.00042001  0.00057935  0.00038164  0.00037877]
Accuracy mean   Accuracy Stdev  
0.556405982125 0.0815800477719
Individual file AUC for RF
[ 0.60576923  0.617

In [14]:
# Evaluate the four classifiers on the splits of data set 16 (label column 'C122').
# NOTE(review): the recorded run of this cell failed with
# "ValueError: could not convert string to float: 'B'" — presumably a
# non-numeric column is still among the features; confirm Target_column and
# encode or drop such columns before fitting.
model_build(filenum=16,Target_column='C122', df_train='pros_train', df_test='pros_test', Directory = "./data16_pros/",pos_label=None)

ValueError: could not convert string to float: 'B'