# Data Process class

In [2]:
class Dataprocess:
    """Hold a feature matrix (``self.data``) and binary labels (``self.label``).

    An instance is built either from a pair of CSV files or from arrays that
    are already in memory (see ``__init__``).
    """

    def getAugmentedData(self,data):
        """Return ``data`` with one extra row of ones stacked on top.

        ``data`` must be two-dimensional; the result has the same number of
        columns and one more row than the input.
        """
        _,cols = data.shape
        ones = np.ones((1,cols))
        return np.vstack((ones,data))

    def __init__(self,data_path,label_path=None):
        """Load the data from CSV files, or wrap in-memory arrays.

        Parameters
        ----------
        data_path : str or sequence
            With ``label_path`` given: path of a comma-separated file. Its
            first row and first column are discarded, the remaining cells are
            parsed as floats and the matrix is transposed (file columns become
            rows of ``self.data``).
            Without ``label_path``: a two-item sequence ``[data, label]`` that
            is stored as-is.
        label_path : str, optional
            Path of a comma-separated file whose first row is discarded and
            whose second column holds the class names.

        Raises
        ------
        IndexError
            If the label file contains fewer than two distinct class names.
        """
        if label_path is not None:
            with open(data_path) as f:
                # ndmin=2 keeps a file with a single data row two-dimensional,
                # so the column slice below does not fail on it.
                raw = np.loadtxt(f,str,skiprows=1,delimiter=",",ndmin=2)
            self.data = raw[:,1:].astype(np.float64).T
            with open(label_path) as f:
                # ndmin=1 keeps a single-row label file one-dimensional.
                label = np.loadtxt(f,str,skiprows=1,delimiter=",",usecols=(1,),ndmin=1)[np.newaxis,:]
            _,cols = label.shape
            self.label = np.ones((1,cols))
            # Binary encoding: the second class name in sorted order becomes 0,
            # every other entry stays 1.
            # NOTE(review): the mapping is derived per file, so two files only
            # share an encoding if they contain the same set of class names.
            lag = np.unique(label)[1]
            self.label[label == lag] = 0
            self.label = self.label.T
        else:
            self.data = data_path[0]
            self.label = data_path[1]

    def getCrossValidationData(self,k,random_state=None):
        """Split the data into ``k`` shuffled folds.

        Parameters
        ----------
        k : int
            Number of folds.
        random_state : optional
            Forwarded to ``KFold`` to make the shuffle reproducible; the
            default ``None`` keeps the split random on every call.

        Returns
        -------
        (list, list)
            Two lists of ``k`` ``Dataprocess`` objects each: the training
            subsets and the matching held-out subsets, in fold order.
        """
        # Imported here because the notebook's import cell does not bring
        # KFold into scope.
        from sklearn.model_selection import KFold

        kf = KFold(n_splits=k,shuffle=True,random_state=random_state)
        subData_train = []
        subData_test = []
        for train_index, test_index in kf.split(self.data):
            train_X, train_y = self.data[train_index], self.label[train_index]
            test_X, test_y = self.data[test_index], self.label[test_index]
            subData_train.append(Dataprocess([train_X,train_y]))
            subData_test.append(Dataprocess([test_X,test_y]))
        return subData_train,subData_test

# Import a nearest neighbors classification model from sklearn

## Import the necessary modules and prepare the data

In [3]:
import numpy as np
from sklearn import neighbors

# Each dataset is loaded from a (feature CSV, label CSV) pair.
trainingset_1 = Dataprocess(
    "train_10gene_sub.csv",
    "train_10gene_label_sub.csv",
)
trainingset_2 = Dataprocess(
    "train_10gene.csv",
    "train_label.csv",
)
testset = Dataprocess(
    "test_10gene.csv",
    "test_label.csv",
)
testset_2 = Dataprocess(
    "test2_10gene.csv",
    "test2_label.csv",
)

# Name -> dataset lookup table.
DataSet = dict(zip(
    ("trainingset-1", "trainingset-2", "testset", "testset-2"),
    (trainingset_1, trainingset_2, testset, testset_2),
))

def GetTrainingDataAndTestData(name):
    """Look a dataset up by name and return ``(features, labels)``.

    ``name`` must be a key of the module-level ``DataSet`` dict (a missing key
    raises ``KeyError``). The labels are passed through ``np.squeeze``, which
    drops every length-1 axis.
    """
    selected = DataSet[name]
    features = selected.data
    labels = np.squeeze(selected.label)
    return features, labels

## Train the k-NN classifier and visualize the result

In [11]:
import prettytable as pt

# Column headers are written so that the printed table can be pasted into a
# LaTeX tabular: every cell carries its own `&` separator and the last header
# cell ends with `\\ \hline`.
tb = pt.PrettyTable(["trainingdata&","testdata&","weights&","n-neighbors&","training accuracy&","test accuracy\\\\ \\hline"])

for trainingdata in ["trainingset-1","trainingset-2"]:
    # The arrays only depend on the dataset name, so fetch them once per
    # dataset instead of once per hyper-parameter combination.
    trainx,trainy = GetTrainingDataAndTestData(trainingdata)
    for testdata in ["testset","testset-2"]:
        # This train/test pairing is skipped on purpose.
        # NOTE(review): the notebook does not say why - confirm.
        if trainingdata == "trainingset-2" and testdata == "testset-2":
            continue
        testx,testy = GetTrainingDataAndTestData(testdata)
        for weights in ['uniform', 'distance']:
            for n_neighbors in [5,10,15,20,25,30]:
                # Fit a k-NN classifier and score it on both sets.
                clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,algorithm="auto")
                clf.fit(trainx,trainy)
                train_score = clf.score(trainx,trainy)
                test_score = clf.score(testx,testy)

                tb.add_row([trainingdata+"&",testdata+"&",weights+"&",str(n_neighbors)+"&",str(train_score)+"&",str(test_score)+"\\\\"])
print(tb)

+----------------+------------+-----------+--------------+---------------------+------------------------+
| trainingdata&  | testdata&  |  weights& | n-neighbors& |  training accuracy& | test accuracy\\ \hline |
+----------------+------------+-----------+--------------+---------------------+------------------------+
| trainingset-1& |  testset&  |  uniform& |      5&      |        0.925&       |         0.88\\         |
| trainingset-1& |  testset&  |  uniform& |     10&      |         0.8&        |         0.8\\          |
| trainingset-1& |  testset&  |  uniform& |     15&      |        0.75&        |         0.73\\         |
| trainingset-1& |  testset&  |  uniform& |     20&      |        0.65&        |         0.57\\         |
| trainingset-1& |  testset&  |  uniform& |     25&      |        0.65&        |         0.55\\         |
| trainingset-1& |  testset&  |  uniform& |     30&      |        0.65&        |         0.55\\         |
| trainingset-1& |  testset&  | distance& |   

## Change the table to a LaTeX version

In [12]:
# Switch off the table border and print again: only the cell text, with the
# `&` / `\\` separators that were appended to every cell, is left.
tb.border=0
print(tb)

 trainingdata&   testdata&    weights&  n-neighbors&   training accuracy&  test accuracy\\ \hline 
 trainingset-1&   testset&    uniform&       5&              0.925&                0.88\\         
 trainingset-1&   testset&    uniform&      10&               0.8&                 0.8\\          
 trainingset-1&   testset&    uniform&      15&              0.75&                 0.73\\         
 trainingset-1&   testset&    uniform&      20&              0.65&                 0.57\\         
 trainingset-1&   testset&    uniform&      25&              0.65&                 0.55\\         
 trainingset-1&   testset&    uniform&      30&              0.65&                 0.55\\         
 trainingset-1&   testset&   distance&       5&               1.0&                 0.87\\         
 trainingset-1&   testset&   distance&      10&               1.0&                 0.83\\         
 trainingset-1&   testset&   distance&      15&               1.0&                 0.83\\         
 trainings

# Import a random forest (RF) model from sklearn

## Import necessary module

In [6]:
from sklearn.ensemble import RandomForestClassifier

## Train the RF model and visualize the result

In [9]:
# Column headers are written so that the printed table can be pasted into a
# LaTeX tabular: every cell carries its own `&` separator and the last header
# cell ends with `\\ \hline`.
tb = pt.PrettyTable(["trainingdata&","testdata&","criterion&","max_features&","n-estimators&","training accuracy&","test accuracy\\\\ \\hline"])

# Fixed seed so that re-running the cell reproduces the same forests and
# therefore the same table.
RF_RANDOM_STATE = 0

for trainingdata in ["trainingset-1","trainingset-2"]:
    # The arrays only depend on the dataset name, so fetch them once per
    # dataset instead of once per hyper-parameter combination.
    trainx,trainy = GetTrainingDataAndTestData(trainingdata)
    for testdata in ["testset","testset-2"]:
        # This train/test pairing is skipped on purpose.
        # NOTE(review): the notebook does not say why - confirm.
        if trainingdata == "trainingset-2" and testdata == "testset-2":
            continue
        testx,testy = GetTrainingDataAndTestData(testdata)
        for criterion in ['gini', 'entropy']:
            for max_features in ['sqrt','log2']:
                for n_estimators in [5,10,15,20,25,30]:
                    # Fit a random forest and score it on both sets.
                    clf = RandomForestClassifier(n_estimators=n_estimators,criterion=criterion,max_features=max_features,random_state=RF_RANDOM_STATE)
                    clf.fit(trainx,trainy)
                    train_score = clf.score(trainx,trainy)
                    test_score = clf.score(testx,testy)
                    tb.add_row([trainingdata+"&",testdata+"&",criterion+"&",max_features+"&",str(n_estimators)+"&",str(train_score)+"&",str(test_score)+"\\\\"])
print(tb)

+----------------+------------+------------+---------------+---------------+---------------------+------------------------+
| trainingdata&  | testdata&  | criterion& | max_features& | n-estimators& |  training accuracy& | test accuracy\\ \hline |
+----------------+------------+------------+---------------+---------------+---------------------+------------------------+
| trainingset-1& |  testset&  |   gini&    |     sqrt&     |       5&      |        0.975&       |         0.88\\         |
| trainingset-1& |  testset&  |   gini&    |     sqrt&     |      10&      |         1.0&        |         0.87\\         |
| trainingset-1& |  testset&  |   gini&    |     sqrt&     |      15&      |         1.0&        |         0.88\\         |
| trainingset-1& |  testset&  |   gini&    |     sqrt&     |      20&      |         1.0&        |         0.91\\         |
| trainingset-1& |  testset&  |   gini&    |     sqrt&     |      25&      |         1.0&        |         0.93\\         |
| traini

In [10]:
# Switch off the table border and print again: only the cell text, with the
# `&` / `\\` separators that were appended to every cell, is left.
tb.border=0
print(tb)

 trainingdata&   testdata&   criterion&  max_features&  n-estimators&   training accuracy&  test accuracy\\ \hline 
 trainingset-1&   testset&     gini&         sqrt&            5&              0.975&                0.88\\         
 trainingset-1&   testset&     gini&         sqrt&           10&               1.0&                 0.87\\         
 trainingset-1&   testset&     gini&         sqrt&           15&               1.0&                 0.88\\         
 trainingset-1&   testset&     gini&         sqrt&           20&               1.0&                 0.91\\         
 trainingset-1&   testset&     gini&         sqrt&           25&               1.0&                 0.93\\         
 trainingset-1&   testset&     gini&         sqrt&           30&               1.0&                 0.89\\         
 trainingset-1&   testset&     gini&         log2&            5&              0.975&                0.87\\         
 trainingset-1&   testset&     gini&         log2&           10&        