In [101]:
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

## Load Data and Observation

In [77]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [78]:
train[:3]

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
0,0,3556.0,2489.0,265.19,77.53,176.55,0.0,4.2,307.91,52,0,7515.0,1
1,1,1906.0,134.0,1442.61,551.9,876.07,112.1,168.15,1735.48,20,1,1756.0,0
2,2,1586.0,71.0,1332.74,684.2,655.26,244.95,216.52,1820.04,28,1,1311.0,0


In [79]:
train.keys()

Index(['id', 'MO HLADR+ MFI (cells/ul)', 'Neu CD64+MFI (cells/ul)',
       'CD3+T (cells/ul)', 'CD8+T (cells/ul)', 'CD4+T (cells/ul)',
       'NK (cells/ul)', 'CD19+ (cells/ul)', 'CD45+ (cells/ul)', 'Age',
       'Sex 0M1F', 'Mono CD64+MFI (cells/ul)', 'label'],
      dtype='object')

In [80]:
labels = train['label']
del train['label']

In [81]:
# Fill NaN

for key in train.keys():
    train[key].fillna(train[key].mean(),inplace=True)
for key in test.keys():
    test[key].fillna(test[key].mean(),inplace=True)

## Models

In [82]:
# K-folds is used to select paras
folds = 5
num = math.ceil(len(train)/folds)

### Random Forrest

In [83]:
estimators = [25, 50, 75, 100]

In [84]:
train_np = train.to_numpy()
test_np = test.to_numpy()
labels_np = labels.to_numpy().reshape(-1,1)

In [85]:
for e in estimators:
    start = 0
    acc = 0
    for i in range(folds):
        val_x = train[start:start+num]
        val_y = labels_np[start:start+num].ravel()
        train_x = np.vstack((train[:start], train[start+num:]))
        train_y = np.vstack((labels_np[:start], labels_np[start+num:])).ravel()
        rf_clf = RandomForestClassifier(n_estimators=e)
        rf_clf.fit(train_x,train_y)
        acc += rf_clf.score(val_x,val_y)
        start = start+num
    acc /= folds
    print("Estimators:",e, "Acc:",acc)

Estimators: 25 Acc: 0.8844444444444445
Estimators: 50 Acc: 0.8466666666666667
Estimators: 75 Acc: 0.8866666666666667
Estimators: 100 Acc: 0.8733333333333334


In [86]:
rf_clf = RandomForestClassifier(n_estimators=75)
rf_clf.fit(train,labels_np.ravel())
results_rf = rf_clf.predict(test)

In [87]:
results_rf

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [None]:
[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0, 0, 0, 0, 4, 4,
       3, 0, 0, 4, 0, 0, 4, 0, 2, 4, 0, 4, 0, 0, 0, 3, 4, 0, 4, 0, 1, 2,
       0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4]



array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [None]:
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [None]:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

### Logistic Regression

In [61]:
def norm(series):
    return series.apply(lambda x: (x-series.mean())/ series.std())

In [62]:
for key in train.keys():
    if key == 'id':
        continue
    train[key] = norm(train[key])
for key in test.keys():
    if key == 'id':
        continue
    test[key] = norm(test[key])   

In [88]:
reg = ['l1', 'l2']
lamb = [0.3,0.5,0.7]
train_np = train.to_numpy()
test_np = test.to_numpy()

In [89]:
for r in reg:
    for l in lamb:    
        start = 0
        acc = 0
        for i in range(folds):
            val_x = train_np[start:start+num]
            val_y = labels_np[start:start+num].ravel()
            train_x = np.vstack((train_np[:start], train_np[start+num:]))
            train_y = np.vstack((labels_np[:start], labels_np[start+num:])).ravel()
            lr_clf = LogisticRegression(penalty=r, C=l, solver="liblinear")
            lr_clf.fit(train_x,train_y)
            acc += lr_clf.score(val_x,val_y)
            start = start+num
        acc /= folds
        print("Reg=",r,"lambda:",l, "Acc:",acc)



Reg= l1 lambda: 0.3 Acc: 0.8466666666666667




Reg= l1 lambda: 0.5 Acc: 0.86
Reg= l1 lambda: 0.7 Acc: 0.8466666666666667
Reg= l2 lambda: 0.3 Acc: 0.8466666666666667
Reg= l2 lambda: 0.5 Acc: 0.8355555555555554
Reg= l2 lambda: 0.7 Acc: 0.8355555555555554




In [90]:
lr_clf = LogisticRegression(penalty='l1', C=0.5, solver="liblinear")
lr_clf.fit(train,labels_np.ravel())
results_lr = lr_clf.predict(test)

In [91]:
results_lr

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

### KNN

In [92]:
del train['Sex 0M1F']
del test['Sex 0M1F']

In [93]:
Neighbors = [1, 3, 5, 10, 15]
train_np = train.to_numpy()
test_np = test.to_numpy()

In [94]:
for n in Neighbors:
    start = 0
    acc = 0
    for i in range(folds):
        val_x = train_np[start:start+num]
        val_y = labels_np[start:start+num].ravel()
        train_x = np.vstack((train_np[:start], train_np[start+num:]))
        train_y = np.vstack((labels_np[:start], labels_np[start+num:])).ravel()
        knn_clf = KNeighborsClassifier(n_neighbors=n)
        knn_clf.fit(train_x,train_y)
        acc += knn_clf.score(val_x,val_y)
        start = start+num
    acc /= folds
    print("N=",n, "Acc:",acc)

N= 1 Acc: 0.7444444444444445
N= 3 Acc: 0.8066666666666666
N= 5 Acc: 0.8044444444444444
N= 10 Acc: 0.7555555555555555
N= 15 Acc: 0.78


In [95]:
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(train,labels_np.ravel())
results_knn = knn_clf.predict(test)

In [96]:
results_knn

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

### SVM

In [105]:
svm_clf = svm.SVC()
svm_clf.fit(train,labels_np.ravel())
results_svm = svm_clf.predict(test)

In [106]:
results_svm

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

## Ensemble

In [114]:
result = results_rf + results_lr + results_knn + results_svm
result

array([0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0, 0, 0, 0, 4, 4,
       3, 0, 0, 4, 0, 0, 4, 0, 2, 4, 0, 4, 0, 0, 0, 3, 4, 0, 4, 0, 1, 2,
       0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4])

In [115]:
for i in range(len(result)):
    if result[i] >= 3:
        result[i] = 1
    else:
        result[i] = 0

In [116]:
result

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [117]:
out = pd.DataFrame()
out['id'] = test['id']
out['label'] = result
out.to_csv('submission.csv',index=None)

In [75]:
sum(labels_np)

array([29])