In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc, classification_report

os.chdir('/home/ankushraut/Downloads/inter_iit_tech')
data = pd.read_csv('ExoTrain.csv')

In [144]:
data.head()

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,2,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
1,2,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67
2,2,326.52,347.39,302.35,298.13,317.74,312.7,322.33,311.31,312.42,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
3,2,-1107.21,-1112.59,-1118.95,-1095.1,-1057.55,-1034.48,-998.34,-1022.71,-989.57,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54
4,2,211.1,163.57,179.16,187.82,188.46,168.13,203.46,178.65,166.49,...,-98.45,30.34,30.34,29.62,28.8,19.27,-43.9,-41.63,-52.9,-16.16


In [145]:
target = pd.DataFrame({'LABEL':data.LABEL})
data = data.drop(labels = ['LABEL'], axis = 1)
target.LABEL.value_counts()

1    3927
2      33
Name: LABEL, dtype: int64

In [146]:
#random sampling into training and testing sets

x_data, x_test, y_data, y_test = train_test_split(data, target, test_size = 0.1, random_state = 7)
x_data = x_data.reset_index(drop = True)
x_test = x_test.reset_index(drop = True)
y_data = y_data.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [147]:
y_data.LABEL.value_counts()

1    3532
2      32
Name: LABEL, dtype: int64

In [148]:
y_test.LABEL.value_counts()

1    395
2      1
Name: LABEL, dtype: int64

In [149]:
#random sampling into training and validation sets

x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size = .15, random_state = 7)
x_train = x_train.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
x_val = x_val.reset_index(drop = True)
y_val = y_val.reset_index(drop = True)

In [150]:
y_val.LABEL.value_counts()

1    530
2      5
Name: LABEL, dtype: int64

In [151]:
y_train.LABEL.value_counts()

1    3002
2      27
Name: LABEL, dtype: int64

In [152]:
#baseline approach : LABEL = 1
val = np.ones(len(y_train))
pred = np.ones(len(y_test))
print(f1_score(y_train.LABEL, val), f1_score(y_test.LABEL, pred))
bm_score = f1_score(y_test.LABEL, pred)

0.995523130492 0.998735777497


In [153]:
#converting 1 -> 0 and 2 -> 1 for convenience

def for_xgb(data):
    if data == 1:
        data = 0
    else:
        data = 1
    return data

y_train['LABEL'] = y_train['LABEL'].map(lambda x: for_xgb(x))
y_test['LABEL'] = y_test['LABEL'].map(lambda x: for_xgb(x))
y_val['LABEL'] = y_val['LABEL'].map(lambda x: for_xgb(x))

In [154]:
y_val.head()

Unnamed: 0,LABEL
0,0
1,0
2,0
3,0
4,0


In [155]:
#baseline xgboost
params = {"objective":"binary:logistic",
          "booster":"gbtree",
          "eta":0.1,
          "max_depth": 10,
         "subsample":0.75,
          "silent":0,
         "seed":7,
         "lambda":5,
         "alpha":5,
         "eval_metric":"auc"}

num_boost_round = 100

x_tr = x_train
y_tr = y_train

dtrain = xgb.DMatrix(x_tr, y_tr)
dvalid = xgb.DMatrix(x_val, y_val)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

booster = xgb.train(params, dtrain, num_boost_round, evals = watchlist)

training_probs = booster.predict(xgb.DMatrix(x_train))
probs = []
for i in range(len(training_probs)):
    probs.append(training_probs[i])

check = booster.predict(xgb.DMatrix(x_val))
check_values = []
for i in range(len(check)):
    check_values.append(check[i])

[0]	train-auc:0.5	eval-auc:0.5
[1]	train-auc:0.535871	eval-auc:0.495283
[2]	train-auc:0.535871	eval-auc:0.495283
[3]	train-auc:0.535871	eval-auc:0.495283
[4]	train-auc:0.535871	eval-auc:0.495283
[5]	train-auc:0.587658	eval-auc:0.487736
[6]	train-auc:0.587658	eval-auc:0.487736
[7]	train-auc:0.58688	eval-auc:0.486792
[8]	train-auc:0.603542	eval-auc:0.484906
[9]	train-auc:0.603178	eval-auc:0.483962
[10]	train-auc:0.602463	eval-auc:0.483962
[11]	train-auc:0.61913	eval-auc:0.481132
[12]	train-auc:0.618822	eval-auc:0.481132
[13]	train-auc:0.754306	eval-auc:0.552264
[14]	train-auc:0.754041	eval-auc:0.552264
[15]	train-auc:0.753868	eval-auc:0.552264
[16]	train-auc:0.786155	eval-auc:0.544528
[17]	train-auc:0.785415	eval-auc:0.637358
[18]	train-auc:0.907741	eval-auc:0.72434
[19]	train-auc:0.907987	eval-auc:0.725849
[20]	train-auc:0.906457	eval-auc:0.722642
[21]	train-auc:0.910979	eval-auc:0.721509
[22]	train-auc:0.910578	eval-auc:0.721887
[23]	train-auc:0.911886	eval-auc:0.718113
[24]	train-auc:

In [156]:
#reverse conversion into original labels

def reverse_xgb(data):
    if data == 0:
        data = 1
    else:
        data = 2
    return data
        
y_train['LABEL'] = y_train['LABEL'].map(lambda x: reverse_xgb(x))
y_test['LABEL'] = y_test['LABEL'].map(lambda x: reverse_xgb(x))
y_val['LABEL'] = y_val['LABEL'].map(lambda x: reverse_xgb(x))

In [157]:
#creation of validation predictions

validation = pd.DataFrame({'LABEL':check_values})

In [158]:
def converter(thresh, data):
    if data <= thresh:
        data = 1
    else:
        data = 2
    return data

validation['LABEL'] = validation['LABEL'].map(lambda x: converter(0.05, x))

In [159]:
validation.head()

Unnamed: 0,LABEL
0,1
1,1
2,1
3,1
4,1


In [160]:
#performance on validation set

print(f1_score(y_val, validation))
confusion_matrix(y_val, validation, labels = [1,2])

0.97794822627


array([[510,  20],
       [  3,   2]])

In [161]:
print(classification_report(y_val, validation))

             precision    recall  f1-score   support

          1       0.99      0.96      0.98       530
          2       0.09      0.40      0.15         5

avg / total       0.99      0.96      0.97       535



In [162]:
#creation of training set predictions

train_probs = pd.DataFrame({'LABEL':probs})
train_probs['LABEL'] = train_probs['LABEL'].map(lambda x: converter(0.05, x))

In [163]:
#performance on training set

print(classification_report(y_train, train_probs))

             precision    recall  f1-score   support

          1       1.00      0.99      0.99      3002
          2       0.40      1.00      0.57        27

avg / total       0.99      0.99      0.99      3029



In [164]:
confusion_matrix(y_train, train_probs)

array([[2961,   41],
       [   0,   27]])

In [165]:
#creation of test set predictions

tester = booster.predict(xgb.DMatrix(x_test))
test_values = []
for i in range(len(tester)):
    test_values.append(tester[i])

In [166]:
test_pred = pd.DataFrame({'LABEL':test_values})
test_pred['LABEL'] = test_pred['LABEL'].map(lambda x: converter(0.05, x))

In [167]:
#performance on test set

print(classification_report(y_test, test_pred))

             precision    recall  f1-score   support

          1       1.00      0.95      0.97       395
          2       0.00      0.00      0.00         1

avg / total       0.99      0.95      0.97       396



In [168]:
confusion_matrix(y_test, test_pred)

array([[375,  20],
       [  1,   0]])