In [2]:
from itertools import chain
from sklearn import preprocessing

import numpy as np
import pandas as pd
import xgboost as xgb

In [3]:
# --- Input / output locations (relative to the notebook's directory) ---
train_file = "../data/train.csv"
test_file = "../data/test.csv"
sample_submission = "../data/sample_submission.csv"
submission_filename = "god_of_overfitting_spare_us.csv"

# --- Booster configuration handed to xgb.train ---
xgb_params = {
    "seed": 0,                        # fixed seed so row/column subsampling is repeatable
    "colsample_bytree": 0.8,          # fraction of columns sampled per tree
    "silent": 0,                      # 0 = keep xgboost's own log messages
    "subsample": 0.85,                # fraction of rows sampled per tree
    "eta": 0.0275,                    # learning rate
    "objective": "binary:logitraw",   # binary task; predictions are raw scores, not probabilities
    "num_parallel_tree": 7,           # trees grown per boosting round
    "max_depth": 5,
    "nthread": 22,
    "eval_metric": "auc",
}


In [63]:
def get_data():
    """Load the train/test CSVs and build the model-ready feature matrices.

    Processing order:
      1. Read ``train_file`` and ``test_file``; take ``OutcomeType`` as the
         label and drop the identifier columns (``AnimalID`` / ``ID``).
      2. Stack train on top of test so date expansion, NaN filling and label
         encoding are applied to both with the same mapping.
      3. Expand ``DateTime`` into ``Year``/``Month``/``Day``/``Weekday``,
         replace NaN with -1 and label-encode every object-dtype column.
      4. Keep only the columns named in ``top111 + tmp_features``, add a
         per-row ``NaNCount`` and the 2-/3-/4-way difference features.
      5. Drop ``tmp_features`` and the last 25 entries of ``drop_out``.

    Module-level names that must already exist (none are defined here):
    ``train_file``, ``test_file``, ``top111``, ``tmp_features``,
    ``interactions2way``, ``interactions3way``, ``interactions4way``,
    ``drop_out`` and ``tc_features``.

    Side effects:
        Appends the name of every 3-way and 4-way feature to the global
        ``tc_features``. Each name is added at most once, so calling this
        function again (e.g. re-running the cell) no longer inflates
        ``len(tc_features)``.

    Returns:
        tuple: ``(x_train, y_train, x_test)`` as NumPy arrays.
    """
    global tc_features

    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)

    y_train = train.OutcomeType

    train = train.drop(['AnimalID', 'OutcomeType'], axis=1)
    test = test.drop('ID', axis=1)

    # Remember where train ends so the stacked frame can be split again.
    ntrain = train.shape[0]

    train_test = pd.concat((train, test), axis=0)

    # Expand the timestamp into calendar parts, then discard the raw columns.
    train_test['Date'] = pd.to_datetime(train_test['DateTime'])

    train_test['Year'] = train_test['Date'].dt.year
    train_test['Month'] = train_test['Date'].dt.month
    train_test['Day'] = train_test['Date'].dt.day
    train_test['Weekday'] = train_test['Date'].dt.dayofweek

    train_test = train_test.drop(['Date', 'DateTime'], axis=1)
    train_test = train_test.fillna(-1)

    # Label-encode every object column; fitting on train+test keeps the
    # integer codes consistent between the two sets.
    # NOTE(review): after fillna(-1) an object column can hold both strings
    # and the int -1; sorting such a mix presumably fails on Python 3 -- confirm.
    categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']

    for c in categoricals:
        lbl = preprocessing.LabelEncoder()
        train_test[c] = lbl.fit_transform(list(train_test[c].values))

    train = train_test.iloc[:ntrain, :].copy().reset_index(drop=True)
    test = train_test.iloc[ntrain:, :].copy().reset_index(drop=True)

    # Keep only the whitelisted columns that actually exist in the data.
    # np.intersect1d also sorts the names, which fixes the column order.
    features = list(train.columns)
    features = np.intersect1d(features, top111 + tmp_features)

    x_train = train[features].copy()
    x_test = test[features].copy()

    # Per-row count of cells equal to -1 (the NaN fill value), computed with a
    # vectorised comparison instead of a Python-level row-wise apply.
    # NOTE(review): this runs after label encoding, so a -1 filled into an
    # object column has already been turned into a label code and is not
    # counted here -- confirm that is intended.
    x_train['NaNCount'] = (x_train == -1).sum(axis=1)
    x_test['NaNCount'] = (x_test == -1).sum(axis=1)

    def _add_difference_feature(columns):
        """Add column "<c0>_<c1>_..." = c0 - c1 - ... to both frames; return its name."""
        feat = "_".join(columns)
        for frame in (x_train, x_test):
            diff = frame[columns[0]]
            # Subtract left to right, exactly as in A - B - C - D.
            for col in columns[1:]:
                diff = diff - frame[col]
            frame[feat] = diff
        return feat

    # 2-way features are NOT recorded in tc_features.
    for A, B in interactions2way:
        _add_difference_feature((A, B))

    for A, B, C in interactions3way:
        feat = _add_difference_feature((A, B, C))
        # Only record a name once so repeated calls stay idempotent.
        if feat not in tc_features:
            tc_features += [feat]

    for A, B, C, D in interactions4way:
        feat = _add_difference_feature((A, B, C, D))
        if feat not in tc_features:
            tc_features += [feat]

    # tmp_features were only needed as inputs to the features above; the last
    # 25 entries of drop_out are removed as well.
    x_train = x_train.drop(tmp_features, axis=1)
    x_test = x_test.drop(tmp_features, axis=1)

    x_train = x_train.drop(drop_out[-25:], axis=1)
    x_test = x_test.drop(drop_out[-25:], axis=1)

    return np.array(x_train), np.array(y_train), np.array(x_test)

In [64]:
x_train, y_train, x_test = get_data()
# Formatted print: emits "(rows, cols) (rows, cols)" on both Python 2 and 3
# (the bare print statement is a syntax error on Python 3).
print("%s %s" % (x_train.shape, x_test.shape))

# Untouched copy of the training matrix, taken before any later cell edits
# x_train in place.
x_train_tc = x_train.copy()
# Number of 3-/4-way feature names recorded by get_data().
ntcfeat = len(tc_features)

(260753, 164) (173836, 164)


In [66]:
# Blank the trailing ntcfeat columns of x_train (presumably the 3-/4-way
# features appended last by get_data -- confirm none were removed by its final
# drops); x_train_tc keeps the original values.
# Guard the slice: with ntcfeat == 0, "-0:" is the same as "0:" and would
# zero every feature.
if ntcfeat > 0:
    x_train[:, -ntcfeat:] = 0

ntrain = x_train.shape[0]
best_nrounds = 2500

dtrain = xgb.DMatrix(x_train, label=y_train)
dtrain_tc = xgb.DMatrix(x_train_tc, label=y_train)
# dtest is consumed by the prediction and submission cells but is not built
# anywhere else in the visible notebook, so create it from x_test here.
dtest = xgb.DMatrix(x_test)

In [75]:
# (DMatrix, label) pairs scored after each boosting round; only the training
# set is watched here.
evallist = [(dtrain, "train")]

In [78]:
# Short 5-round fit (best_nrounds is defined above but not used here).
# Arguments are passed by keyword: recent xgboost releases only accept
# `evals` as a keyword argument, and the keyword form also works on old ones.
gbdt = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=5,
    evals=evallist,
    verbose_eval=True,
)

[0]	train-auc:0.937852
[1]	train-auc:0.938665
[2]	train-auc:0.938199
[3]	train-auc:0.938540
[4]	train-auc:0.938967


In [82]:
import matplotlib.pyplot as plt

# Bar chart of per-feature importance for the fitted booster; the return
# value of plot_importance is kept in `importance`.
importance = xgb.plot_importance(booster=gbdt)
plt.show()

In [83]:
# Preview the test-set predictions; with objective 'binary:logitraw' these are
# raw scores, so negative values are expected.
gbdt.predict(data=dtest)

array([-0.25677505, -0.22170401, -0.22170401, ...,  0.19527936,
       -0.25677505, -0.19422986], dtype=float32)

In [84]:
# Use the sample submission as a template so ids and column names match.
submission = pd.read_csv(sample_submission)
# Replace the second column wholesale with the 1-D prediction vector.
# Assigning by label swaps the column out rather than writing into the
# template's existing values, so it does not depend on the template column's
# dtype (writing floats through .iloc into e.g. an integer column can warn or
# fail on recent pandas) and needs no reshape to (n, 1).
# The values are raw scores because the objective is 'binary:logitraw'.
submission[submission.columns[1]] = gbdt.predict(dtest)
submission.to_csv(submission_filename, index=False)