In [15]:
from datetime import datetime
from math import exp, log
import os
from os.path import join

import numpy as np
import pandas as pd
from scipy.stats import skew, boxcox
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [16]:
# Global variables
# Directory prefix for submission CSVs; the submission-writing cell joins it
# with the generated file name via os.path.join.
SUB_PATH = 'submissions/'

In [2]:
def scale_data(X, scaler=None):
    """Standardize ``X``, fitting a new ``StandardScaler`` when none is given.

    Parameters
    ----------
    X : array-like
        Data passed to the scaler's ``fit`` (when fitting) and ``transform``.
    scaler : object with a ``transform`` method, optional
        Scaler to reuse. When ``None`` a ``StandardScaler`` is created and
        fitted on ``X`` before transforming.

    Returns
    -------
    tuple
        ``(X_scaled, scaler)``: the transformed data and the scaler that
        produced it, so the caller can apply the same scaling to other data.
    """
    # Compare against None explicitly: a truthiness test would throw away a
    # supplied scaler whose class defines __len__/__bool__ and evaluates
    # falsy, silently re-fitting a fresh one on X instead.
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

In [26]:
def load_data(path_train='data/train.csv', path_test='data/test.csv'):
    """Read the train/test CSVs and turn them into scaled feature matrices.

    Processing, in order:
      1. drop ``id`` (and ``loss`` for train) and stack train on top of test;
      2. measure skew of the non-object columns on the train rows, print it,
         and Box-Cox transform (after adding 1) every column whose skew
         exceeds 0.25, fitting the transform on the stacked frame;
      3. factorize every column whose name contains ``'cat'`` over the
         stacked frame;
      4. fit a scaler on the stacked frame via ``scale_data`` and apply it to
         the train rows and the test rows separately.

    Parameters
    ----------
    path_train, path_test : str
        Locations of the training and test CSV files.

    Returns
    -------
    tuple
        ``(train, train_labels, test, train_ids, test_ids)`` where
        ``train_labels`` is the natural log of the ``loss`` column and the id
        arrays are int32.
    """
    raw_train = pd.read_csv(path_train, dtype={'id': np.int32})
    train = raw_train.drop(['id', 'loss'], axis=1)
    raw_test = pd.read_csv(path_test, dtype={'id': np.int32})
    test = raw_test.drop(['id'], axis=1)

    n_train = train.shape[0]
    combined = pd.concat((train, test)).reset_index(drop=True)
    column_types = combined.dtypes
    numeric_cols = column_types[column_types != "object"].index

    # compute skew and do Box-Cox transformation
    skewness = train[numeric_cols].apply(lambda col: skew(col.dropna()))
    print("\nSkew in numeric features:")
    print(skewness)
    # transform features with skew > 0.25 (this can be varied to find optimal value)
    for col in skewness[skewness > 0.25].index:
        # Shift by one before Box-Cox; the fitted lambda is not needed.
        combined[col] = combined[col] + 1
        transformed, _fitted_lambda = boxcox(combined[col])
        combined[col] = transformed

    # factorize categorical features
    for name in [c for c in train.columns if 'cat' in c]:
        codes, _uniques = pd.factorize(combined[name], sort=True)
        combined[name] = codes

    train_rows = combined.iloc[:n_train, :]
    test_rows = combined.iloc[n_train:, :]
    # Fit on train+test together; only the fitted scaler is kept from this call.
    _, scaler = scale_data(combined)
    train, _ = scale_data(train_rows, scaler)
    test, _ = scale_data(test_rows, scaler)

    train_labels = np.log(np.array(raw_train['loss']))
    train_ids = raw_train['id'].values.astype(np.int32)
    test_ids = raw_test['id'].values.astype(np.int32)

    return train, train_labels, test, train_ids, test_ids

In [27]:
# Load data set and target values
# load_data() returns (train, train_labels, test, train_ids, test_ids):
# `target` is log(loss), the train ids are discarded, and `ids` holds the
# test ids used later as the submission index.
train, target, test, _, ids = load_data()
# DMatrix over the whole training set; the fold loop builds its own per-fold
# matrices, so this one is not referenced by the cells visible here.
d_train_full = xgb.DMatrix(train, label=target)
# Test-set DMatrix that every fold's model predicts on.
d_test = xgb.DMatrix(test)


Skew in numeric features:
0         0
1         0
2         0
3         1
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        1
13        1
14        0
15        0
16        0
17        0
18        0
19        0
20        1
21        0
22        1
23        1
24        1
25        0
26        0
27        1
28        0
29        0
         ..
313834    0
313835    0
313836    0
313837    1
313838    1
313839    0
313840    0
313841    0
313842    0
313843    0
313844    0
313845    0
313846    0
313847    0
313848    0
313849    0
313850    0
313851    0
313852    0
313853    1
313854    1
313855    0
313856    0
313857    0
313858    0
313859    0
313860    0
313861    1
313862    0
313863    0
Name: cat1, dtype: int64
0         1
1         1
2         1
3         1
4         1
5         1
6         0
7         1
8         1
9         1
10        1
11        1
12        0
13        0
14        0
15        0
16        1
17        0


In [19]:
# enter the number of folds from xgb.cv
folds = 5
# running total of the per-fold validation MAE; divided by `folds` afterwards
cv_sum = 0
# passed to xgb.train as early_stopping_rounds
early_stopping = 25
# placeholder; rebound to the accumulated test predictions inside the fold loop
fpred = []
# collects each fold's best_iteration
xgb_rounds = []

In [20]:
# set up KFold that matches xgb.cv number of folds
kf = KFold(train.shape[0], n_folds=folds)

# Start the accumulators from scratch in this cell. They were previously only
# initialised in the configuration cell, so re-running the loop on its own
# kept adding to the old totals and inflated the averaged MAE and n_rounds.
cv_sum = 0
xgb_rounds = []

#######################################
#
# Define cross-validation variables
#
#######################################

# The booster settings are the same for every fold, so build them once.
# NOTE(review): the native xgboost parameter for the RNG seed is documented as
# `seed`; confirm the installed version honours `random_state` here.
params = {
    'booster': 'gbtree',
    'objective': "reg:linear",
    'eval_metric': 'mae',
    'eta': 0.1,
    'gamma': 0.5290,
    'min_child_weight': 4.2922,
    'colsample_bytree': 0.3085,
    'subsample': 0.9930,
    'max_depth': 7,
    'max_delta_step': 0,
    'silent': 1,
    'random_state': 1001,
}

for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d\n' % (i + 1))
    X_train, X_val = train[train_index], train[test_index]
    y_train, y_val = target[train_index], target[test_index]

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_val, label=y_val)
    # The last watchlist entry ('eval') is the one early stopping monitors.
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]

    ####################################
    #  Build Model
    ####################################

    # 100000 is only an upper bound on rounds; early stopping ends training.
    clf = xgb.train(params,
                    d_train,
                    100000,
                    watchlist,
                    early_stopping_rounds=early_stopping)

    ####################################
    #  Evaluate Model and Predict
    ####################################

    xgb_rounds.append(clf.best_iteration)
    scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
    # The target is log(loss), so exponentiate both sides to score MAE in the
    # original loss units.
    cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))
    print(' eval-MAE: %.6f' % cv_score)
    y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit))

    ####################################
    #  Add Predictions and Average Them
    ####################################

    # Running sum of the per-fold test predictions; averaged after the loop.
    fpred = y_pred if i == 0 else pred + y_pred
    pred = fpred
    cv_sum = cv_sum + cv_score

mpred = pred / folds
score = cv_sum / folds
print('\n Average eval-MAE: %.6f' % score)
# Mean best iteration across folds, truncated to an int.
n_rounds = int(np.mean(xgb_rounds))


 Fold 1

[0]	train-mae:6.46764	eval-mae:6.46368
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 25 rounds.
[1]	train-mae:5.82108	eval-mae:5.81689
[2]	train-mae:5.23911	eval-mae:5.23488
[3]	train-mae:4.71538	eval-mae:4.71096
[4]	train-mae:4.24404	eval-mae:4.23944
[5]	train-mae:3.81991	eval-mae:3.81532
[6]	train-mae:3.43811	eval-mae:3.4333
[7]	train-mae:3.09459	eval-mae:3.08964
[8]	train-mae:2.78545	eval-mae:2.78055
[9]	train-mae:2.50738	eval-mae:2.50239
[10]	train-mae:2.25726	eval-mae:2.25213
[11]	train-mae:2.03244	eval-mae:2.02713
[12]	train-mae:1.83054	eval-mae:1.82506
[13]	train-mae:1.64953	eval-mae:1.64377
[14]	train-mae:1.48755	eval-mae:1.48162
[15]	train-mae:1.34299	eval-mae:1.33709
[16]	train-mae:1.21461	eval-mae:1.20887
[17]	train-mae:1.10105	eval-mae:1.09578
[18]	train-mae:1.00155	eval-mae:0.996688
[19]	train-mae:0.914862	eval-mae:0.910386
[20]	train-mae:0.839937	eval-mae:0.835827
[21]	train-mae:

In [21]:
print("#\n Writing results")
# One row per test id, indexed by id, holding the fold-averaged prediction.
result = (
    pd.DataFrame(mpred, columns=['loss'])
    .assign(id=ids)
    .set_index("id")
)
print("\n %d-fold average prediction:\n" % folds)
print(result.head())

#
 Writing results

 5-fold average prediction:

           loss
id             
4   1474.300049
6   2071.088379
9   7915.320312
12  5771.557617
15   811.891479


In [22]:
now = datetime.now()
# Rounded average CV MAE as text for the file name. This rebinds `score`
# (a float after the fold loop) to a string, as the original cell did.
score = str(round((cv_sum / folds), 6))
# Take the fold count from `folds` rather than hard-coding "5fold", so the
# file name stays truthful when the number of folds is changed.
sub_file = join(SUB_PATH, 'submission_%dfold-average-xgb_%s_%s.csv' % (
    folds, score, now.strftime("%Y-%m-%d-%H-%M")))
# to_csv does not create missing directories; make sure the target exists so
# the predictions are not lost to an I/O error after training.
if not os.path.isdir(SUB_PATH):
    os.makedirs(SUB_PATH)
print("\n Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')


 Writing submission: submissions/submission_5fold-average-xgb_1146.109784_2016-10-20-10-26.csv
