In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
from scipy.stats import skew, boxcox
from math import exp, log
import xgboost as xgb
import numpy as np
from datetime import datetime
from os.path import join
#from bayes_opt import BayesianOptimization

In [2]:
# Global variables
SUB_PATH = 'submissions/'

In [3]:
def scale_data(X, scaler=None):
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

In [4]:
def load_data(path_train='data/train.csv', path_test='data/test.csv'):
    train_loader = pd.read_csv(path_train, dtype={'id': np.int32})
    train = train_loader.drop(['id', 'loss'], axis=1)
    test_loader = pd.read_csv(path_test, dtype={'id': np.int32})
    test = test_loader.drop(['id'], axis=1)
    ntrain = train.shape[0]
    ntest = test.shape[0]
    train_test = pd.concat((train, test)).reset_index(drop=True)
    numeric_feats = train_test.dtypes[train_test.dtypes != "object"].index

    # compute skew and do Box-Cox transformation
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    print("\nSkew in numeric features:")
    print(skewed_feats)
    # transform features with skew > 0.25 (this can be varied to find optimal value)
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index
    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
        
    del(skewed_feats)    
    features = train.columns
    cats = [feat for feat in features if 'cat' in feat]
    #category_count = train_test[cats].apply(lambda x:x.value_counts(),axis=1)
    #category_count = pd.read_csv('category_count.csv',header=0,index_col=0)
    #category_count = category_count.fillna(0)

    # factorize categorical features
    for feat in cats:
        labels,unique = pd.factorize(train_test[feat], sort=True)
        train_test[feat] = labels
        #dummies = pd.get_dummies(train_test[feat])
        #dummies.columns = unique
        #dummies = dummies.add_prefix(feat+"_")
        #train_test = train_test.join(dummies)
    #train_test = train_test.join(pd.DataFrame(0,index=train_test.index,columns=categories))
    
    x_train = train_test.iloc[:ntrain, :]
    x_test = train_test.iloc[ntrain:, :]
    train_test_scaled, scaler = scale_data(train_test)
    train, _ = scale_data(x_train, scaler)
    test, _ = scale_data(x_test, scaler)
    #train = np.append(train,category_count.iloc[:ntrain,:].as_matrix(),axis=1)
    #test = np.append(test,category_count.iloc[ntrain:,:].as_matrix(),axis=1)
    
    train_labels = np.log(np.array(train_loader['loss']))
    train_ids = train_loader['id'].values.astype(np.int32)
    test_ids = test_loader['id'].values.astype(np.int32)

    return train, train_labels, test, train_ids, test_ids

In [5]:
# Load data set and target values
train, target, test, _, ids = load_data()
d_train_full = xgb.DMatrix(train, label=target)
d_test = xgb.DMatrix(test)


Skew in numeric features:
cont1     0.516420
cont2    -0.310939
cont3    -0.010002
cont4     0.416093
cont5     0.681617
cont6     0.461211
cont7     0.826046
cont8     0.676629
cont9     1.072420
cont10    0.354998
cont11    0.280819
cont12    0.291990
cont13    0.380739
cont14    0.248672
dtype: float64


In [6]:
# enter the number of folds from xgb.cv
folds = 5
cv_sum = 0
early_stopping = 25
fpred = []
xgb_rounds = []

In [7]:
# set up KFold that matches xgb.cv number of folds
kf = KFold(train.shape[0], n_folds=folds)
for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d\n' % (i + 1))
    X_train, X_val = train[train_index], train[test_index]
    y_train, y_val = target[train_index], target[test_index]

#######################################
#
# Define cross-validation variables
#
#######################################

    params = {}
    params['booster'] = 'gbtree'
    params['objective'] = "reg:linear"
    params['eval_metric'] = 'mae'
    params['eta'] = 0.1
    params['gamma'] = 0.5290
    params['min_child_weight'] = 4.2922
    params['colsample_bytree'] = 0.3085
    params['subsample'] = 0.9930
    params['max_depth'] = 7
    params['max_delta_step'] = 0
    params['silent'] = 1
    params['random_state'] = 1001

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]

####################################
#  Build Model
####################################

    clf = xgb.train(params,
                    d_train,
                    100000,
                    watchlist,
                    early_stopping_rounds=early_stopping)

####################################
#  Evaluate Model and Predict
####################################

    xgb_rounds.append(clf.best_iteration)
    scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
    cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))
    print(' eval-MAE: %.6f' % cv_score)
    y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit))

####################################
#  Add Predictions and Average Them
####################################

    if i > 0:
        fpred = pred + y_pred
    else:
        fpred = y_pred
    pred = fpred
    cv_sum = cv_sum + cv_score
    
mpred = pred / folds
score = cv_sum / folds
print('\n Average eval-MAE: %.6f' % score)
n_rounds = int(np.mean(xgb_rounds))


 Fold 1

[0]	train-mae:6.46767	eval-mae:6.46343
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 25 rounds.
[1]	train-mae:5.82111	eval-mae:5.81655
[2]	train-mae:5.23915	eval-mae:5.23424
[3]	train-mae:4.71542	eval-mae:4.71023
[4]	train-mae:4.24406	eval-mae:4.23893
[5]	train-mae:3.81985	eval-mae:3.8148
[6]	train-mae:3.43812	eval-mae:3.43276
[7]	train-mae:3.09459	eval-mae:3.0892
[8]	train-mae:2.78548	eval-mae:2.78012
[9]	train-mae:2.50735	eval-mae:2.50186
[10]	train-mae:2.25727	eval-mae:2.25153
[11]	train-mae:2.03245	eval-mae:2.02658
[12]	train-mae:1.83057	eval-mae:1.82453
[13]	train-mae:1.64953	eval-mae:1.64337
[14]	train-mae:1.48747	eval-mae:1.48128
[15]	train-mae:1.34284	eval-mae:1.33653
[16]	train-mae:1.21418	eval-mae:1.20808
[17]	train-mae:1.10052	eval-mae:1.09477
[18]	train-mae:1.00088	eval-mae:0.995439
[19]	train-mae:0.913968	eval-mae:0.908863
[20]	train-mae:0.838664	eval-mae:0.834121
[21]	train-mae:0

In [8]:
print("#\n Writing results")
result = pd.DataFrame(mpred, columns=['loss'])
result["id"] = ids
result = result.set_index("id")
print("\n %d-fold average prediction:\n" % folds)
print(result.head())

#
 Writing results

 5-fold average prediction:

           loss
id             
4   1464.001953
6   1976.753540
9   8810.784180
12  6470.567871
15   829.028809


In [9]:
now = datetime.now()
score = str(round((cv_sum / folds), 6))
sub_file = join(SUB_PATH,'submission_5fold-average-xgb_' + str(score) + '_' + str(
    now.strftime("%Y-%m-%d-%H-%M")) + '.csv')
print("\n Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')


 Writing submission: submissions/submission_5fold-average-xgb_1144.266474_2016-10-23-15-51.csv
