In [14]:
__author__ = 'Evgeny'

# --- Imports: standard library, then third-party ---------------------------
import os
import random
import time
import zipfile
from operator import itemgetter

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# ``sklearn.cross_validation`` was deprecated and later removed from
# scikit-learn; prefer ``sklearn.model_selection`` and fall back to the old
# module only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# --- Configuration ----------------------------------------------------------
# Directory holding the input data. The original absolute path remains the
# default; set the KAGGLE_INPUT_DIR environment variable to run elsewhere.
# ``os.path.join(..., "")`` guarantees a trailing separator because later
# cells build paths with plain string concatenation (``FOLDER + filename``).
FOLDER = os.path.join(
    os.environ.get("KAGGLE_INPUT_DIR", "/home/evgeny/kaggle/input/"), ""
)

# Seed Python's global RNG so any use of ``random`` is repeatable.
random.seed(123)

# Show up to 99 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 99)

In [11]:
def create_feature_map(features):
    """Write a feature-map file named ``xgb.fmap`` in the working directory.

    One tab-separated line is written per feature: the feature's position in
    ``features``, the feature name, and the literal type field ``q``. Any
    existing ``xgb.fmap`` is overwritten.

    :param features: iterable of feature names, in model column order.
    """
    # ``with`` guarantees the handle is closed even if a write fails part-way.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def get_importance(gbm, features):
    """Return ``(feature, fscore)`` pairs for ``gbm``, highest score first.

    The feature-map file ``xgb.fmap`` is rewritten from ``features`` before
    ``gbm.get_fscore`` is queried with it.

    :param gbm: trained booster exposing ``get_fscore(fmap=...)``.
    :param features: feature names, passed to ``create_feature_map``.
    :return: list of ``(name, score)`` tuples sorted by descending score.
    """
    create_feature_map(features)
    fscores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = list(fscores.items())
    # Stable sort, so ties keep the order in which ``get_fscore`` listed them.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def intersect(a, b):
    """Return the elements present in both ``a`` and ``b``, without duplicates.

    The result lists each common element once, in the order of its first
    appearance in ``a``. Building the list straight from a set intersection
    would yield an arbitrary order that can change between interpreter runs
    (string hashing is randomised), which makes downstream feature ordering
    non-reproducible.

    :param a: iterable of hashable items; determines the output order.
    :param b: iterable of hashable items.
    :return: list of the common items.
    """
    b_items = set(b)
    emitted = set()
    common = []
    for item in a:
        if item in b_items and item not in emitted:
            emitted.add(item)
            common.append(item)
    return common


def print_features_importance(imp):
    """Print every importance entry as two lines of ready-to-paste text.

    For each ``(name, score)`` entry the score is printed as a ``#`` comment
    line, followed by an ``output.remove('<name>')`` statement line.

    :param imp: sequence of ``(name, score)`` pairs, e.g. the result of
        ``get_importance``.
    """
    for entry in imp:
        print("# " + str(entry[1]))
        print("output.remove('" + entry[0] + "')")

In [16]:
def run_default_test(train, test, features, target, random_state=0,
                     eta=0.1, max_depth=8, subsample=0.8, colsample_bytree=0.8,
                     num_boost_round=100, early_stopping_rounds=20,
                     test_size=0.1):
    """Train an XGBoost binary classifier and predict the test set.

    ``train`` is split into a training part and a hold-out part; the model is
    fitted with early stopping monitored on the hold-out AUC, scored on the
    hold-out rows with ``roc_auc_score``, and then applied to ``test``.

    :param train: labelled DataFrame containing ``features`` and ``target``.
    :param test: DataFrame containing ``features`` to predict on.
    :param features: list of column names used as model inputs.
    :param target: name of the label column in ``train``.
    :param random_state: seed used for both the hold-out split and XGBoost.
    :param eta: XGBoost ``eta`` parameter.
    :param max_depth: XGBoost ``max_depth`` parameter.
    :param subsample: XGBoost ``subsample`` parameter.
    :param colsample_bytree: XGBoost ``colsample_bytree`` parameter.
    :param num_boost_round: maximum number of boosting rounds.
    :param early_stopping_rounds: rounds without improvement on the hold-out
        set before training stops.
    :param test_size: fraction of ``train`` held out for validation.
    :return: ``(test_prediction, score)`` where ``test_prediction`` is a list
        of predicted values for ``test`` and ``score`` is the hold-out ROC AUC.
    """
    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state
    }

    # Hold out part of the labelled data for early stopping and scoring.
    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    # The last watchlist entry ('eval') is the one early stopping monitors.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    # NOTE(review): ``ntree_limit`` / ``best_ntree_limit`` belong to the older
    # XGBoost API this notebook was written against -- confirm they exist in
    # the installed version before upgrading.
    print("Validating...")
    # Reuse the validation DMatrix and labels built above instead of
    # constructing a second copy of the same data.
    check = gbm.predict(dvalid, ntree_limit=gbm.best_ntree_limit)
    score = roc_auc_score(y_valid.values, check)
    # The message says "error", but the value printed is the ROC AUC
    # (higher is better).
    print('Check error value: {:.6f}'.format(score))

    imp = get_importance(gbm, features)
    print('Importance array: ', imp)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_ntree_limit)

    return test_prediction.tolist(), score

def create_submission(score, test, prediction):
    """Write predictions to ``submission_<score>.csv`` in the working directory.

    The file starts with the header ``id,probability`` followed by one
    ``<id>,<prediction>`` line per entry of ``test['id']``; ids and
    predictions are paired by position.

    :param score: value embedded (via ``str``) in the output file name.
    :param test: mapping/DataFrame whose ``'id'`` entry yields the row ids.
    :param prediction: indexable sequence of predictions, one per id.
    :raises IndexError: if ``prediction`` has fewer items than ``test['id']``.
    """
    sub_file = 'submission_' + str(score) + '.csv'
    print('Writing submission: ', sub_file)
    # ``with`` closes the file even when a row fails to serialise.
    with open(sub_file, 'w') as f:
        f.write('id,probability\n')
        # Index into ``prediction`` (rather than zip) so a too-short
        # prediction list fails loudly instead of truncating the file.
        for row, item_id in enumerate(test['id']):
            f.write(str(item_id) + ',' + str(prediction[row]) + '\n')

In [3]:
# Dataset file name; it is appended directly to FOLDER to form the full path.
# NOTE(review): the ".p" extension suggests a pickle, yet the file is read
# with pd.read_hdf -- confirm the on-disk format.
filename = "D1_20may.p"
# Load the full dataset (rows with and without a label) into one DataFrame.
pd_data = pd.read_hdf(FOLDER + filename)

In [5]:
# Every column except the label ('isDuplicate') and the row id ('id') is a
# model feature. ``axis`` is passed by keyword because recent pandas releases
# no longer accept it as a positional argument to ``drop``.
features = pd_data.drop(['isDuplicate', 'id'], axis=1).columns.tolist()

In [7]:
# Rows with a label (isDuplicate >= 0; NaN compares False) form the training
# set; rows whose label is missing form the set to predict.
train = pd_data.loc[pd_data['isDuplicate'] >= 0]
test = pd_data.loc[pd_data['isDuplicate'].isnull()]

In [15]:
# Preview the first rows of the labelled split as a quick sanity check.
train.head()

Unnamed: 0,title_diff_len,title_dlevenshtein,title_jarowinkler,title_num_same,description_diff_len,description_dlevenshtein,description_jarowinkler,price_diff,attrsJSON_diff_len,attrsJSON_dlevenshtein,attrsJSON_jarowinkler,images_diff_number,metroID_same,locationID_same,regionID_same,haversine,categoryID_same,isDuplicate,id
0,0,0.0,1.0,1,0,0.0,1.0,0.0,0,0.0,1.0,1,0,1,1,-0.205083,1,1.0,
1,13,0.666667,0.396825,1,116,0.666667,0.396825,0.2,0,0.666667,0.396825,1,0,1,1,-0.205083,1,0.0,
2,15,0.695652,0.3907,1,11,0.695652,0.3907,0.090909,0,0.695652,0.3907,1,0,1,1,-0.205083,1,0.0,
3,0,0.0,1.0,1,204,0.0,1.0,0.068826,0,0.0,1.0,5,0,1,1,-0.205083,1,1.0,
4,0,0.0,1.0,1,91,0.0,1.0,0.068826,717,0.0,1.0,6,0,1,1,-0.205083,1,1.0,


In [17]:
# Train on the labelled rows with 'isDuplicate' as the target and predict the
# unlabelled rows; `score` is the hold-out score returned by run_default_test.
test_prediction, score = run_default_test(train, test, features, 'isDuplicate')
# Write the predictions to a CSV whose file name embeds the score.
create_submission(score, test, test_prediction)

XGBoost params. ETA: 0.1, MAX_DEPTH: 8, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8


Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.794787	eval-auc:0.794400
[1]	train-auc:0.798475	eval-auc:0.798039
[2]	train-auc:0.800580	eval-auc:0.800165
[3]	train-auc:0.802826	eval-auc:0.802381
[4]	train-auc:0.803347	eval-auc:0.802842
[5]	train-auc:0.804161	eval-auc:0.803672
[6]	train-auc:0.806130	eval-auc:0.805668
[7]	train-auc:0.806499	eval-auc:0.805960
[8]	train-auc:0.807222	eval-auc:0.806648
[9]	train-auc:0.807417	eval-auc:0.806844
[10]	train-auc:0.807770	eval-auc:0.807158
[11]	train-auc:0.808258	eval-auc:0.807644
[12]	train-auc:0.808809	eval-auc:0.808215
[13]	train-auc:0.809310	eval-auc:0.808726
[14]	train-auc:0.809600	eval-auc:0.808978
[15]	train-auc:0.810644	eval-auc:0.809962
[16]	train-auc:0.811122	eval-auc:0.810443
[17]	train-auc:0.811386	eval-auc:0.810686
[18]	train-auc:0.811733	eval-auc:0.811003
[19]	train-auc:0.812031	eval-auc:0.811305
[20]	train-auc:0.812258	eval-auc:0.811494
[21]	train-auc:0.813061	eval-auc:0.812239
[22]	train-auc:0.813411	eva

Validating...
Check error value: 0.824366
Importance array:  [('price_diff', 3674), ('title_jarowinkler', 2969), ('haversine', 2719), ('description_diff_len', 2616), ('attrsJSON_diff_len', 2467), ('title_dlevenshtein', 2375), ('title_diff_len', 1568), ('images_diff_number', 1158), ('description_dlevenshtein', 892), ('description_jarowinkler', 567), ('title_num_same', 510), ('regionID_same', 401), ('metroID_same', 367), ('locationID_same', 330), ('attrsJSON_dlevenshtein', 291), ('attrsJSON_jarowinkler', 87)]
Predict test set...
Writing submission:  submission_0.82436599799.csv
