In [1]:
__author__ = 'Evgeny'

FOLDER = "/home/evgeny/kaggle/input/"

import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
from operator import itemgetter
import zipfile
from sklearn.metrics import roc_auc_score
import time
random.seed(123)

pd.set_option("display.max_columns", 99)

In [2]:
def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance

def intersect(a, b):
    return list(set(a) & set(b))


def print_features_importance(imp):
    for i in range(len(imp)):
        print("# " + str(imp[i][1]))
        print('output.remove(\'' + imp[i][0] + '\')')

In [8]:
def run_default_test(train, test, features, target, random_state=0):
    eta = 0.1
    max_depth = 8
    subsample = 0.8
    colsample_bytree = 0.8

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state
    }
    num_boost_round = 200
    early_stopping_rounds = 20
    test_size = 0.1

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_ntree_limit)
    score = roc_auc_score(X_valid[target].values, check)
    print('Check error value: {:.6f}'.format(score))

    imp = get_importance(gbm, features)
    print('Importance array: ', imp)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_ntree_limit)

    return test_prediction.tolist(), score

def create_submission(score, test, prediction):
    # Make Submission
    sub_file = 'submission_' + str(score) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('id,probability\n')
    total = 0
    for id in test['id']:
        str1 = str(id) + ',' + str(prediction[total])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()

In [13]:
filename = "D1_23may.p"
pd_data = pd.read_hdf(FOLDER + filename)

In [14]:
features = pd_data.drop(['isDuplicate', 'id'], 1).columns.tolist()

In [15]:
train = pd_data[pd_data['isDuplicate'] >= 0]
train.drop(['id'],1, inplace=True)
test = pd_data[pd_data['isDuplicate'].isnull()]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [16]:
train.head()

Unnamed: 0,title_diff_len,title_dlevenshtein,title_jarowinkler,title_num_same,title_1_len,title_2_len,description_diff_len,description_dlevenshtein,description_jarowinkler,description_1_len,description_2_len,price_same,price_diff,attrsJSON_diff_len,attrsJSON_dlevenshtein,attrsJSON_jarowinkler,attrsJSON_1_len,attrsJSON_2_len,images_diff_number,metroID_same,locationID_same,regionID_same,haversine,categoryID_same,isDuplicate
0,0,0.0,1.0,1,17,17,0,0.0,1.0,25,25,1,0.0,0,0.0,1.0,27,27,1,0,1,1,-0.205083,1,1.0
1,13,0.666667,0.396825,1,21,8,116,0.666667,0.396825,164,48,0,0.2,0,0.666667,0.396825,35,35,1,0,1,1,-0.205083,1,0.0
2,15,0.695652,0.3907,1,8,23,11,0.695652,0.3907,48,37,0,0.090909,0,0.695652,0.3907,35,35,1,0,1,1,-0.205083,1,0.0
3,0,0.0,1.0,1,27,27,204,0.0,1.0,107,311,0,0.068826,0,0.0,1.0,359,359,5,0,1,1,-0.205083,1,1.0
4,0,0.0,1.0,1,27,27,91,0.0,1.0,311,220,0,0.068826,717,0.0,1.0,359,1076,6,0,1,1,-0.205083,1,1.0


In [17]:
test_prediction, score = run_default_test(train, test, features, 'isDuplicate')
create_submission(score, test, test_prediction)

XGBoost params. ETA: 0.1, MAX_DEPTH: 8, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8


Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.803295	eval-auc:0.802615
[1]	train-auc:0.815452	eval-auc:0.814929
[2]	train-auc:0.818370	eval-auc:0.817889
[3]	train-auc:0.819882	eval-auc:0.819445
[4]	train-auc:0.821609	eval-auc:0.821163
[5]	train-auc:0.822518	eval-auc:0.822048
[6]	train-auc:0.823995	eval-auc:0.823512
[7]	train-auc:0.825097	eval-auc:0.824628
[8]	train-auc:0.826243	eval-auc:0.825763
[9]	train-auc:0.828000	eval-auc:0.827489
[10]	train-auc:0.829072	eval-auc:0.828555
[11]	train-auc:0.830992	eval-auc:0.830508
[12]	train-auc:0.831492	eval-auc:0.830969
[13]	train-auc:0.832030	eval-auc:0.831491
[14]	train-auc:0.833449	eval-auc:0.832897
[15]	train-auc:0.834073	eval-auc:0.833510
[16]	train-auc:0.834836	eval-auc:0.834242
[17]	train-auc:0.835572	eval-auc:0.834921
[18]	train-auc:0.836425	eval-auc:0.835754
[19]	train-auc:0.837104	eval-auc:0.836412
[20]	train-auc:0.837709	eval-auc:0.837001
[21]	train-auc:0.838037	eval-auc:0.837291
[22]	train-auc:0.838578	eva

Validating...
Check error value: 0.870363
Importance array:  [('attrsJSON_1_len', 5108), ('attrsJSON_2_len', 4773), ('price_diff', 3856), ('description_2_len', 3334), ('description_1_len', 3154), ('title_dlevenshtein', 3059), ('description_diff_len', 3002), ('title_jarowinkler', 2850), ('attrsJSON_diff_len', 2350), ('title_2_len', 2243), ('haversine', 2191), ('title_1_len', 2191), ('title_diff_len', 1558), ('images_diff_number', 1117), ('description_jarowinkler', 662), ('title_num_same', 634), ('description_dlevenshtein', 605), ('metroID_same', 549), ('regionID_same', 357), ('locationID_same', 346), ('price_same', 196), ('attrsJSON_dlevenshtein', 91), ('attrsJSON_jarowinkler', 81)]
Predict test set...
Writing submission:  submission_0.870363357859.csv
