In [185]:
from __future__ import unicode_literals
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder


In [3]:
pd.options.display.max_columns = None

In [212]:
train = pd.read_csv('ItemInfo_trainfull.csv', encoding='utf-8', nrows=15000)

In [213]:
# Null values are numerical, impute with zero
train = train.fillna(value=0,axis=1)

In [214]:
train.isnull().sum()

itemID                 0
categoryID_x           0
title_x                0
description_x          0
images_array_x         0
attrsJSON_x            0
price_x                0
locationID_x           0
metroID_x              0
lat_x                  0
lon_x                  0
itemID_2               0
isDuplicate            0
generationMethod       0
categoryID_y           0
title_y                0
description_y          0
images_array_y         0
attrsJSON_y            0
price_y                0
locationID_y           0
metroID_y              0
lat_y                  0
lon_y                  0
parentCategoryID_x     0
parentCategoryID_y     0
regionID_x             0
regionID_y             0
lendiff_imagearray     0
priceDifference        0
latlonDifference       0
fuzz_ratio             0
lev_dist               0
jaro_dist              0
jarow_dist             0
description_x_clean    0
description_y_clean    0
intersect_BOW          0
sym_diff_BOW           0
clusters               0


In [215]:
nonnum_columns = [key for key in dict(train.dtypes) if dict(train.dtypes)[key] == 'object']

In [216]:
le = LabelEncoder()
for feature in nonnum_columns:
    train[feature] = le.fit_transform(train[feature])

In [217]:
y = train['isDuplicate']
x = train.drop('isDuplicate', axis=1)

In [218]:
labels = train.isDuplicate[:34500]

In [219]:
train.drop('isDuplicate', axis=1, inplace=True)

### Manual split of training data 

In [220]:
train_s = train.iloc[:34500,:]
test_s = train.iloc[34500:,:]

In [221]:
xgtest = xgb.DMatrix(test_s)

In [222]:
len(train_s.iloc[split:,:])

10500

In [223]:
split = 4500

In [224]:
xgtrain = xgb.DMatrix(train_s.iloc[split:,:], label=labels[split:])
xgval = xgb.DMatrix(train_s.iloc[:split,:], label=labels[:split])

In [225]:
param = {'max_depth':3, 'eta':0.1, 'silent':1, 'objective':'binary:logistic' }
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
num_round = 100
bst = xgb.train(param, xgtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=80)

[0]	train-error:0.28981	val-error:0.335111
Multiple eval metrics have been passed: 'val-error' will be used for early stopping.

Will train until val-error hasn't improved in 80 rounds.
[1]	train-error:0.284	val-error:0.310667
[2]	train-error:0.259714	val-error:0.263556
[3]	train-error:0.246667	val-error:0.260222
[4]	train-error:0.247619	val-error:0.261778
[5]	train-error:0.249048	val-error:0.260222
[6]	train-error:0.247905	val-error:0.261556
[7]	train-error:0.243524	val-error:0.257111
[8]	train-error:0.243048	val-error:0.258
[9]	train-error:0.241524	val-error:0.258222
[10]	train-error:0.242571	val-error:0.258
[11]	train-error:0.243238	val-error:0.258889
[12]	train-error:0.24	val-error:0.255556
[13]	train-error:0.238571	val-error:0.254889
[14]	train-error:0.237238	val-error:0.254667
[15]	train-error:0.236286	val-error:0.254889
[16]	train-error:0.232571	val-error:0.249333
[17]	train-error:0.230952	val-error:0.248
[18]	train-error:0.225048	val-error:0.236667
[19]	train-error:0.222286	val

In [226]:
preds = bst.predict(xgtest)
# len(preds)
preds = [round(value) for value in preds]

In [227]:
accuracy_score(y_test[:5000], preds)

ValueError: Found input variables with inconsistent numbers of samples: [5000, 0]

In [196]:
roc_auc_score(y_test[:5000], preds)

0.50375669827011327

In [173]:
xgb.XGBClassifier().get_xgb_params()

{'base_score': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'objective': 'binary:logistic',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': 1,
 'subsample': 1}

### Cross validation with XGBoost

In [170]:
bst_cv = xgb.cv(param, xgtrain, num_boost_round=1000, nfold=10, early_stopping_rounds=80, metrics=['auc', 'error'], stratified=True)

In [176]:
bst_cv

Unnamed: 0,test-auc-mean,test-auc-std,test-error-mean,test-error-std,train-auc-mean,train-auc-std,train-error-mean,train-error-std
0,0.758794,0.015767,0.289825,0.01804,0.760361,0.001767,0.28905,0.001969
1,0.802782,0.015924,0.267646,0.016321,0.808222,0.002354,0.262869,0.001761
2,0.818572,0.017488,0.268917,0.019483,0.82551,0.002065,0.260303,0.004287
3,0.822171,0.016184,0.267107,0.018521,0.835368,0.004684,0.253354,0.006308
4,0.832685,0.017789,0.251652,0.019065,0.844515,0.004521,0.243656,0.006782
5,0.842989,0.017593,0.244016,0.020183,0.855969,0.006981,0.232343,0.006827
6,0.850755,0.018323,0.240185,0.020815,0.864187,0.006505,0.223656,0.007023
7,0.856097,0.015921,0.232369,0.020996,0.872204,0.004592,0.215677,0.003877
8,0.861884,0.018637,0.22437,0.021199,0.877873,0.003055,0.208222,0.003812
9,0.86457,0.018169,0.223276,0.020401,0.882255,0.002456,0.204444,0.003035


In [228]:
bst_cv['test-auc-mean'].min()

0.75879380000000007

### Testing with train_test_split

In [8]:
seed = 1
test_size = 0.77
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size, random_state = seed)

In [11]:
train1 = pd.concat([x_train, y_train], axis=1)

In [18]:
test1 = pd.concat([x_test, y_test], axis=1)

In [38]:
train_xgb = xgb.DMatrix(train1,label=y_train)
test_xgb = xgb.DMatrix(test1, label=y_test)

In [39]:
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
num_round = 2
watchlist = [(test_xgb, 'eval'), (train_xgb, 'train')]

In [40]:
bst = xgb.train(param, train_xgb, num_round, watchlist)

[0]	eval-error:0	train-error:0
[1]	eval-error:0	train-error:0
