In [1]:
%matplotlib inline

import datetime

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sp

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import xgboost as xgb

## A: Start

In [11]:
print("start")
# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a trusted location.
df_train = pd.read_pickle('/mnt/sframe/df_train')
df_test = pd.read_pickle('/mnt/sframe/df_test')

# Columns whose missing values are replaced by the empty string.
EMPTY_STRING_FILL_COLS = ['text', 'meta_names']

# Columns whose missing values are replaced by 0.
ZERO_FILL_COLS = [u'viewport.0', u'viewport.1',
                  u'viewport.2', u'links_profile.bad_domain',
                  u'links_profile.candidate_rank', u'links_profile.candidate_ratio',
                  u'links_profile.candidate_score', u'links_profile.candidate_site',
                  u'links_profile.domain_set_size', u'links_profile.no_domain',
                  u'links_profile.total_ratio']

for frame in (df_train, df_test):
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # `frame[col]`: the in-place call works on a column selection, which is not
    # guaranteed to write through to the parent frame (and does not under
    # pandas Copy-on-Write).
    for col in EMPTY_STRING_FILL_COLS:
        frame[col] = frame[col].fillna('')
    for col in ZERO_FILL_COLS:
        frame[col] = frame[col].fillna(0)

start


In [9]:
# Display the column index of the training frame.
df_train.columns

Index([u'sponsored', u'id', u'text', u'num_words', u'a_href', u'par', u'title',
       u'img', u'btn', u'input', u'li', u'link', u'meta', u'script_avg',
       u'script_b_avg', u'script_cnt', u'script_b_cnt', u'style_avg',
       u'style_cnt', u'fb_id', u'ga_cnt', u'ga_id', u'ga_len', u'ga_line',
       u'ga_subid', u'word_set_size', u'word_set_size_ratio',
       u'word_set_size2', u'word_set_size2_ratio', u'shinn.braces',
       u'shinn.brackets', u'shinn.cheap', u'shinn.col', u'shinn.commas',
       u'shinn.get', u'shinn.length', u'shinn.lines', u'shinn.parens',
       u'shinn.periods', u'shinn.sale', u'shinn.semi', u'shinn.shop',
       u'shinn.spaces', u'shinn.tabs', u'shinn.uppers', u'shinn.words',
       u'shinn.you', u'basic.0', u'basic.1', u'basic.2', u'basic.3',
       u'basic.4', u'basic.5', u'meta_names', u'viewport.0', u'viewport.1',
       u'viewport.2', u'links_profile.bad_domain',
       u'links_profile.candidate_rank', u'links_profile.candidate_ratio',
       u'links_p

In [12]:
# Shape of the subset of rows where 'links_profile.total_ratio' is null
# (the row count shows how many such rows there are).
df_train.loc[df_train['links_profile.total_ratio'].isnull()].shape

(0, 65)

### B.1: test

In [None]:
print("split")
# Hold out 20% of df_train; the fixed random_state keeps the partition
# reproducible between runs.
# NOTE(review): train_test_split is imported from sklearn.cross_validation at
# the top of the notebook; newer scikit-learn releases expose it from
# sklearn.model_selection instead - confirm against the installed version.
train_train, train_cv = train_test_split(
    df_train,
    random_state=107,
    test_size=0.2,
)

In [13]:
# "test" mode: fit on the training part of the split, evaluate on the held-out part.
TRAIN, CV = train_train, train_cv

split


### B.2: deploy

In [26]:
# "deploy" mode: fit on the full training frame, score the test frame.
TRAIN, CV = df_train, df_test

## C: Model

In [27]:
print("BOW")
# The vocabulary is learned from TRAIN['text'] only; CV['text'] is projected
# onto that same vocabulary.
count_vect = CountVectorizer(min_df=20, stop_words='english')
TRAIN_counts = count_vect.fit_transform(TRAIN['text'])
CV_counts = count_vect.transform(CV['text'])
print("shape TRAIN_counts: %s" % (TRAIN_counts.shape,))

BOW
shape TRAIN_counts: (337024, 108062)


In [28]:
print("BOW meta")


def my_tokenizer(s):
    """Split a comma-separated string into its pieces.

    The pieces are returned exactly as they appear between commas: no
    whitespace is trimmed and empty pieces are kept, so '' yields [''].
    """
    pieces = s.split(',')
    return pieces
# Count comma-separated meta names, using my_tokenizer instead of the default
# word tokenizer. The vocabulary comes from TRAIN only.
count_vect_meta = CountVectorizer(min_df=10, tokenizer=my_tokenizer)
TRAIN_meta_counts = count_vect_meta.fit_transform(TRAIN['meta_names'])
CV_meta_counts = count_vect_meta.transform(CV['meta_names'])
print("shape TRAIN_meta_counts: %s" % (TRAIN_meta_counts.shape,))

BOW meta
shape TRAIN_meta_counts: (337024, 3898)


In [29]:
print("TF-IDF")
# Fit the weighting on the TRAIN counts, then apply the fitted transformer
# to both count matrices.
tfidf_transformer = TfidfTransformer().fit(TRAIN_counts)
TRAIN_tfidf = tfidf_transformer.transform(TRAIN_counts)
CV_tfidf = tfidf_transformer.transform(CV_counts)
print("shape TRAIN_tfidf: %s" % (TRAIN_tfidf.shape,))

TF-IDF
shape TRAIN_tfidf: (337024, 108062)


In [30]:
print("Combine features")
# Columns kept out of the dense numeric block: the raw text / meta strings
# (vectorised separately above), the id, the label, and any 'word2vec*' column.
NON_FEATURE_COLS = frozenset(['text_words', 'text', 'id', 'sponsored', 'meta_names'])
bad_cols = [col for col in TRAIN.columns
            if col.startswith('word2vec') or col in NON_FEATURE_COLS]

# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected
# by recent pandas releases.
DF_TRAIN = TRAIN.drop(bad_cols, axis=1).fillna(0).astype('float64')
# Select CV by the TRAIN feature columns so both matrices have the same columns
# in the same order. This raises KeyError if a feature column is missing from
# CV, and no longer requires CV to contain every dropped column.
DF_CV = CV[DF_TRAIN.columns].fillna(0).astype('float64')

# Final layout, left to right: meta-name counts | text TF-IDF | dense columns.
DF_DF_TRAIN = sp.hstack([TRAIN_meta_counts, TRAIN_tfidf, sp.csr_matrix(DF_TRAIN.values)])
DF_DF_CV = sp.hstack([CV_meta_counts, CV_tfidf, sp.csr_matrix(DF_CV.values)])

print("shape DF_TRAIN: %s" % (str(DF_TRAIN.shape)))

Combine features
shape DF_TRAIN: (337024, 61)


In [31]:
# Shape of the stacked sparse training matrix.
print("shape DF_DF_TRAIN: %s" % (DF_DF_TRAIN.shape,))

shape DF_DF_TRAIN: (337024, 112021)


## D: Combine

### D.1: test

In [21]:
# "test" mode: both matrices carry the 'sponsored' label so the held-out set
# can be scored during training.
dtrain, dtest = (
    xgb.DMatrix(features, label=frame['sponsored'])
    for features, frame in ((DF_DF_TRAIN, TRAIN), (DF_DF_CV, CV))
)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

In [22]:
# Write both matrices to xgboost binary buffers so they can be reloaded later.
for matrix, buffer_path in ((dtrain, 'dtrain_train.buffer_mindf20'),
                            (dtest, 'dtrain_cv.buffer_mindf20')):
    matrix.save_binary(buffer_path)

In [34]:
# Reload the "test" mode matrices from their binary buffers.
dtrain, dtest = map(xgb.DMatrix, ('dtrain_train.buffer_mindf20',
                                  'dtrain_cv.buffer_mindf20'))
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
# Row counts of the two matrices.
(dtrain.num_row(), dtest.num_row())

(269619L, 67405L)

### D.2: deploy

In [32]:
# "deploy" mode: dtest is built without labels, so only the training set is watched.
dtest = xgb.DMatrix(DF_DF_CV)
dtrain = xgb.DMatrix(DF_DF_TRAIN, label=TRAIN['sponsored'])
watchlist = [(dtrain, 'train')]

In [33]:
# Save for later: write the deploy-mode matrices to binary buffers.
for matrix, buffer_path in ((dtrain, 'orig_dtrain.buffer_mindf20'),
                            (dtest, 'orig_dtest.buffer_mindf20')):
    matrix.save_binary(buffer_path)

In [None]:
# Reload the deploy-mode matrices from their binary buffers.
dtrain, dtest = map(xgb.DMatrix, ('orig_dtrain.buffer_mindf20',
                                  'orig_dtest.buffer_mindf20'))
watchlist = [(dtrain, 'train')]
# Row counts of the two matrices.
(dtrain.num_row(), dtest.num_row())

---

## E: Params

---

## M-base-version

In [23]:
# Author's earlier note ("match g"):
#   max depth [8], eta 0.2, colsample_bytree 0.5, subsample: 1.0

# Tree shape and sampling settings.
param = {
    'max_depth': 17,
    'eta': 0.05,
    # 'gamma': 1.0,
    'colsample_bytree': 0.6,
    'subsample': 1.0,
    'min_child_weight': 7,
}
# Objective, metric and logging settings.
# NOTE(review): xgboost documents early_stopping_rounds as an argument of
# xgb.train(), not a booster parameter - confirm it has any effect from here.
param.update({
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'early_stopping_rounds': 20,
})
num_round = 800

# Timestamp for the run log.
print(datetime.datetime.now())

2015-10-09 08:01:23.839742


In [48]:
# Author's earlier note ("match g"):
#   max depth [8], eta 0.2, colsample_bytree 0.5, subsample: 1.0

# Tree shape and sampling settings ('min_child_weight' is left disabled here).
param = {
    'max_depth': 18,
    'eta': 0.01,
    'colsample_bytree': 0.6,
    'subsample': 1.0,
    # 'min_child_weight': 15,
}
# Objective, metric and logging settings.
# NOTE(review): xgboost documents early_stopping_rounds as an argument of
# xgb.train(), not a booster parameter - confirm it has any effect from here.
param.update({
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'early_stopping_rounds': 20,
})
num_round = 200

# Timestamp for the run log.
print(datetime.datetime.now())

2015-10-09 08:36:56.302354


In [46]:
# 'early_stopping_rounds' is an argument of xgb.train(), not a booster
# parameter, so leaving it inside `param` never activates early stopping.
# Lift it out of a copy (the shared `param` dict stays untouched) and pass it
# explicitly.
booster_params = dict(param)
stopping_rounds = booster_params.pop('early_stopping_rounds', None)

# xgboost early-stops on the LAST evaluation set, so move the held-out 'eval'
# entry to the end. With no 'eval' entry (only the training set is watched),
# early stopping is skipped.
evals = sorted(watchlist, key=lambda entry: entry[1] == 'eval')
if not any(name == 'eval' for _, name in evals):
    stopping_rounds = None

bst = xgb.train(booster_params, dtrain, num_round, evals,
                early_stopping_rounds=stopping_rounds)  #0.5

[0]	eval-auc:0.725346	train-auc:0.735978
[1]	eval-auc:0.771739	train-auc:0.789144
[2]	eval-auc:0.787437	train-auc:0.803569
[3]	eval-auc:0.804280	train-auc:0.821485
[4]	eval-auc:0.809663	train-auc:0.827755
[5]	eval-auc:0.821946	train-auc:0.841887
[6]	eval-auc:0.823833	train-auc:0.843401
[7]	eval-auc:0.825634	train-auc:0.845887
[8]	eval-auc:0.828254	train-auc:0.848978
[9]	eval-auc:0.830688	train-auc:0.852131
[10]	eval-auc:0.830293	train-auc:0.851779
[11]	eval-auc:0.830594	train-auc:0.852338
[12]	eval-auc:0.829838	train-auc:0.851645
[13]	eval-auc:0.832120	train-auc:0.854238
[14]	eval-auc:0.831608	train-auc:0.853528
[15]	eval-auc:0.831258	train-auc:0.853727
[16]	eval-auc:0.831494	train-auc:0.854315
[17]	eval-auc:0.832392	train-auc:0.855494
[18]	eval-auc:0.834110	train-auc:0.858004
[19]	eval-auc:0.834666	train-auc:0.858457
[20]	eval-auc:0.835382	train-auc:0.859379
[21]	eval-auc:0.836420	train-auc:0.860188
[22]	eval-auc:0.835976	train-auc:0.859704
[23]	eval-auc:0.836558	train-auc:0.860223
[2

KeyboardInterrupt: 

In [40]:
# 'early_stopping_rounds' is an argument of xgb.train(), not a booster
# parameter, so leaving it inside `param` never activates early stopping.
# Lift it out of a copy (the shared `param` dict stays untouched) and pass it
# explicitly.
booster_params = dict(param)
stopping_rounds = booster_params.pop('early_stopping_rounds', None)

# xgboost early-stops on the LAST evaluation set, so move the held-out 'eval'
# entry to the end. With no 'eval' entry (only the training set is watched),
# early stopping is skipped.
evals = sorted(watchlist, key=lambda entry: entry[1] == 'eval')
if not any(name == 'eval' for _, name in evals):
    stopping_rounds = None

bst = xgb.train(booster_params, dtrain, num_round, evals,
                early_stopping_rounds=stopping_rounds)  #0.4

[0]	eval-auc:0.725346	train-auc:0.735978
[1]	eval-auc:0.771739	train-auc:0.789144
[2]	eval-auc:0.796940	train-auc:0.813029
[3]	eval-auc:0.803177	train-auc:0.818240
[4]	eval-auc:0.807779	train-auc:0.823322
[5]	eval-auc:0.812233	train-auc:0.828694
[6]	eval-auc:0.817412	train-auc:0.834790
[7]	eval-auc:0.818460	train-auc:0.836556
[8]	eval-auc:0.821835	train-auc:0.840497
[9]	eval-auc:0.820524	train-auc:0.839721
[10]	eval-auc:0.822254	train-auc:0.842438
[11]	eval-auc:0.825392	train-auc:0.847585
[12]	eval-auc:0.825772	train-auc:0.847922
[13]	eval-auc:0.827668	train-auc:0.849291
[14]	eval-auc:0.829314	train-auc:0.850923
[15]	eval-auc:0.830189	train-auc:0.852203
[16]	eval-auc:0.832998	train-auc:0.855660
[17]	eval-auc:0.833215	train-auc:0.855924
[18]	eval-auc:0.835164	train-auc:0.857554
[19]	eval-auc:0.834785	train-auc:0.857204
[20]	eval-auc:0.834745	train-auc:0.857545
[21]	eval-auc:0.836459	train-auc:0.860131
[22]	eval-auc:0.837202	train-auc:0.861413
[23]	eval-auc:0.838766	train-auc:0.863654
[2

KeyboardInterrupt: 

In [42]:
# 'early_stopping_rounds' is an argument of xgb.train(), not a booster
# parameter, so leaving it inside `param` never activates early stopping.
# Lift it out of a copy (the shared `param` dict stays untouched) and pass it
# explicitly.
booster_params = dict(param)
stopping_rounds = booster_params.pop('early_stopping_rounds', None)

# xgboost early-stops on the LAST evaluation set, so move the held-out 'eval'
# entry to the end. With no 'eval' entry (only the training set is watched),
# early stopping is skipped.
evals = sorted(watchlist, key=lambda entry: entry[1] == 'eval')
if not any(name == 'eval' for _, name in evals):
    stopping_rounds = None

bst = xgb.train(booster_params, dtrain, num_round, evals,
                early_stopping_rounds=stopping_rounds)  #0.3

[0]	eval-auc:0.729769	train-auc:0.743844
[1]	eval-auc:0.770522	train-auc:0.790120
[2]	eval-auc:0.780034	train-auc:0.800657
[3]	eval-auc:0.803350	train-auc:0.821476
[4]	eval-auc:0.806560	train-auc:0.825656
[5]	eval-auc:0.811846	train-auc:0.829909
[6]	eval-auc:0.811101	train-auc:0.830896
[7]	eval-auc:0.816652	train-auc:0.837054
[8]	eval-auc:0.815831	train-auc:0.837278
[9]	eval-auc:0.816979	train-auc:0.837958
[10]	eval-auc:0.820713	train-auc:0.842588
[11]	eval-auc:0.822022	train-auc:0.843522
[12]	eval-auc:0.822302	train-auc:0.844085
[13]	eval-auc:0.828519	train-auc:0.851191
[14]	eval-auc:0.828639	train-auc:0.850900
[15]	eval-auc:0.832367	train-auc:0.855613
[16]	eval-auc:0.832864	train-auc:0.856781
[17]	eval-auc:0.835240	train-auc:0.859067
[18]	eval-auc:0.835793	train-auc:0.860321
[19]	eval-auc:0.834950	train-auc:0.859506
[20]	eval-auc:0.836435	train-auc:0.861130
[21]	eval-auc:0.839450	train-auc:0.863966
[22]	eval-auc:0.841697	train-auc:0.866013
[23]	eval-auc:0.842129	train-auc:0.866310
[2

KeyboardInterrupt: 

In [36]:
# 'early_stopping_rounds' is an argument of xgb.train(), not a booster
# parameter, so leaving it inside `param` never activates early stopping.
# Lift it out of a copy (the shared `param` dict stays untouched) and pass it
# explicitly.
booster_params = dict(param)
stopping_rounds = booster_params.pop('early_stopping_rounds', None)

# xgboost early-stops on the LAST evaluation set, so move the held-out 'eval'
# entry to the end. With no 'eval' entry (only the training set is watched),
# early stopping is skipped.
evals = sorted(watchlist, key=lambda entry: entry[1] == 'eval')
if not any(name == 'eval' for _, name in evals):
    stopping_rounds = None

bst = xgb.train(booster_params, dtrain, num_round, evals,
                early_stopping_rounds=stopping_rounds)

[0]	eval-auc:0.725346	train-auc:0.735978
[1]	eval-auc:0.791019	train-auc:0.817665
[2]	eval-auc:0.829401	train-auc:0.865907
[3]	eval-auc:0.856240	train-auc:0.902103
[4]	eval-auc:0.878392	train-auc:0.940189
[5]	eval-auc:0.895176	train-auc:0.963277
[6]	eval-auc:0.906432	train-auc:0.975436
[7]	eval-auc:0.913355	train-auc:0.982984
[8]	eval-auc:0.918611	train-auc:0.988754
[9]	eval-auc:0.921459	train-auc:0.991592
[10]	eval-auc:0.923668	train-auc:0.992822
[11]	eval-auc:0.925038	train-auc:0.993741
[12]	eval-auc:0.925857	train-auc:0.994206
[13]	eval-auc:0.926644	train-auc:0.994610
[14]	eval-auc:0.927549	train-auc:0.995402
[15]	eval-auc:0.928319	train-auc:0.995672
[16]	eval-auc:0.928710	train-auc:0.995926
[17]	eval-auc:0.929151	train-auc:0.996104
[18]	eval-auc:0.929450	train-auc:0.996254
[19]	eval-auc:0.929864	train-auc:0.996386
[20]	eval-auc:0.930132	train-auc:0.996631
[21]	eval-auc:0.930153	train-auc:0.996905
[22]	eval-auc:0.931099	train-auc:0.997078
[23]	eval-auc:0.931638	train-auc:0.997151
[2

KeyboardInterrupt: 

In [None]:
# Timestamp first, so the log shows when scoring started.
print(datetime.datetime.now())
# Score dtest with `bst`. The author left an ntree_limit / best_iteration
# variant of this call disabled.
preds = bst.predict(data=dtest)

In [None]:
# ROC AUC of `preds` against the 'sponsored' labels of CV.
roc_auc_score(y_true=CV['sponsored'], y_score=preds)

---

## M-version

In [None]:
# Author's earlier note:
#   max depth [8], eta 0.2, colsample_bytree 0.5, subsample: 1.0

# Tree shape and sampling settings.
param = {
    'max_depth': 18,
    'eta': 0.02,
    # 'gamma': 1.0,
    'colsample_bytree': 0.6,
    'subsample': 1.0,
    'min_child_weight': 7,
}
# Objective, metric and logging settings.
# NOTE(review): xgboost documents early_stopping_rounds as an argument of
# xgb.train(), not a booster parameter - confirm it has any effect from here.
param.update({
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'early_stopping_rounds': 20,
})
num_round = 1800

# Timestamp for the run log.
print(datetime.datetime.now())

In [None]:
# 'early_stopping_rounds' is an argument of xgb.train(), not a booster
# parameter, so leaving it inside `param` never activates early stopping.
# Lift it out of a copy (the shared `param` dict stays untouched) and pass it
# explicitly.
booster_params = dict(param)
stopping_rounds = booster_params.pop('early_stopping_rounds', None)

# xgboost early-stops on the LAST evaluation set, so move the held-out 'eval'
# entry to the end. With no 'eval' entry (only the training set is watched),
# early stopping is skipped.
evals = sorted(watchlist, key=lambda entry: entry[1] == 'eval')
if not any(name == 'eval' for _, name in evals):
    stopping_rounds = None

bst2 = xgb.train(booster_params, dtrain, num_round, evals,
                 early_stopping_rounds=stopping_rounds)

In [None]:
# Timestamp first, so the log shows when scoring started.
print(datetime.datetime.now())
# Score dtest with `bst2`. The author left an ntree_limit / best_iteration
# variant of this call disabled.
preds = bst2.predict(data=dtest)

---

## M2-version

In [None]:
# Tree shape and sampling settings; 'num_parallel_tree' is set to 10 here.
param = {
    'max_depth': 20,
    'eta': 1,
    'num_parallel_tree': 10,
    # 'gamma': 1.0,
    'colsample_bytree': 0.8,
    'subsample': 1.0,
    'min_child_weight': 10,
}
# Objective, metric and logging settings.
# NOTE(review): xgboost documents early_stopping_rounds as an argument of
# xgb.train(), not a booster parameter - confirm it has any effect from here.
param.update({
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'early_stopping_rounds': 20,
})
num_round = 20

# Timestamp for the run log.
print(datetime.datetime.now())

In [None]:
# 'early_stopping_rounds' is an argument of xgb.train(), not a booster
# parameter, so leaving it inside `param` never activates early stopping.
# Lift it out of a copy (the shared `param` dict stays untouched) and pass it
# explicitly.
booster_params = dict(param)
stopping_rounds = booster_params.pop('early_stopping_rounds', None)

# xgboost early-stops on the LAST evaluation set, so move the held-out 'eval'
# entry to the end. With no 'eval' entry (only the training set is watched),
# early stopping is skipped.
evals = sorted(watchlist, key=lambda entry: entry[1] == 'eval')
if not any(name == 'eval' for _, name in evals):
    stopping_rounds = None

bst2 = xgb.train(booster_params, dtrain, num_round, evals,
                 early_stopping_rounds=stopping_rounds)

In [None]:
# `a` marks the start of this cell; `b` below is the time elapsed since then.
a = datetime.datetime.now()

# Author's earlier notes:
#   max depth [8], eta 0.2, colsample_bytree 0.5, subsample: 1.0
#   max depth [19], eta 0.02, colsample_bytree 0.6, subsample: 1.0, min_child_weight: 7

# Tree shape and sampling settings; trailing comments keep the author's
# previously tried values.
param = {
    'max_depth': 19,
    'eta': 0.1,
    'num_parallel_tree': 10,
    # 'gamma': 1.0,
    'colsample_bytree': 0.5,  # 0.8
    'subsample': 1.0,
    'min_child_weight': 1,  # 10
}
# Objective, metric and logging settings.
# NOTE(review): xgboost documents early_stopping_rounds as an argument of
# xgb.train(), not a booster parameter - confirm it has any effect from here.
param.update({
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'early_stopping_rounds': 20,
})
num_round = 300

print(datetime.datetime.now())
# Only the parameter setup above is being timed; no training happens in this cell.
b = datetime.datetime.now() - a
print(b)

In [None]:
a = datetime.datetime.now()
%time bst2 = xgb.train(param, dtrain, num_round, watchlist)
print(datetime.datetime.now())
b = datetime.datetime.now() - a
print(b)

In [None]:
# 'early_stopping_rounds' is an argument of xgb.train(), not a booster
# parameter, so leaving it inside `param` never activates early stopping.
# Lift it out of a copy (the shared `param` dict stays untouched) and pass it
# explicitly.
booster_params = dict(param)
stopping_rounds = booster_params.pop('early_stopping_rounds', None)

# xgboost early-stops on the LAST evaluation set, so move the held-out 'eval'
# entry to the end. With no 'eval' entry (only the training set is watched),
# early stopping is skipped.
evals = sorted(watchlist, key=lambda entry: entry[1] == 'eval')
if not any(name == 'eval' for _, name in evals):
    stopping_rounds = None

bst2 = xgb.train(booster_params, dtrain, num_round, evals,
                 early_stopping_rounds=stopping_rounds)

In [None]:
# Timestamp first, so the log shows when scoring started.
print(datetime.datetime.now())
# Score dtest with `bst2`. The author left an ntree_limit / best_iteration
# variant of this call disabled.
preds = bst2.predict(data=dtest)

In [None]:
# NOTE(review): bare literal; this cell only echoes 3 - presumably a scratch cell.
3

# Submission

In [None]:
def _raw_html_name(page_id):
    """Append the '_raw_html.txt' suffix to one id."""
    return page_id + '_raw_html.txt'


# One row per df_test id, paired with the current `preds`.
file_names = df_test['id'].map(_raw_html_name).reset_index(drop=True)
submission = pd.DataFrame({'file': file_names, 'sponsored': preds})
submission.to_csv('submission_xgb_Oct_07_01.csv', index=False)
print(datetime.datetime.now())

In [None]:
#submission = df_test[['id']].map(lambda s: s + '_raw_html.txt'). reset_index(drop=True)
#submission['sponsored'] = clf.predict_proba(test.drop(['file', 'sponsored'], 1))[:, 1]
#submission.to_csv('submission_xgb_Oct_06_01.csv', index=False)

# train_train version

In [None]:
# Author's earlier note:
#   max depth [8], eta 0.2, colsample_bytree 0.5, subsample: 1.0

# Tree shape and sampling settings.
param = {
    'max_depth': 18,
    'eta': 0.02,
    # 'gamma': 1.0,
    'colsample_bytree': 0.6,
    'subsample': 1.0,
    'min_child_weight': 7,
}
# Objective, metric and logging settings.
# NOTE(review): xgboost documents early_stopping_rounds as an argument of
# xgb.train(), not a booster parameter - confirm it has any effect from here.
param.update({
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'early_stopping_rounds': 20,
})
num_round = 1800

# Timestamp for the run log.
print(datetime.datetime.now())

In [None]:
# 'early_stopping_rounds' is an argument of xgb.train(), not a booster
# parameter, so leaving it inside `param` never activates early stopping.
# Lift it out of a copy (the shared `param` dict stays untouched) and pass it
# explicitly.
booster_params = dict(param)
stopping_rounds = booster_params.pop('early_stopping_rounds', None)

# xgboost early-stops on the LAST evaluation set, so move the held-out 'eval'
# entry to the end. With no 'eval' entry (only the training set is watched),
# early stopping is skipped.
evals = sorted(watchlist, key=lambda entry: entry[1] == 'eval')
if not any(name == 'eval' for _, name in evals):
    stopping_rounds = None

bst = xgb.train(booster_params, dtrain, num_round, evals,
                early_stopping_rounds=stopping_rounds)

In [None]:
# Timestamp first, so the log shows when scoring started.
print(datetime.datetime.now())
# Score dtest with `bst`. The author left an ntree_limit / best_iteration
# variant of this call disabled.
preds = bst.predict(data=dtest)
# ROC AUC of `preds` against the 'sponsored' labels of CV.
roc_auc_score(y_true=CV['sponsored'], y_score=preds)

# T

In [None]:
# Author's earlier note:
#   max depth [8], eta 0.2, colsample_bytree 0.5, subsample: 1.0

# Tree shape and sampling settings.
param = {
    'max_depth': 16,
    'eta': 0.1,
    # 'gamma': 1.0,
    'colsample_bytree': 0.5,
    'subsample': 1.0,
    'min_child_weight': 5,
}
# Objective, metric and logging settings.
# NOTE(review): xgboost documents early_stopping_rounds as an argument of
# xgb.train(), not a booster parameter - confirm it has any effect from here.
param.update({
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'early_stopping_rounds': 20,
})
num_round = 300

# Timestamp for the run log.
print(datetime.datetime.now())

In [None]:
# NOTE(review): bare literal; this cell only echoes 3 - presumably a scratch cell.
3

In [None]:
# Write the current matrices to binary buffers.
for matrix, buffer_path in ((dtrain, 'dtrain.buffer_mindf20'),
                            (dtest, 'dtest.buffer_mindf20')):
    matrix.save_binary(buffer_path)
# Disabled reload left by the author:
# a = xgb.DMatrix('dtest.buffer')

In [None]:
# NOTE(review): neither `test` nor `clf` is defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel - presumably a
# leftover from an earlier classifier-based workflow; confirm before running.
submission = test[['file']].reset_index(drop=True)
# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected
# by recent pandas releases. Column 1 of predict_proba is used as the score.
submission['sponsored'] = clf.predict_proba(test.drop(['file', 'sponsored'], axis=1))[:, 1]

## etc

In [None]:
# Display the column index of the training frame.
df_train.columns

In [None]:
# Mean of 'sponsored' over the rows where 'basic.2' equals 0.
df_train.loc[df_train['basic.2'] == 0, 'sponsored'].mean()

In [None]:
# Frequency of each distinct value in the 'basic.1' column.
df_train['basic.1'].value_counts()

In [None]:
# Store the booster's get_fscore() result in `a`.
# NOTE(review): `a` is also used as a timestamp variable in the timing cells
# of this notebook; this assignment overwrites it.
a= bst.get_fscore()

In [None]:
# Number of entries in `a`.
len(a)

In [None]:
# NOTE(review): the path ends in '/', so it looks like a directory rather than
# a CSV file - presumably an unfinished scratch cell; confirm or remove.
pd.read_csv('~/tmp/')