In [1]:
%matplotlib inline

import datetime

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sp

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import xgboost as xgb

## A: Start

In [2]:
print("start")
df_train = pd.read_pickle('/mnt/sframe/df_train')
df_test = pd.read_pickle('/mnt/sframe/df_test')

df_train['text'].fillna('', inplace=True)
df_test['text'].fillna('', inplace=True)

start


### B.1: test

In [18]:
print("split")
train_train, train_cv = train_test_split(df_train, test_size=0.2, random_state=107)

TRAIN = train_train
CV = train_cv

split


### B.2: deploy

In [3]:
TRAIN = df_train
CV = df_test

## C: Model

In [19]:
print("BOW")
count_vect = CountVectorizer(stop_words='english', min_df=20)
TRAIN_counts = count_vect.fit_transform(TRAIN['text'])
CV_counts = count_vect.transform(CV['text'])
print("shape TRAIN_counts: %s" % (str(TRAIN_counts.shape)))

print("TF-IDF")
tfidf_transformer = TfidfTransformer()
TRAIN_tfidf = tfidf_transformer.fit_transform(TRAIN_counts)
CV_tfidf = tfidf_transformer.transform(CV_counts)
print("shape TRAIN_tfidf: %s" % (str(TRAIN_tfidf.shape)))

BOW
shape TRAIN_counts: (269619, 92867)
TF-IDF
shape TRAIN_tfidf: (269619, 92867)


In [20]:
print("Combine features")
bad_cols = [col for col in list(TRAIN.columns) 
            if col.startswith('word2vec') or col in set(['text_words', 'text', 'id', 'sponsored'])]

DF_TRAIN = TRAIN.drop(bad_cols, 1).fillna(0)
DF_CV = CV.drop(bad_cols, 1).fillna(0)

DF_TRAIN = DF_TRAIN.astype('float64')
DF_CV = DF_CV.astype('float64')

DF_DF_TRAIN = sp.hstack( [TRAIN_tfidf,  sp.csr_matrix(np.array(DF_TRAIN)) ] )
DF_DF_CV = sp.hstack( [CV_tfidf,  sp.csr_matrix(np.array(DF_CV)) ] )

print("shape DF_TRAIN: %s" % (str(DF_TRAIN.shape)))

Combine features
shape DF_TRAIN: (269619, 50)


In [21]:
print("shape DF_DF_TRAIN: %s" % (str(DF_DF_TRAIN.shape)))

shape DF_DF_TRAIN: (269619, 92917)


## D: Combine

### D.1: test

In [22]:
dtrain = xgb.DMatrix(DF_DF_TRAIN, label = TRAIN['sponsored'])
dtest = xgb.DMatrix(DF_DF_CV, label = CV['sponsored'])
watchlist  = [(dtest,'eval'), (dtrain,'train')]

In [23]:
dtrain.save_binary('dtrain_train.buffer_mindf20')
dtest.save_binary('dtrain_cv.buffer_mindf20')

### D.2: deploy

In [8]:
dtrain = xgb.DMatrix(DF_DF_TRAIN, label = TRAIN['sponsored'])
dtest = xgb.DMatrix(DF_DF_CV)
watchlist  = [(dtrain,'train')]

In [9]:
# Save for later:
dtrain.save_binary('orig_dtrain.buffer_mindf20')
dtest.save_binary('orig_dtest.buffer_mindf20')

## E: Params

In [10]:
#max depth [8], eta 0.2, colsample_bytree 0.5, subsample: 1.0
param = {'max_depth': 17, 
         'eta': 0.05, 
         #'gamma': 1.0,
         'colsample_bytree': 0.6,
         'subsample': 1.0,
         'min_child_weight': 7,
         'silent':1, 
         'objective':'binary:logistic', 
         'eval_metric':'auc',
         'early_stopping_rounds':20}
num_round = 800

print(datetime.datetime.now())

2015-10-06 16:41:57.080669


In [11]:
bst = xgb.train(param, dtrain, num_round, watchlist)

[0]	train-auc:0.750854
[1]	train-auc:0.803894
[2]	train-auc:0.809665
[3]	train-auc:0.818385
[4]	train-auc:0.827795
[5]	train-auc:0.830302
[6]	train-auc:0.834035
[7]	train-auc:0.839013
[8]	train-auc:0.844391
[9]	train-auc:0.848326
[10]	train-auc:0.851132
[11]	train-auc:0.852845
[12]	train-auc:0.854122
[13]	train-auc:0.857297
[14]	train-auc:0.860236
[15]	train-auc:0.866022
[16]	train-auc:0.871069
[17]	train-auc:0.874039
[18]	train-auc:0.878415
[19]	train-auc:0.885498
[20]	train-auc:0.891266
[21]	train-auc:0.893285
[22]	train-auc:0.898215
[23]	train-auc:0.900397
[24]	train-auc:0.906033
[25]	train-auc:0.911527
[26]	train-auc:0.914070
[27]	train-auc:0.915713
[28]	train-auc:0.918840
[29]	train-auc:0.920841
[30]	train-auc:0.924216
[31]	train-auc:0.927562
[32]	train-auc:0.929592
[33]	train-auc:0.931161
[34]	train-auc:0.932303
[35]	train-auc:0.934801
[36]	train-auc:0.936846
[37]	train-auc:0.937568
[38]	train-auc:0.939899
[39]	train-auc:0.941443
[40]	train-auc:0.943318
[41]	train-auc:0.944972
[4

In [12]:
print(datetime.datetime.now())
preds = bst.predict(dtest) #, ntree_limit=bst.boost .best_iteration)

2015-10-06 17:22:42.101027


In [None]:
roc_auc_score(CV['sponsored'], preds)

# M-version

In [14]:
#max depth [8], eta 0.2, colsample_bytree 0.5, subsample: 1.0
param = {'max_depth': 18, 
         'eta': 0.02, 
         #'gamma': 1.0,
         'colsample_bytree': 0.6,
         'subsample': 1.0,
         'min_child_weight': 7,
         'silent':1, 
         'objective':'binary:logistic', 
         'eval_metric':'auc',
         'early_stopping_rounds':20}
num_round = 1800

print(datetime.datetime.now())

2015-10-06 17:27:47.087603


In [15]:
bst2 = xgb.train(param, dtrain, num_round, watchlist)

[0]	train-auc:0.753641
[1]	train-auc:0.806111
[2]	train-auc:0.813202
[3]	train-auc:0.818390
[4]	train-auc:0.820711
[5]	train-auc:0.823589
[6]	train-auc:0.826262
[7]	train-auc:0.832258
[8]	train-auc:0.840550
[9]	train-auc:0.841218
[10]	train-auc:0.843315
[11]	train-auc:0.850759
[12]	train-auc:0.853212
[13]	train-auc:0.854130
[14]	train-auc:0.853376
[15]	train-auc:0.854256
[16]	train-auc:0.856045
[17]	train-auc:0.858091
[18]	train-auc:0.858172
[19]	train-auc:0.859221
[20]	train-auc:0.860937
[21]	train-auc:0.861455
[22]	train-auc:0.862858
[23]	train-auc:0.863915
[24]	train-auc:0.865515
[25]	train-auc:0.866393
[26]	train-auc:0.868082
[27]	train-auc:0.869158
[28]	train-auc:0.870912
[29]	train-auc:0.872232
[30]	train-auc:0.873330
[31]	train-auc:0.874035
[32]	train-auc:0.875418
[33]	train-auc:0.876770
[34]	train-auc:0.878076
[35]	train-auc:0.879097
[36]	train-auc:0.880590
[37]	train-auc:0.882467
[38]	train-auc:0.882881
[39]	train-auc:0.883262
[40]	train-auc:0.885893
[41]	train-auc:0.887358
[4

In [16]:
print(datetime.datetime.now())
preds = bst2.predict(dtest) #, ntree_limit=bst.boost .best_iteration)

2015-10-06 19:04:59.720819


In [None]:
3

# train_train version

In [24]:
#max depth [8], eta 0.2, colsample_bytree 0.5, subsample: 1.0
param = {'max_depth': 18, 
         'eta': 0.02, 
         #'gamma': 1.0,
         'colsample_bytree': 0.6,
         'subsample': 1.0,
         'min_child_weight': 7,
         'silent':1, 
         'objective':'binary:logistic', 
         'eval_metric':'auc',
         'early_stopping_rounds':20}
num_round = 1800

print(datetime.datetime.now())

2015-10-06 20:43:10.802569


In [25]:
bst = xgb.train(param, dtrain, num_round, watchlist)

[0]	eval-auc:0.744826	train-auc:0.761493
[1]	eval-auc:0.754657	train-auc:0.770514
[2]	eval-auc:0.779679	train-auc:0.796101
[3]	eval-auc:0.798093	train-auc:0.813978
[4]	eval-auc:0.809853	train-auc:0.826300
[5]	eval-auc:0.809788	train-auc:0.827464
[6]	eval-auc:0.815373	train-auc:0.832781
[7]	eval-auc:0.816828	train-auc:0.835233
[8]	eval-auc:0.819575	train-auc:0.837900
[9]	eval-auc:0.825654	train-auc:0.844266
[10]	eval-auc:0.827418	train-auc:0.845345
[11]	eval-auc:0.829289	train-auc:0.847512
[12]	eval-auc:0.831424	train-auc:0.850430
[13]	eval-auc:0.831493	train-auc:0.850895
[14]	eval-auc:0.832367	train-auc:0.851641
[15]	eval-auc:0.835679	train-auc:0.855417
[16]	eval-auc:0.837455	train-auc:0.857265
[17]	eval-auc:0.837568	train-auc:0.857965
[18]	eval-auc:0.839103	train-auc:0.859415
[19]	eval-auc:0.840155	train-auc:0.860493
[20]	eval-auc:0.842316	train-auc:0.862416
[21]	eval-auc:0.842534	train-auc:0.862929
[22]	eval-auc:0.843831	train-auc:0.864672
[23]	eval-auc:0.843699	train-auc:0.864810
[2

In [26]:
print(datetime.datetime.now())
preds = bst.predict(dtest) #, ntree_limit=bst.boost .best_iteration)
roc_auc_score(CV['sponsored'], preds)

2015-10-06 22:02:10.868004


0.95370667184361868

# Submission

In [17]:
submission = pd.DataFrame({
        'file': df_test['id'].map(lambda s: s + '_raw_html.txt').reset_index(drop=True),
        'sponsored': preds
    })
submission.to_csv('submission_xgb_Oct_06_02.csv', index=False)
print(datetime.datetime.now())

2015-10-06 19:05:01.516262


In [None]:
#submission = df_test[['id']].map(lambda s: s + '_raw_html.txt'). reset_index(drop=True)
#submission['sponsored'] = clf.predict_proba(test.drop(['file', 'sponsored'], 1))[:, 1]
#submission.to_csv('submission_xgb_Oct_06_01.csv', index=False)

# T

In [None]:
#max depth [8], eta 0.2, colsample_bytree 0.5, subsample: 1.0
param = {'max_depth': 16, 
         'eta': 0.1, 
         #'gamma': 1.0,
         'colsample_bytree': 0.5,
         'subsample': 1.0,
         'min_child_weight': 5,
         'silent':1, 
         'objective':'binary:logistic', 
         'eval_metric':'auc',
         'early_stopping_rounds':20}
num_round = 300

print(datetime.datetime.now())

In [27]:
3

3

In [None]:
dtrain.save_binary('dtrain.buffer_mindf20')
dtest.save_binary('dtest.buffer_mindf20')
#a = xgb.DMatrix('dtest.buffer')

In [None]:
submission = test[['file']].reset_index(drop=True)
submission['sponsored'] = clf.predict_proba(test.drop(['file', 'sponsored'], 1))[:, 1]

## etc

In [33]:
df_train.columns

Index([u'sponsored', u'id', u'text', u'num_words', u'a_href', u'par', u'title',
       u'img', u'btn', u'input', u'li', u'link', u'meta', u'script_avg',
       u'script_b_avg', u'script_cnt', u'script_b_cnt', u'style_avg',
       u'style_cnt', u'fb_id', u'ga_cnt', u'ga_id', u'ga_len', u'ga_line',
       u'ga_subid', u'word_set_size', u'word_set_size_ratio',
       u'word_set_size2', u'word_set_size2_ratio', u'shinn.braces',
       u'shinn.brackets', u'shinn.cheap', u'shinn.col', u'shinn.commas',
       u'shinn.get', u'shinn.length', u'shinn.lines', u'shinn.parens',
       u'shinn.periods', u'shinn.sale', u'shinn.semi', u'shinn.shop',
       u'shinn.spaces', u'shinn.tabs', u'shinn.uppers', u'shinn.words',
       u'shinn.you', u'basic.0', u'basic.1', u'basic.2', u'basic.3',
       u'basic.4', u'basic.5'],
      dtype='object')

In [40]:
np.mean(df_train[df_train['basic.2'] == 0]['sponsored'])

0.080041350390369359

In [35]:
df_train['basic.1'].value_counts()

0    259265
1     77759
dtype: int64

In [43]:
a= bst.get_fscore()

In [47]:
len(a)

5863