In [166]:
import pandas as pd

In [167]:
# gensim to pre process text
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [ 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [168]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

class Doc2VecTransformer(BaseEstimator):

    def __init__(self, vector_size=300, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(clean_text(row).split(), [index]) for index, row in enumerate(df_x)]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(clean_text(row).split())
                                     for index, row in enumerate(df_x)]))

In [169]:
from sklearn.metrics import f1_score
import numpy as np

def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    f1_score = f1_score(y_true, np.round(y_pred))
    return 'f1_score', f1_score

In [170]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [171]:
df.drop(['keyword', 'location'], axis=1, inplace=True)
df_test.drop(['keyword', 'location'], axis=1, inplace=True)

In [172]:
df_x = df['text']
df_y = df['target']

df_test_x = df_test['text']

In [173]:
doc2vec_trf = Doc2VecTransformer()

doc2vec_features = doc2vec_trf.fit(df_x).transform(df_x)

test_doc2vec_features = doc2vec_trf.fit(df_test_x).transform(df_test_x)

100%|██████████| 7613/7613 [00:00<00:00, 4091649.97it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4286647.38it/s]
100%|██████████| 7613/7613 [00:00<00:00, 3922756.31it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4480319.40it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4627715.41it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4690252.11it/s]
100%|██████████| 7613/7613 [00:00<00:00, 3973029.28it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4258065.92it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4326139.60it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4439209.84it/s]
100%|██████████| 7613/7613 [00:00<00:00, 3862026.65it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4197060.51it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4347343.27it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4119627.96it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4477178.40it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4527326.86it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4541492.87it/s]
100%|██████████| 7613/7613 [00:

# HP Tuning with Grid-Search and K-fold CV

In [174]:
params = {
    'max_depth':10,
    'min_child_weight': 3,
    'eta':.05,
    'subsample': 1,
    'colsample_bytree': 0.3,
    'objective':'binary:logistic',
}

In [175]:
import xgboost as xgb
train_dmatrix = xgb.DMatrix(data=tfidf_vectors,label=df_y)

## Tune Complexity of decision trees

In [177]:
#tune max_depth and min_child_weight (complexity of decision trees)
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(3,7)
    for min_child_weight in range(1,4)
]

# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    # Run CV
    cv_results = xgb.cv(
        params,
        train_dmatrix,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    train_mean_mae = cv_results['train-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds, train-mae-mean: {}".format(mean_mae, boost_rounds, train_mean_mae))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

CV with max_depth=3, min_child_weight=1
	MAE 0.371357 for 499 rounds, train-mae-mean: 0.3482546
CV with max_depth=3, min_child_weight=2
	MAE 0.37191300000000005 for 499 rounds, train-mae-mean: 0.34972060000000005
CV with max_depth=3, min_child_weight=3
	MAE 0.3716826 for 499 rounds, train-mae-mean: 0.35012999999999994
CV with max_depth=4, min_child_weight=1
	MAE 0.36179439999999996 for 499 rounds, train-mae-mean: 0.3300994
CV with max_depth=4, min_child_weight=2
	MAE 0.3622718 for 499 rounds, train-mae-mean: 0.332359
CV with max_depth=4, min_child_weight=3
	MAE 0.362144 for 499 rounds, train-mae-mean: 0.333619
CV with max_depth=5, min_child_weight=1
	MAE 0.3548028 for 499 rounds, train-mae-mean: 0.3147006
CV with max_depth=5, min_child_weight=2
	MAE 0.3553468 for 499 rounds, train-mae-mean: 0.31791800000000003
CV with max_depth=5, min_child_weight=3
	MAE 0.3555212 for 499 rounds, train-mae-mean: 0.3199236
CV with max_depth=6, min_child_weight=1
	MAE 0.3485202 for 499 rounds, train-mae-

In [178]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(6,9)
    for min_child_weight in range(1,2)
]

min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    # Run CV
    cv_results = xgb.cv(
        params,
        train_dmatrix,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    train_mean_mae = cv_results['train-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds, train-mae-mean: {}".format(mean_mae, boost_rounds, train_mean_mae))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

CV with max_depth=6, min_child_weight=1
	MAE 0.3485202 for 499 rounds, train-mae-mean: 0.3010372
CV with max_depth=7, min_child_weight=1
	MAE 0.34390620000000005 for 499 rounds, train-mae-mean: 0.2886808
CV with max_depth=8, min_child_weight=1
	MAE 0.34000899999999995 for 499 rounds, train-mae-mean: 0.2777596


In [179]:
params['max_depth'] = 6
params['min_child_weight'] = 1

## Tune Sampling HP

In [180]:
#tune sampling h-p
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        train_dmatrix,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    train_mean_mae = cv_results['train-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds, train-mae-mean: {}".format(mean_mae, boost_rounds, train_mean_mae))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

CV with subsample=1.0, colsample=1.0
	MAE 0.3463596 for 499 rounds, train-mae-mean: 0.2942164
CV with subsample=1.0, colsample=0.9
	MAE 0.34655620000000004 for 499 rounds, train-mae-mean: 0.2947392
CV with subsample=1.0, colsample=0.8
	MAE 0.3469364 for 499 rounds, train-mae-mean: 0.29530019999999996
CV with subsample=1.0, colsample=0.7
	MAE 0.34708560000000005 for 499 rounds, train-mae-mean: 0.296269
CV with subsample=0.9, colsample=1.0
	MAE 0.34313580000000005 for 499 rounds, train-mae-mean: 0.2871434
CV with subsample=0.9, colsample=0.9
	MAE 0.3432288 for 499 rounds, train-mae-mean: 0.28838220000000003
CV with subsample=0.9, colsample=0.8
	MAE 0.34339339999999996 for 499 rounds, train-mae-mean: 0.2893772
CV with subsample=0.9, colsample=0.7
	MAE 0.3438074 for 499 rounds, train-mae-mean: 0.2900218
CV with subsample=0.8, colsample=1.0
	MAE 0.3419082 for 499 rounds, train-mae-mean: 0.286242
CV with subsample=0.8, colsample=0.9
	MAE 0.34229980000000004 for 499 rounds, train-mae-mean: 0.

In [181]:
params['subsample'] = 0.8
params['colsample_bytree'] = 1.0

## Learning Rate (ETA) tuning

In [182]:
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run CV
    cv_results = xgb.cv(
        params,
        train_dmatrix,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    train_mean_mae = cv_results['train-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds, train-mae-mean: {}".format(mean_mae, boost_rounds, train_mean_mae))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

CV with eta=0.3
	MAE 0.3062984 for 406 rounds, train-mae-mean: 0.15545340000000002
CV with eta=0.2
	MAE 0.30976440000000005 for 499 rounds, train-mae-mean: 0.17517699999999997
CV with eta=0.1
	MAE 0.3230392 for 499 rounds, train-mae-mean: 0.2361078
CV with eta=0.05
	MAE 0.3419082 for 499 rounds, train-mae-mean: 0.286242
CV with eta=0.01
	MAE 0.3910786 for 499 rounds, train-mae-mean: 0.3687856
CV with eta=0.005
	MAE 0.4125166 for 499 rounds, train-mae-mean: 0.3969176


In [183]:
params['eta'] = 0.05

# Train and Predict

In [184]:
clf = xgb.XGBClassifier(max_depth=6, learning_rate=0.05, objective='binary:logistic', \
                      min_child_weight=1, subsample=0.8, colsample_bytree=1.0, n_estimators=200)

clf.fit(doc2vec_features, df_y, eval_metric=f1_eval)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [185]:
preds = clf.predict(test_doc2vec_features)
preds

array([1, 1, 0, ..., 1, 1, 0])

In [186]:
final_preds = pd.Series(preds)
test_ids = df_test['id']
df_preds = pd.concat([test_ids,final_preds],axis=1)
df_preds.columns = ['id', 'target']

In [187]:
df_preds.to_csv('XGBoost_submission.csv')
df_preds.tail()

Unnamed: 0,id,target
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1
3262,10875,0
