In [107]:
import pandas as pd

In [108]:
df = pd.read_csv('../train.csv')
df_test = pd.read_csv('../test.csv')

In [109]:
df.drop(['keyword', 'location'], axis=1, inplace=True)
df_test.drop(['keyword', 'location'], axis=1, inplace=True)

In [110]:
df_x = df['text']
df_y = df['target']

df_test_x = df_test['text']

## Vamos a probar distintos word-embeddings

### 1. Doc2Vec

In [111]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

class Doc2VecTransformer(BaseEstimator):

    def __init__(self, vector_size=300, learning_rate=0.1, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(clean_text(row).split(), [index]) for index, row in enumerate(df_x)]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(clean_text(row).split())
                                     for index, row in enumerate(df_x)]))

In [112]:
doc2vec_trf = Doc2VecTransformer()

doc2vec_features = doc2vec_trf.fit(df_x).transform(df_x)

test_doc2vec_features = doc2vec_trf.fit(df_test_x).transform(df_test_x)

100%|██████████| 7613/7613 [00:00<00:00, 4287222.93it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4281474.44it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4542785.08it/s]
100%|██████████| 7613/7613 [00:00<00:00, 3356942.43it/s]
100%|██████████| 7613/7613 [00:00<00:00, 3925649.91it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4460292.83it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4507515.01it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4266030.24it/s]
100%|██████████| 7613/7613 [00:00<00:00, 3826849.99it/s]
100%|██████████| 7613/7613 [00:00<00:00, 3146864.72it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4296452.68it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4464034.16it/s]
100%|██████████| 7613/7613 [00:00<00:00, 2935665.75it/s]
100%|██████████| 7613/7613 [00:00<00:00, 888484.27it/s]
100%|██████████| 7613/7613 [00:00<00:00, 1828926.99it/s]
100%|██████████| 7613/7613 [00:00<00:00, 2283575.51it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4567477.66it/s]
100%|██████████| 7613/7613 [00:0

In [113]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(doc2vec_features, df_y, test_size=.1, random_state=42)

import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [114]:
from sklearn.metrics import f1_score
import numpy as np

def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    f1_score_value = f1_score(y_true, np.round(y_pred))
    return 'f1_score', f1_score_value

In [123]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.1,
    'subsample': 0.8,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'binary:logistic',
}

In [116]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10,
    feval=f1_eval,
    maximize=True
)

print("Best F1 score: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-error:0.46063	Test-f1_score:0.13333
Multiple eval metrics have been passed: 'Test-f1_score' will be used for early stopping.

Will train until Test-f1_score hasn't improved in 10 rounds.
[1]	Test-error:0.47900	Test-f1_score:0.26263
[2]	Test-error:0.46457	Test-f1_score:0.25000
[3]	Test-error:0.46457	Test-f1_score:0.25000
[4]	Test-error:0.48688	Test-f1_score:0.21895
[5]	Test-error:0.48819	Test-f1_score:0.21849
[6]	Test-error:0.49475	Test-f1_score:0.21294
[7]	Test-error:0.48556	Test-f1_score:0.21277
[8]	Test-error:0.46719	Test-f1_score:0.24576
[9]	Test-error:0.45800	Test-f1_score:0.24946
[10]	Test-error:0.47244	Test-f1_score:0.23404
[11]	Test-error:0.46325	Test-f1_score:0.23427
Stopping. Best iteration:
[1]	Test-error:0.47900	Test-f1_score:0.26263

Best F1 score: 0.26 with 2 rounds


### 1.1 Doc2Vec with text cleaning

In [117]:
# gensim to pre process text
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [ 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [118]:
df_x_cleaned = df_x.apply(lambda x: clean_text(x))
df_test_x_cleaned = df_test_x.apply(lambda x: clean_text(x))

In [119]:
doc2vec_trf = Doc2VecTransformer()

doc2vec_features = doc2vec_trf.fit(df_x_cleaned).transform(df_x_cleaned)

test_doc2vec_features = doc2vec_trf.fit(df_test_x_cleaned).transform(df_test_x_cleaned)

100%|██████████| 7613/7613 [00:00<00:00, 973542.98it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4450346.53it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4459669.88it/s]
100%|██████████| 7613/7613 [00:00<00:00, 3248015.09it/s]
100%|██████████| 7613/7613 [00:00<00:00, 907730.52it/s]
100%|██████████| 7613/7613 [00:00<00:00, 902981.63it/s]
100%|██████████| 7613/7613 [00:00<00:00, 929260.12it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4706151.27it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4637797.58it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4295874.66it/s]
100%|██████████| 7613/7613 [00:00<00:00, 927775.12it/s]
100%|██████████| 7613/7613 [00:00<00:00, 949345.51it/s]
100%|██████████| 7613/7613 [00:00<00:00, 926348.60it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4275741.34it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4267170.43it/s]
100%|██████████| 7613/7613 [00:00<00:00, 2145195.59it/s]
100%|██████████| 7613/7613 [00:00<00:00, 1395962.07it/s]
100%|██████████| 7613/7613 [00:00<00:0

In [120]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(doc2vec_features, df_y, test_size=.1, random_state=42)

import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [121]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10,
    feval=f1_eval,
    maximize=True
)

print("Best F1 score: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-error:0.48556	Test-f1_score:0.35986
Multiple eval metrics have been passed: 'Test-f1_score' will be used for early stopping.

Will train until Test-f1_score hasn't improved in 10 rounds.
[1]	Test-error:0.46325	Test-f1_score:0.35701
[2]	Test-error:0.44619	Test-f1_score:0.35361
[3]	Test-error:0.43176	Test-f1_score:0.37807
[4]	Test-error:0.45538	Test-f1_score:0.32621
[5]	Test-error:0.46588	Test-f1_score:0.28571
[6]	Test-error:0.47244	Test-f1_score:0.26829
[7]	Test-error:0.46063	Test-f1_score:0.27629
[8]	Test-error:0.47244	Test-f1_score:0.23077
[9]	Test-error:0.47769	Test-f1_score:0.23529
[10]	Test-error:0.47244	Test-f1_score:0.24686
[11]	Test-error:0.47769	Test-f1_score:0.25103
[12]	Test-error:0.47113	Test-f1_score:0.26283
[13]	Test-error:0.46850	Test-f1_score:0.26694
Stopping. Best iteration:
[3]	Test-error:0.43176	Test-f1_score:0.37807

Best F1 score: 0.38 with 4 rounds


# HP Tuning with Grid-Search and K-fold CV

## Tune Complexity of decision trees

In [124]:
#tune max_depth and min_child_weight (complexity of decision trees)
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(3,7)
    for min_child_weight in range(1,4)
]

# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    train_mean_mae = cv_results['train-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds, train-mae-mean: {}".format(mean_mae, boost_rounds, train_mean_mae))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

CV with max_depth=3, min_child_weight=1
	MAE 0.48887080000000005 for 41 rounds, train-mae-mean: 0.4551996
CV with max_depth=3, min_child_weight=2
	MAE 0.486352 for 125 rounds, train-mae-mean: 0.4010116
CV with max_depth=3, min_child_weight=3
	MAE 0.4877488 for 78 rounds, train-mae-mean: 0.42977419999999994
CV with max_depth=4, min_child_weight=1
	MAE 0.4861794 for 96 rounds, train-mae-mean: 0.3579384
CV with max_depth=4, min_child_weight=2
	MAE 0.4865176 for 97 rounds, train-mae-mean: 0.35888480000000006
CV with max_depth=4, min_child_weight=3
	MAE 0.4887662 for 35 rounds, train-mae-mean: 0.43413379999999996
CV with max_depth=5, min_child_weight=1
	MAE 0.483174 for 121 rounds, train-mae-mean: 0.25035180000000007
CV with max_depth=5, min_child_weight=2
	MAE 0.48436979999999996 for 96 rounds, train-mae-mean: 0.28997
CV with max_depth=5, min_child_weight=3
	MAE 0.4852314 for 90 rounds, train-mae-mean: 0.30169159999999995
CV with max_depth=6, min_child_weight=1
	MAE 0.4776454 for 202 round

In [125]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(6,9)
    for min_child_weight in range(1,2)
]

min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    train_mean_mae = cv_results['train-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds, train-mae-mean: {}".format(mean_mae, boost_rounds, train_mean_mae))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

CV with max_depth=6, min_child_weight=1
	MAE 0.4776454 for 202 rounds, train-mae-mean: 0.0984412
CV with max_depth=7, min_child_weight=1
	MAE 0.4729013999999999 for 228 rounds, train-mae-mean: 0.048008999999999996
CV with max_depth=8, min_child_weight=1
	MAE 0.47547700000000004 for 125 rounds, train-mae-mean: 0.0722512


In [126]:
params['max_depth'] = 7
params['min_child_weight'] = 1

## Tune Sampling HP

In [127]:
#tune sampling h-p
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    train_mean_mae = cv_results['train-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds, train-mae-mean: {}".format(mean_mae, boost_rounds, train_mean_mae))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

CV with subsample=1.0, colsample=1.0
	MAE 0.4828682000000001 for 53 rounds, train-mae-mean: 0.22922159999999997
CV with subsample=1.0, colsample=0.9
	MAE 0.47716519999999996 for 140 rounds, train-mae-mean: 0.08717280000000001
CV with subsample=1.0, colsample=0.8
	MAE 0.4761062000000001 for 179 rounds, train-mae-mean: 0.06419640000000001
CV with subsample=1.0, colsample=0.7
	MAE 0.4726952 for 274 rounds, train-mae-mean: 0.0340418
CV with subsample=0.9, colsample=1.0
	MAE 0.47277199999999997 for 240 rounds, train-mae-mean: 0.041466
CV with subsample=0.9, colsample=0.9
	MAE 0.47295040000000005 for 248 rounds, train-mae-mean: 0.039327999999999995
CV with subsample=0.9, colsample=0.8
	MAE 0.47799040000000004 for 118 rounds, train-mae-mean: 0.11555439999999999
CV with subsample=0.9, colsample=0.7
	MAE 0.4824878 for 66 rounds, train-mae-mean: 0.20530279999999998
CV with subsample=0.8, colsample=1.0
	MAE 0.4729013999999999 for 228 rounds, train-mae-mean: 0.048008999999999996
CV with subsample=

In [128]:
params['subsample'] = 1.0
params['colsample_bytree'] = 0.7

## Learning Rate (ETA) tuning

In [130]:
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    train_mean_mae = cv_results['train-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds, train-mae-mean: {}".format(mean_mae, boost_rounds, train_mean_mae))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

CV with eta=0.3
	MAE 0.46930459999999996 for 104 rounds, train-mae-mean: 0.0250508
CV with eta=0.2
	MAE 0.476023 for 132 rounds, train-mae-mean: 0.0338902
CV with eta=0.1
	MAE 0.4726952 for 274 rounds, train-mae-mean: 0.0340418
CV with eta=0.05
	MAE 0.47683559999999997 for 281 rounds, train-mae-mean: 0.0968018
CV with eta=0.01
	MAE 0.4839716 for 481 rounds, train-mae-mean: 0.25610180000000005
CV with eta=0.005
	MAE 0.48716780000000004 for 499 rounds, train-mae-mean: 0.3469322


In [131]:
params['eta'] = 0.1

# Train and Predict

In [133]:
clf = xgb.XGBClassifier(max_depth=7, learning_rate=0.1, objective='binary:logistic', \
                      min_child_weight=1, subsample=1.0, colsample_bytree=0.7, n_estimators=200)

clf.fit(X_train, y_train, eval_metric=f1_eval)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1.0,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [134]:
preds = clf.predict(test_doc2vec_features)
preds

array([1, 0, 0, ..., 1, 1, 0])

In [135]:
final_preds = pd.Series(preds)
test_ids = df_test['id']
df_preds = pd.concat([test_ids,final_preds],axis=1)
df_preds.columns = ['id', 'target']

In [136]:
df_preds.to_csv('XGBoost_submission.csv')
df_preds.tail()

Unnamed: 0,id,target
3258,10861,0
3259,10865,0
3260,10868,1
3261,10874,1
3262,10875,0
