In [180]:
import pandas as pd

In [234]:
df = pd.read_csv('../train.csv')
df_test = pd.read_csv('../test.csv')

In [182]:
df.drop(['keyword', 'location'], axis=1, inplace=True)
df_test.drop(['keyword', 'location'], axis=1, inplace=True)

In [183]:
df_x = df['text']
df_y = df['target']

df_test_x = df_test['text']

## Vamos a probar distintos word-embeddings

### 1. Doc2Vec

In [184]:
# gensim to pre process text
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [ 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [185]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

class Doc2VecTransformer(BaseEstimator):

    def __init__(self, vector_size=300, learning_rate=0.1, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(clean_text(row).split(), [index]) for index, row in enumerate(df_x)]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(clean_text(row).split())
                                     for index, row in enumerate(df_x)]))

In [186]:
doc2vec_trf = Doc2VecTransformer()

doc2vec_features = doc2vec_trf.fit(df_x).transform(df_x)

test_doc2vec_features = doc2vec_trf.fit(df_test_x).transform(df_test_x)

100%|██████████| 7613/7613 [00:00<00:00, 4403093.82it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4028161.52it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4327312.15it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4575331.19it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4317949.47it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4254661.74it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4332596.52it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4278032.74it/s]
100%|██████████| 7613/7613 [00:00<00:00, 908686.29it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4315615.13it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4151766.53it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4625034.23it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4446008.96it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4037329.16it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4649277.28it/s]
100%|██████████| 7613/7613 [00:00<00:00, 997196.73it/s]
100%|██████████| 7613/7613 [00:00<00:00, 4445390.00it/s]
100%|██████████| 7613/7613 [00:00

In [98]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(doc2vec_features, df_y, test_size=.2, random_state=42)

import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [126]:
from sklearn.metrics import f1_score
import numpy as np

def f1_eval_min(y_pred, dtrain):
    y_true = dtrain.get_label()
    y_bin = [1 if y_cont > 0.5 else 0 for y_cont in y_pred]
    f1_score_min = 1 - f1_score(y_true, y_bin)
    return 'f1_score_min', f1_score_min

In [101]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.1,
    'subsample': 0.8,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'binary:logistic',
}

In [102]:
param_dist = {'objective':'binary:logistic', 'n_estimators':50}

In [103]:
from xgboost import XGBClassifier

clf = XGBClassifier(**param_dist)

clf.fit(X_train, y_train, eval_set=[(X_test, y_test)] , eval_metric=f1_eval_min)

[0]	validation_0-error:0.44780	validation_0-f1_score_min:0.77149
[1]	validation_0-error:0.45699	validation_0-f1_score_min:0.68369
[2]	validation_0-error:0.47209	validation_0-f1_score_min:0.70698
[3]	validation_0-error:0.46553	validation_0-f1_score_min:0.69036
[4]	validation_0-error:0.47209	validation_0-f1_score_min:0.71400
[5]	validation_0-error:0.47275	validation_0-f1_score_min:0.70866
[6]	validation_0-error:0.47275	validation_0-f1_score_min:0.70175
[7]	validation_0-error:0.46290	validation_0-f1_score_min:0.66825
[8]	validation_0-error:0.46290	validation_0-f1_score_min:0.66197
[9]	validation_0-error:0.47406	validation_0-f1_score_min:0.67100
[10]	validation_0-error:0.47078	validation_0-f1_score_min:0.66083
[11]	validation_0-error:0.47341	validation_0-f1_score_min:0.66574
[12]	validation_0-error:0.47800	validation_0-f1_score_min:0.67159
[13]	validation_0-error:0.48194	validation_0-f1_score_min:0.67588
[14]	validation_0-error:0.47406	validation_0-f1_score_min:0.65876
[15]	validation_0-er

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## 2. Final_train

In [211]:
df_final_train = pd.read_csv('final_train.csv')

In [212]:
df_final_train.drop(['keyword', 'location', 'text', 'country', 'city', 'words', 'real_words', 'clean_text', 'punctuation_signs', 'hashtags', 'mentions', 'urls'], axis=1, inplace=True)

In [213]:
df_final_train.drop(['lemma_text', 'porter_stemmed_text', 'snowball_stemmed_text'], axis=1, inplace=True)

In [214]:
def getTrainSet():
    return df_final_train.loc[df_final_train['source'] == 'train']

def getTestSet():
    return df_final_train.loc[df_final_train['source'] == 'test']

In [199]:
type(doc2vec_features)

numpy.matrix

In [208]:
dfaaa = pd.DataFrame(doc2vec_features).reset_index()
dfaaa

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,0,-0.000914,0.001169,-0.000346,0.000152,-0.000759,0.001435,0.000778,-0.000480,-0.001063,...,0.000956,0.000027,0.000489,0.000744,0.000227,0.001487,-0.000271,0.001281,-3.336935e-04,0.000022
1,1,-0.000749,0.000456,0.000680,0.000848,0.000872,-0.000065,-0.001191,0.001173,0.000031,...,0.000979,0.000518,-0.000211,0.000873,-0.001455,-0.000227,-0.000516,0.000884,1.073842e-03,0.000725
2,2,-0.001660,-0.000524,-0.001411,-0.000174,0.001364,-0.000190,0.000795,-0.001603,-0.000932,...,0.000940,0.001585,-0.000529,0.001640,-0.000402,0.000018,-0.000404,-0.000690,-1.952253e-04,-0.000098
3,3,-0.000087,-0.000108,-0.000470,0.001614,-0.000261,0.001095,-0.001555,0.000959,-0.000088,...,0.000961,-0.000535,0.001282,-0.000450,-0.000194,-0.000139,0.001590,-0.001265,-3.894979e-07,-0.001015
4,4,-0.001274,-0.001448,-0.000813,-0.001231,-0.001224,0.000472,-0.001259,-0.000780,-0.000790,...,0.000322,0.000684,0.000712,0.001153,0.001323,0.001048,-0.001626,-0.000601,4.613861e-04,0.000640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,7608,0.000991,0.000747,-0.001545,-0.001012,0.000559,0.001376,-0.000005,0.000895,-0.000939,...,0.000577,0.000265,-0.001232,0.001085,0.000474,0.000028,0.001635,-0.001577,2.386603e-04,0.000092
7609,7609,0.000806,-0.001009,-0.000908,-0.000332,-0.001037,0.000054,0.000621,-0.000835,-0.000904,...,-0.000674,0.001515,0.001086,-0.001208,-0.000289,-0.001187,-0.001169,0.000668,-1.533359e-03,0.000232
7610,7610,0.001191,-0.001628,0.000886,-0.001358,0.000524,-0.000918,-0.000807,0.001065,0.000148,...,0.001201,-0.001399,0.001196,-0.001363,-0.000858,-0.000529,-0.000612,0.000412,9.229602e-04,0.000786
7611,7611,0.000815,-0.000693,-0.000773,-0.000458,0.001138,-0.000685,-0.000176,-0.000235,-0.000326,...,-0.001317,-0.001101,0.000050,-0.001117,-0.001338,0.000814,0.000417,-0.000947,6.898775e-04,-0.001651


In [217]:
result = pd.concat([df, dfaaa], axis=1, sort=False)
result.drop(['keyword', 'location', 'text', 'target', 'index'], axis=1, inplace=True)
result

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,1,-0.000914,0.001169,-0.000346,0.000152,-0.000759,0.001435,0.000778,-0.000480,-0.001063,...,0.000956,0.000027,0.000489,0.000744,0.000227,0.001487,-0.000271,0.001281,-3.336935e-04,0.000022
1,4,-0.000749,0.000456,0.000680,0.000848,0.000872,-0.000065,-0.001191,0.001173,0.000031,...,0.000979,0.000518,-0.000211,0.000873,-0.001455,-0.000227,-0.000516,0.000884,1.073842e-03,0.000725
2,5,-0.001660,-0.000524,-0.001411,-0.000174,0.001364,-0.000190,0.000795,-0.001603,-0.000932,...,0.000940,0.001585,-0.000529,0.001640,-0.000402,0.000018,-0.000404,-0.000690,-1.952253e-04,-0.000098
3,6,-0.000087,-0.000108,-0.000470,0.001614,-0.000261,0.001095,-0.001555,0.000959,-0.000088,...,0.000961,-0.000535,0.001282,-0.000450,-0.000194,-0.000139,0.001590,-0.001265,-3.894979e-07,-0.001015
4,7,-0.001274,-0.001448,-0.000813,-0.001231,-0.001224,0.000472,-0.001259,-0.000780,-0.000790,...,0.000322,0.000684,0.000712,0.001153,0.001323,0.001048,-0.001626,-0.000601,4.613861e-04,0.000640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0.000991,0.000747,-0.001545,-0.001012,0.000559,0.001376,-0.000005,0.000895,-0.000939,...,0.000577,0.000265,-0.001232,0.001085,0.000474,0.000028,0.001635,-0.001577,2.386603e-04,0.000092
7609,10870,0.000806,-0.001009,-0.000908,-0.000332,-0.001037,0.000054,0.000621,-0.000835,-0.000904,...,-0.000674,0.001515,0.001086,-0.001208,-0.000289,-0.001187,-0.001169,0.000668,-1.533359e-03,0.000232
7610,10871,0.001191,-0.001628,0.000886,-0.001358,0.000524,-0.000918,-0.000807,0.001065,0.000148,...,0.001201,-0.001399,0.001196,-0.001363,-0.000858,-0.000529,-0.000612,0.000412,9.229602e-04,0.000786
7611,10872,0.000815,-0.000693,-0.000773,-0.000458,0.001138,-0.000685,-0.000176,-0.000235,-0.000326,...,-0.001317,-0.001101,0.000050,-0.001117,-0.001338,0.000814,0.000417,-0.000947,6.898775e-04,-0.001651


In [218]:
df_train = getTrainSet()
df_train

Unnamed: 0,id,target,lat,lon,source,entities_count,words_count,punctuations_signs_count,hashtags_count,mentions_count,urls_count,stopwords_count,words_length_avg,punctuations_ratio,hashtags_ratio,mentions_ratio,urls_ratio,stopwords_ratio,special_entities_ratio,keyword_cv_mean_enc
0,1,1.0,,,train,13,6,0,1,0,0,6,4.666667,0.000000,0.076923,0.000000,0.000000,0.461538,0.538462,0.660714
1,4,1.0,,,train,8,7,1,0,0,0,0,4.428571,0.125000,0.000000,0.000000,0.000000,0.000000,0.125000,0.660714
2,5,1.0,,,train,25,11,3,0,0,0,11,7.090909,0.120000,0.000000,0.000000,0.000000,0.440000,0.560000,0.660714
3,6,1.0,,,train,8,5,1,1,0,0,1,7.800000,0.125000,0.125000,0.000000,0.000000,0.125000,0.375000,0.660714
4,7,1.0,,,train,16,7,0,2,0,0,7,4.571429,0.000000,0.125000,0.000000,0.000000,0.437500,0.562500,0.660714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7498,10869,1.0,,,train,11,8,0,0,0,1,2,5.750000,0.000000,0.000000,0.000000,0.090909,0.181818,0.272727,0.660714
7499,10870,1.0,,,train,22,9,2,0,2,0,9,6.222222,0.090909,0.000000,0.090909,0.000000,0.409091,0.590909,0.870968
7500,10871,1.0,,,train,15,5,7,0,0,1,2,4.200000,0.466667,0.000000,0.000000,0.066667,0.133333,0.666667,0.666667
7501,10872,1.0,43.653226,-79.383184,train,21,14,2,0,0,0,5,7.357143,0.095238,0.000000,0.000000,0.000000,0.238095,0.333333,0.564103


In [220]:
df_train = df_train.merge(result, how='inner', left_on='id', right_on='id')
df_train

Unnamed: 0,id,target,lat,lon,source,entities_count,words_count,punctuations_signs_count,hashtags_count,mentions_count,...,290,291,292,293,294,295,296,297,298,299
0,1,1.0,,,train,13,6,0,1,0,...,0.000956,0.000027,0.000489,0.000744,0.000227,0.001487,-0.000271,0.001281,-3.336935e-04,0.000022
1,4,1.0,,,train,8,7,1,0,0,...,0.000979,0.000518,-0.000211,0.000873,-0.001455,-0.000227,-0.000516,0.000884,1.073842e-03,0.000725
2,5,1.0,,,train,25,11,3,0,0,...,0.000940,0.001585,-0.000529,0.001640,-0.000402,0.000018,-0.000404,-0.000690,-1.952253e-04,-0.000098
3,6,1.0,,,train,8,5,1,1,0,...,0.000961,-0.000535,0.001282,-0.000450,-0.000194,-0.000139,0.001590,-0.001265,-3.894979e-07,-0.001015
4,7,1.0,,,train,16,7,0,2,0,...,0.000322,0.000684,0.000712,0.001153,0.001323,0.001048,-0.001626,-0.000601,4.613861e-04,0.000640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7498,10869,1.0,,,train,11,8,0,0,0,...,0.000577,0.000265,-0.001232,0.001085,0.000474,0.000028,0.001635,-0.001577,2.386603e-04,0.000092
7499,10870,1.0,,,train,22,9,2,0,2,...,-0.000674,0.001515,0.001086,-0.001208,-0.000289,-0.001187,-0.001169,0.000668,-1.533359e-03,0.000232
7500,10871,1.0,,,train,15,5,7,0,0,...,0.001201,-0.001399,0.001196,-0.001363,-0.000858,-0.000529,-0.000612,0.000412,9.229602e-04,0.000786
7501,10872,1.0,43.653226,-79.383184,train,21,14,2,0,0,...,-0.001317,-0.001101,0.000050,-0.001117,-0.001338,0.000814,0.000417,-0.000947,6.898775e-04,-0.001651


In [221]:
from sklearn.model_selection import train_test_split

df_train.drop(['source'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(df_train.loc[:, df_train.columns != 'target'], df_train.loc[:, df_train.columns == 'target'], test_size=.25, random_state=42)

import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [222]:
from xgboost import XGBClassifier

clf = XGBClassifier(**param_dist)

clf.fit(X_train, y_train, eval_set=[(X_test, y_test)] , eval_metric=f1_eval_min)

[0]	validation_0-error:0.28571	validation_0-f1_score_min:0.37274
[1]	validation_0-error:0.28038	validation_0-f1_score_min:0.33805
[2]	validation_0-error:0.27239	validation_0-f1_score_min:0.33729


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[3]	validation_0-error:0.27026	validation_0-f1_score_min:0.32901
[4]	validation_0-error:0.26333	validation_0-f1_score_min:0.32120
[5]	validation_0-error:0.26546	validation_0-f1_score_min:0.32422
[6]	validation_0-error:0.26439	validation_0-f1_score_min:0.32292
[7]	validation_0-error:0.26706	validation_0-f1_score_min:0.32724
[8]	validation_0-error:0.27292	validation_0-f1_score_min:0.33508
[9]	validation_0-error:0.27079	validation_0-f1_score_min:0.33159
[10]	validation_0-error:0.27079	validation_0-f1_score_min:0.33073
[11]	validation_0-error:0.26812	validation_0-f1_score_min:0.32940
[12]	validation_0-error:0.26706	validation_0-f1_score_min:0.33069
[13]	validation_0-error:0.26653	validation_0-f1_score_min:0.32895
[14]	validation_0-error:0.27079	validation_0-f1_score_min:0.33377
[15]	validation_0-error:0.27239	validation_0-f1_score_min:0.33641
[16]	validation_0-error:0.27399	validation_0-f1_score_min:0.33816
[17]	validation_0-error:0.27612	validation_0-f1_score_min:0.34169
[18]	validation_0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# HP Tuning with Grid-Search and K-fold CV

## Tune Complexity of decision trees

In [223]:
#tune max_depth and min_child_weight (complexity of decision trees)
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(3,7)
    for min_child_weight in range(1,4)
]

# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

CV with max_depth=3, min_child_weight=1
	MAE 0.3305166 for 249 rounds
CV with max_depth=3, min_child_weight=2
	MAE 0.33326039999999996 for 140 rounds
CV with max_depth=3, min_child_weight=3
	MAE 0.33350739999999995 for 121 rounds
CV with max_depth=4, min_child_weight=1
	MAE 0.30073159999999993 for 499 rounds
CV with max_depth=4, min_child_weight=2
	MAE 0.3027032 for 499 rounds
CV with max_depth=4, min_child_weight=3
	MAE 0.3020944 for 499 rounds
CV with max_depth=5, min_child_weight=1
	MAE 0.285903 for 498 rounds
CV with max_depth=5, min_child_weight=2
	MAE 0.2905548 for 497 rounds
CV with max_depth=5, min_child_weight=3
	MAE 0.2934714 for 499 rounds
CV with max_depth=6, min_child_weight=1
	MAE 0.27940519999999996 for 498 rounds
CV with max_depth=6, min_child_weight=2
	MAE 0.2850266 for 499 rounds
CV with max_depth=6, min_child_weight=3


KeyboardInterrupt: 

In [224]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(7,11)
    for min_child_weight in range(1,2)
]

min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

CV with max_depth=7, min_child_weight=1
	MAE 0.2770846 for 498 rounds
CV with max_depth=8, min_child_weight=1
	MAE 0.275478 for 465 rounds
CV with max_depth=9, min_child_weight=1
	MAE 0.2770654 for 368 rounds
CV with max_depth=10, min_child_weight=1
	MAE 0.27768060000000006 for 365 rounds


In [225]:
params['max_depth'] = 8
params['min_child_weight'] = 1

## Tune Sampling HP

In [226]:
#tune sampling h-p
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

CV with subsample=1.0, colsample=1.0
	MAE 0.2700538 for 496 rounds
CV with subsample=1.0, colsample=0.9
	MAE 0.2719636 for 499 rounds
CV with subsample=1.0, colsample=0.8
	MAE 0.2720178 for 499 rounds
CV with subsample=1.0, colsample=0.7
	MAE 0.275503 for 461 rounds
CV with subsample=0.9, colsample=1.0
	MAE 0.2729242 for 499 rounds
CV with subsample=0.9, colsample=0.9
	MAE 0.27544399999999997 for 394 rounds
CV with subsample=0.9, colsample=0.8
	MAE 0.274798 for 429 rounds
CV with subsample=0.9, colsample=0.7
	MAE 0.2754156 for 496 rounds
CV with subsample=0.8, colsample=1.0
	MAE 0.275478 for 465 rounds
CV with subsample=0.8, colsample=0.9
	MAE 0.274601 for 498 rounds
CV with subsample=0.8, colsample=0.8
	MAE 0.2758868 for 498 rounds
CV with subsample=0.8, colsample=0.7
	MAE 0.2766946 for 496 rounds
CV with subsample=0.7, colsample=1.0
	MAE 0.2783328 for 455 rounds
CV with subsample=0.7, colsample=0.9
	MAE 0.278574 for 496 rounds
CV with subsample=0.7, colsample=0.8


KeyboardInterrupt: 

In [227]:
params['subsample'] = 1.0
params['colsample_bytree'] = 1.0

## Learning Rate (ETA) tuning

In [228]:
for eta in [.2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=500,
        seed=123,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

CV with eta=0.2
	MAE 0.2711576 for 464 rounds
CV with eta=0.1
	MAE 0.2700538 for 496 rounds
CV with eta=0.05
	MAE 0.274718 for 499 rounds
CV with eta=0.01
	MAE 0.307592 for 499 rounds
CV with eta=0.005


KeyboardInterrupt: 

In [229]:
params['eta'] = 0.1

# Train and Predict

In [230]:
clf = xgb.XGBClassifier(max_depth=8, learning_rate=0.1, objective='binary:logistic', \
                      min_child_weight=1, subsample=1.0, colsample_bytree=1.0, n_estimators=200)

clf.fit(X_train, y_train, eval_metric=f1_eval_min)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1.0,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [235]:
df_test_set = getTestSet()
df_test_set.drop(['target', 'source'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [239]:
dfaaa = pd.DataFrame(test_doc2vec_features).reset_index()
dfaaa

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,0,-0.000508,-0.001547,0.000023,0.000115,-0.001436,0.001582,-0.001281,-0.001282,0.000467,...,0.000436,0.001537,0.001616,0.000985,-0.001099,-0.001416,0.000362,-0.000600,-0.000645,-0.001363
1,1,0.000034,-0.001429,0.001644,-0.001089,-0.000446,-0.000713,0.000036,-0.000074,0.001270,...,0.001334,-0.001355,-0.000359,0.000869,-0.001422,0.000062,0.001136,0.000723,0.001443,0.001222
2,2,-0.000785,0.001624,0.000875,0.000820,0.001232,-0.000087,0.000317,0.000416,0.001646,...,0.000038,0.000531,0.000232,-0.000633,-0.001132,0.000060,0.000839,0.000755,-0.000209,-0.000568
3,3,0.000860,0.001097,0.001275,-0.000057,-0.000134,-0.000732,0.000846,-0.001167,0.001503,...,0.001599,-0.001115,0.001104,-0.001313,-0.000554,0.000195,-0.001488,-0.000922,0.001167,0.001524
4,4,0.000301,-0.001625,-0.001291,0.000160,-0.000022,-0.000529,-0.000687,0.000351,-0.000338,...,0.001448,-0.001580,-0.001168,0.000445,-0.001162,0.000587,-0.000661,0.000897,0.000745,0.000067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,3258,-0.001225,-0.001094,-0.001546,-0.000491,-0.001041,0.001548,-0.000162,-0.001438,0.001343,...,-0.000679,-0.000037,-0.001626,0.000663,0.001517,0.000893,-0.001542,-0.000949,0.000119,-0.001199
3259,3259,0.000194,0.000845,0.001121,0.000714,-0.001010,-0.001005,0.000179,0.000148,0.000209,...,-0.000005,-0.001554,0.000794,-0.000291,0.000833,0.001630,-0.001104,0.000588,-0.001128,0.001572
3260,3260,0.001346,0.001588,0.001393,-0.000639,-0.001057,-0.001297,0.000541,-0.001374,-0.000931,...,-0.001017,-0.001628,0.000511,-0.000101,-0.001523,0.000435,0.000946,0.001216,0.000091,-0.001405
3261,3261,0.001659,0.001420,-0.001499,-0.000944,0.000073,0.000835,0.001446,0.000198,0.000474,...,0.000858,0.000814,0.001631,-0.000366,-0.000804,0.001083,0.000717,-0.000895,0.000271,0.000134


In [241]:
result = pd.concat([df_test, dfaaa], axis=1, sort=False)
result.drop(['keyword', 'location', 'text', 'index'], axis=1, inplace=True)
result

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,0,-0.000508,-0.001547,0.000023,0.000115,-0.001436,0.001582,-0.001281,-0.001282,0.000467,...,0.000436,0.001537,0.001616,0.000985,-0.001099,-0.001416,0.000362,-0.000600,-0.000645,-0.001363
1,2,0.000034,-0.001429,0.001644,-0.001089,-0.000446,-0.000713,0.000036,-0.000074,0.001270,...,0.001334,-0.001355,-0.000359,0.000869,-0.001422,0.000062,0.001136,0.000723,0.001443,0.001222
2,3,-0.000785,0.001624,0.000875,0.000820,0.001232,-0.000087,0.000317,0.000416,0.001646,...,0.000038,0.000531,0.000232,-0.000633,-0.001132,0.000060,0.000839,0.000755,-0.000209,-0.000568
3,9,0.000860,0.001097,0.001275,-0.000057,-0.000134,-0.000732,0.000846,-0.001167,0.001503,...,0.001599,-0.001115,0.001104,-0.001313,-0.000554,0.000195,-0.001488,-0.000922,0.001167,0.001524
4,11,0.000301,-0.001625,-0.001291,0.000160,-0.000022,-0.000529,-0.000687,0.000351,-0.000338,...,0.001448,-0.001580,-0.001168,0.000445,-0.001162,0.000587,-0.000661,0.000897,0.000745,0.000067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,-0.001225,-0.001094,-0.001546,-0.000491,-0.001041,0.001548,-0.000162,-0.001438,0.001343,...,-0.000679,-0.000037,-0.001626,0.000663,0.001517,0.000893,-0.001542,-0.000949,0.000119,-0.001199
3259,10865,0.000194,0.000845,0.001121,0.000714,-0.001010,-0.001005,0.000179,0.000148,0.000209,...,-0.000005,-0.001554,0.000794,-0.000291,0.000833,0.001630,-0.001104,0.000588,-0.001128,0.001572
3260,10868,0.001346,0.001588,0.001393,-0.000639,-0.001057,-0.001297,0.000541,-0.001374,-0.000931,...,-0.001017,-0.001628,0.000511,-0.000101,-0.001523,0.000435,0.000946,0.001216,0.000091,-0.001405
3261,10874,0.001659,0.001420,-0.001499,-0.000944,0.000073,0.000835,0.001446,0.000198,0.000474,...,0.000858,0.000814,0.001631,-0.000366,-0.000804,0.001083,0.000717,-0.000895,0.000271,0.000134


In [242]:
df_test_set = df_test_set.merge(result, how='inner', left_on='id', right_on='id')
df_test_set

Unnamed: 0,id,lat,lon,entities_count,words_count,punctuations_signs_count,hashtags_count,mentions_count,urls_count,stopwords_count,...,290,291,292,293,294,295,296,297,298,299
0,0,,,6,4,0,0,0,0,2,...,0.000436,0.001537,0.001616,0.000985,-0.001099,-0.001416,0.000362,-0.000600,-0.000645,-0.001363
1,2,,,11,6,2,1,0,0,2,...,0.001334,-0.001355,-0.000359,0.000869,-0.001422,0.000062,0.001136,0.000723,0.001443,0.001222
2,3,,,21,10,2,0,0,0,9,...,0.000038,0.000531,0.000232,-0.000633,-0.001132,0.000060,0.000839,0.000755,-0.000209,-0.000568
3,9,,,5,2,1,2,0,0,0,...,0.001599,-0.001115,0.001104,-0.001313,-0.000554,0.000195,-0.001488,-0.000922,0.001167,0.001524
4,11,,,8,5,1,0,0,0,2,...,0.001448,-0.001580,-0.001168,0.000445,-0.001162,0.000587,-0.000661,0.000897,0.000745,0.000067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,,,9,7,2,0,0,0,0,...,-0.000679,-0.000037,-0.001626,0.000663,0.001517,0.000893,-0.001542,-0.000949,0.000119,-0.001199
3259,10865,,,28,18,4,0,0,0,6,...,-0.000005,-0.001554,0.000794,-0.000291,0.000833,0.001630,-0.001104,0.000588,-0.001128,0.001572
3260,10868,,,6,4,0,0,0,1,1,...,-0.001017,-0.001628,0.000511,-0.000101,-0.001523,0.000435,0.000946,0.001216,0.000091,-0.001405
3261,10874,,,9,6,2,0,0,1,0,...,0.000858,0.000814,0.001631,-0.000366,-0.000804,0.001083,0.000717,-0.000895,0.000271,0.000134


In [244]:
preds = clf.predict(df_test_set)
preds

array([1., 1., 1., ..., 1., 1., 1.])

In [245]:
final_preds = pd.Series(preds)

In [246]:
test_ids = df_test_set['id']
test_ids = test_ids.reset_index()
test_ids.drop(['index'], axis=1, inplace=True)

In [247]:
final_preds = pd.Series(preds)
df_preds = pd.concat([test_ids,final_preds],axis=1)
df_preds.columns = ['id', 'target']

In [248]:
df_preds.target = df_preds.target.astype(int)

In [249]:
df_preds.to_csv('XGBoost_submission.csv')
df_preds.tail()

Unnamed: 0,id,target
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
3262,10875,1
