In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from statistics import mean
from sklearn.metrics import f1_score
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from skopt.space import Real, Integer
from skopt import BayesSearchCV

In [None]:
# Read the preprocessed dataset, keeping only the text column and its label
# column; every other column in the CSV is skipped at parse time.
train = pd.read_csv(
    '../data/preprocessed_data.csv',
    sep=',',
    usecols=['comment', 'isSarcastic'],
)


In [None]:
# Cast once up front: comments as strings, labels as integers.
comments = train['comment'].astype(str)
labels = train['isSarcastic'].astype(int)

# Hold out 33% of the rows as the x_BMA_val / y_BMA_val split. The split is
# shuffled with a fixed seed and stratified on the label so both parts keep
# the same class proportions.
x_train, x_BMA_val, y_train, y_BMA_val = train_test_split(
    comments,
    labels,
    test_size=0.33,
    stratify=labels,
    shuffle=True,
    random_state=42,
)

In [None]:
# Bag-of-words features: lowercase the text, keep stop words, and cap the
# vocabulary at 5000 terms.
count_vect = CountVectorizer(stop_words=None, lowercase=True, max_features=5000)

# fit_transform learns the vocabulary from x_train and returns its
# document-term matrix in one pass, rather than tokenizing the whole corpus
# once in fit() and a second time in transform(). count_vect is left fitted,
# so it can still be used to transform other text later.
bow = count_vect.fit_transform(x_train)

In [None]:
# Fixed (non-searched) settings for the XGBoost estimator that is handed to
# the hyperparameter search: logistic objective, log-loss evaluation metric,
# the 'gpu_hist' tree method and a fixed seed.
base_clf_kwargs = dict(
    objective='binary:logistic',
    random_state=42,
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='logloss',
)
clf = xgb.XGBClassifier(**base_clf_kwargs)

In [None]:
def _log_real(low, high):
    """Continuous search dimension on [low, high] with a log-uniform prior."""
    return Real(low, high, prior='log-uniform')


def _log_int(low, high):
    """Integer search dimension on [low, high] with a log-uniform prior."""
    return Integer(low, high, prior='log-uniform')


# Hyperparameter search space. Keys are parameter names set on the XGBoost
# estimator; every numeric range is sampled log-uniformly, and
# 'sampling_method' is a categorical choice between the two listed values.
params = {
    'subsample': _log_real(0.1, 1),
    'max_depth': _log_int(5, 25),
    'sampling_method': ['uniform', 'gradient_based'],
    'lambda': _log_real(0.001, 1),
    'alpha': _log_real(0.001, 1),
    'learning_rate': _log_real(0.001, 0.3),
    'colsample_bytree': _log_real(0.5, 1),
    'gamma': _log_real(0.5, 10),
    'n_estimators': _log_int(50, 1000),
}

In [None]:
# 5-fold stratified cross-validation; shuffling with a fixed seed keeps the
# fold assignment reproducible between runs.
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)

In [None]:
# Bayesian hyperparameter search over `params` for `clf`: 50 sampled
# configurations, each scored by F1 on the `skf` cross-validation folds.
opt = BayesSearchCV(
    estimator=clf,
    search_spaces=params,
    cv=skf,
    scoring='f1',
    n_iter=50,
    random_state=42,
)

In [None]:
# Run the search on the bag-of-words matrix; labels are passed as a plain
# NumPy array rather than a pandas Series.
opt.fit(bow, y_train.to_numpy())

In [23]:
# Display the best cross-validated score found by the search
# (the search was configured with scoring='f1').
opt.best_score_

0.6554571228913858

In [20]:
# Manually rebuild the best classifier.
# The tuned values below are hard-coded rather than read from the search
# object, so this cell can be run without repeating the search.
tuned_params = dict(
    reg_alpha=0.0012908438331038965,
    colsample_bytree=0.9134233454813501,
    gamma=0.5601774433600644,
    reg_lambda=0.009088968514074753,
    learning_rate=0.236980563450212,
    max_depth=18,
    n_estimators=985,
    sampling_method='uniform',
    subsample=0.3394318784339951,
)

# Same fixed settings as the estimator used for the search, plus the tuned values.
clf = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='logloss',
    **tuned_params,
)

In [21]:
# Fit the hand-configured classifier on the full training bag-of-words matrix;
# labels are passed as a plain NumPy array.
clf.fit(bow, y_train.to_numpy())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9134233454813501,
              eval_metric='logloss', gamma=0.5601774433600644, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.236980563450212, max_delta_step=0, max_depth=18,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=985, n_jobs=16, num_parallel_tree=1, random_state=42,
              reg_alpha=0.0012908438331038965, reg_lambda=0.009088968514074753,
              sampling_method='uniform', scale_pos_weight=1,
              subsample=0.3394318784339951, tree_method='gpu_hist',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [29]:
import pickle

# Persist the fitted classifier to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises, so a partially
# buffered write is never left behind on an open handle.
# NOTE(review): pickles are tied to the library versions that produced them
# and must only be loaded from trusted sources.
with open('../classifiers/xgboost.pickled', 'wb') as model_file:
    pickle.dump(clf, model_file)