In [None]:
from statistics import mean

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from thundersvm import SVC

In [None]:
# Load the preprocessed dataset, keeping only the text column and its label.
train = pd.read_csv(
    '../data/preprocessed_data.csv',
    sep=',',
    usecols=['comment', 'isSarcastic'],
)


In [None]:
# Cast once: comments as text, labels as integers (the labels also drive
# the stratification below).
comments = train['comment'].astype(str)
labels = train['isSarcastic'].astype(int)

# Reproducible, shuffled 67/33 split that preserves the label distribution;
# the 33% portion is stored as the *_BMA_val hold-out.
x_train, x_BMA_val, y_train, y_BMA_val = train_test_split(
    comments,
    labels,
    stratify=labels,
    test_size=0.33,
    shuffle=True,
    random_state=42,
)

In [None]:
# Bag-of-words features: comments are lowercased, stop words are kept, and the
# vocabulary is capped at 5000 terms. Other CountVectorizer options
# (e.g. stop_words, lowercase) can be adjusted here.
count_vect = CountVectorizer(stop_words=None, lowercase=True, max_features=5000)

# fit_transform learns the vocabulary on the training comments and builds the
# document-term matrix in a single pass, instead of tokenizing the corpus
# twice with a separate fit() and transform(). count_vect is left fitted.
bow = count_vect.fit_transform(x_train)

In [None]:
# RBF-kernel SVM (thundersvm implementation) with probability estimates
# enabled, a fixed seed and verbose output; used as the base estimator for
# the hyperparameter search.
clf = SVC(
    kernel='rbf',
    probability=True,
    cache_size=1000,
    random_state=42,
    verbose=1,
)

In [None]:
# 5-fold stratified cross-validation, shuffled with a fixed seed so the folds
# are reproducible.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

Searching over all kernels takes too long.  
The **rbf** kernel generally performs better than the other kernels (when using a large C value, e.g. C = 10).

In [None]:
# Search space: only the regularisation strength C is tuned, sampled
# log-uniformly between 0.1 and 100.
params = {'C': Real(low=0.1, high=100, prior='log-uniform')}

In [None]:
# Bayesian optimisation over `params`: 20 sampled configurations, each scored
# by F1 on the stratified folds defined in `skf`.
opt = BayesSearchCV(
    clf,
    params,
    cv=skf,
    scoring='f1',
    n_iter=20,
    random_state=42,
    verbose=3,
)

In [None]:
# Run the search on the bag-of-words training matrix; labels are passed as a
# plain NumPy array.
opt.fit(X=bow, y=y_train.values)