In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from skopt.space import Real, Integer
from skopt import BayesSearchCV

In [2]:
# Read the CSV, keeping only the comment text column and the isSarcastic
# column that the later cells use.
wanted_columns = ['comment', 'isSarcastic']
train = pd.read_csv('../data/spacy_pre_data.csv', sep=',', usecols=wanted_columns)


In [3]:
# Keep only rows whose comment has between 2 and 50 space-separated pieces
# (both bounds inclusive).
# For any string s, len(s.split(' ')) == s.count(' ') + 1, so the piece count
# is computed once with a vectorised string op instead of running split/len
# over the whole column twice.
# astype(str) is kept so non-string cells are stringified before counting.
piece_counts = train["comment"].astype(str).str.count(" ") + 1
train = train.loc[piece_counts.between(2, 50)]

In [4]:
# Split the filtered rows into a training part and a 33% hold-out part
# (the *_BMA_val names). The split is stratified on the label and shuffled
# with a fixed seed so it is reproducible.
comments = train['comment'].astype(str)
labels = train['isSarcastic'].astype(int)
x_train, x_BMA_val, y_train, y_BMA_val = train_test_split(
    comments,
    labels,
    test_size=0.33,
    stratify=labels,
    shuffle=True,
    random_state=42,
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over lowercased 1- to 3-grams; terms that appear in fewer
# than 5 training documents are dropped (min_df=5). Other vectorizer options,
# e.g. stop words, can be specified here as well.
# NOTE: despite its name, `count_vect` holds a TfidfVectorizer, not a
# CountVectorizer. The name is kept so other cells that reference it still work.
count_vect = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 3),
    min_df=5)

# Learn the vocabulary / IDF weights and build the training matrix in one pass.
# fit_transform gives the same result as fit() followed by transform() on the
# same data, without tokenising x_train twice.
# NOTE(review): the vectorizer is fitted on all of x_train before the
# cross-validated search runs on `bow`, so each validation fold has already
# influenced the vocabulary and IDF weights. Confirm this is acceptable.
bow = count_vect.fit_transform(x_train)

In [6]:
# Multinomial naive Bayes classifier with default settings; it is the
# estimator handed to the hyper-parameter search in a later cell.
clf = MultinomialNB()

In [7]:
# Search space for the tuning run: the classifier's `alpha` parameter,
# sampled with a log-uniform prior between 1 and 10000.
params = dict(
    alpha=Real(low=1, high=10000, prior='log-uniform'),
)

In [8]:
# 5-fold stratified cross-validation splitter, shuffled with a fixed seed so
# the fold assignment is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [9]:
# Bayesian hyper-parameter search over `params`: 50 sampled settings, each
# scored with F1 across the `skf` folds. refit=True asks the search to retrain
# the best setting once the search is done.
search_options = dict(
    n_iter=50,
    scoring='f1',
    cv=skf,
    refit=True,
    random_state=42,
)
opt = BayesSearchCV(estimator=clf, search_spaces=params, **search_options)

In [10]:
# Run the search on the training feature matrix and the training labels.
# The call is left as the cell's last expression so the fitted search object
# is displayed.
opt.fit(bow, y_train.to_numpy())

BayesSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
              estimator=MultinomialNB(), random_state=42, scoring='f1',
              search_spaces={'alpha': Real(low=1, high=10000, prior='log-uniform', transform='identity')})

In [11]:
# Display the best cross-validated score found by the search
# (F1, since the search was built with scoring='f1').
opt.best_score_

0.6935497194979117

In [12]:
import pickle

# Persist the best estimator found by the search. The `with` block guarantees
# the file handle is flushed and closed, even if pickling raises.
# NOTE(review): only the classifier is saved here. The fitted vectorizer
# (`count_vect`) is also needed to turn raw text into features at prediction
# time. Confirm it is persisted elsewhere.
with open('../classifiers/naive_bayes.pickled', 'wb') as model_file:
    pickle.dump(opt.best_estimator_, model_file)