In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [2]:
# Load the airline tweets; the raw tweet text is what gets passed as X to
# cross_val_score in the evaluation cells.
df = pd.read_csv('data/Tweets.csv')
X = df['text'].values

# Encode the three sentiment classes as integer targets.
label_to_int = {'negative': -1, 'neutral': 0, 'positive': 1}
labels = df['airline_sentiment'].map(label_to_int)

# Series.map turns every value that is not a key of the dict into NaN, which
# would silently produce float targets containing NaN. Fail fast instead.
if labels.isna().any():
    raise ValueError(
        "airline_sentiment contains values outside %s" % sorted(label_to_int)
    )
y = labels.values

In [3]:
# Sanity check: X is one-dimensional, with one entry per row of the CSV.
X.shape

(14640,)

In [4]:
# Deterministic 3-fold stratified split shared by every experiment below.
# `random_state` is only consulted when shuffle=True; passing it together with
# the default shuffle=False has no effect and raises ValueError on
# scikit-learn >= 0.24, so it is omitted. The folds are identical to what the
# original call produced on older versions. To shuffle before splitting, pass
# shuffle=True together with random_state.
cv = StratifiedKFold(n_splits=3)

In [5]:
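# Optional, left disabled: presumably clears results cached by the project's
# MemoDecorator -- TODO confirm before relying on it.
# from tools.memorize_decorator import MemoDecorator as MD
# MD.clean_memorized()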

# Testing baseline model (tokenize + clean + ExtraTrees)

In [6]:
# The models.* modules imported in this notebook each expose a factory named
# build_model, so every import is aliased to keep the factories distinguishable.
from models.baseline_model import build_model as baseline_model

# NOTE: `model` is rebound in each later section; run the cells in order.
model = baseline_model()

In [7]:
%%time

# One score per CV fold; with no `scoring` argument the estimator's own
# default scorer is used.
scores = cross_val_score(estimator=model, X=X, y=y, cv=cv)

CPU times: user 7min 46s, sys: 3.08 s, total: 7min 49s
Wall time: 2min 10s


In [8]:
# Summarise the per-fold scores as mean +- standard deviation across folds.
score, std = scores.mean(), scores.std()
print("Score: %.4f +- %.4f" % (score, std))

Score: 0.7472 +- 0.0069


# Testing word2vec-based approach (n_dim=25)

In [9]:
from models.word2vec_model import build_model as w2v_model

# Rebinds `model`: the evaluation cells that follow now score the word2vec
# model built with n_dim=25.
model = w2v_model(n_dim=25)

In [10]:
%%time

# One score per CV fold; with no `scoring` argument the estimator's own
# default scorer is used.
scores = cross_val_score(estimator=model, X=X, y=y, cv=cv)

CPU times: user 848 ms, sys: 132 ms, total: 980 ms
Wall time: 789 ms


In [11]:
# Summarise the per-fold scores as mean +- standard deviation across folds.
score, std = scores.mean(), scores.std()
print("Score: %.4f +- %.4f" % (score, std))

Score: 0.7424 +- 0.0043


# Testing word2vec-based approach (n_dim=200)

In [12]:
# Reuses the w2v_model factory imported in the n_dim=25 section (that cell
# must have been run first); only n_dim changes.
model = w2v_model(n_dim=200)

In [13]:
%%time

# One score per CV fold; with no `scoring` argument the estimator's own
# default scorer is used.
scores = cross_val_score(estimator=model, X=X, y=y, cv=cv)

CPU times: user 948 ms, sys: 76 ms, total: 1.02 s
Wall time: 621 ms


In [14]:
# Summarise the per-fold scores as mean +- standard deviation across folds.
score, std = scores.mean(), scores.std()
print("Score: %.4f +- %.4f" % (score, std))

Score: 0.7926 +- 0.0061


# Testing ensemble

In [15]:
from models.ensemble_model import build_model as ensemble_model

# Rebinds `model` to the ensemble built with the factory's default arguments.
# NOTE(review): which base models the ensemble combines is defined in
# models/ensemble_model.py, not visible from this notebook.
model = ensemble_model()

In [16]:
%%time

# One score per CV fold; with no `scoring` argument the estimator's own
# default scorer is used.
scores = cross_val_score(estimator=model, X=X, y=y, cv=cv)

CPU times: user 7min 14s, sys: 2.8 s, total: 7min 16s
Wall time: 2min 4s


In [17]:
# Summarise the per-fold scores as mean +- standard deviation across folds.
score, std = scores.mean(), scores.std()
print("Score: %.4f +- %.4f" % (score, std))

Score: 0.7441 +- 0.0254
