In [32]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction import stop_words
import numpy as np
from gensim.models import KeyedVectors

In [21]:
# Load the Google News word2vec vectors from the local .bin file.
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(
    fname=filename,
    binary=True,  # the file is stored in the binary word2vec format
)

In [17]:
# Analogy sanity check: words closest to 'woman' + 'king' - 'man' (top 5).
result = model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=5,
)
print(result)

[('queen', 0.7118192911148071), ('monarch', 0.6189674735069275), ('princess', 0.5902431011199951), ('crown_prince', 0.549946129322052), ('prince', 0.5377321243286133)]


In [36]:
# Single closest word to 'weekend' - 'fun'.
result = model.most_similar(
    positive=['weekend'],
    negative=['fun'],
    topn=1,
)
print(result)

[('week', 0.5004594922065735)]


In [157]:
# Nearest neighbour of the token 'Trump' in the embedding space.
result = model.most_similar(
    positive=['Trump'],
    topn=1,
)
print(result)

[('Donald_Trump', 0.8103919625282288)]


In [22]:
# Build one feature vector per question in data/Trec.train by summing the
# word2vec embeddings of its tokens. Each line is expected to look like
# "<integer label> <token> <token> ...".

# Shape of a single embedding, probed with an arbitrary word.
vec_length = model.word_vec('serfdom').shape

feature_vecs = []
labels = []
with open('data/Trec.train') as f:
    for line in f:
        tokens = line.split()
        if not tokens:
            # Skip blank lines instead of failing with IndexError on tokens[0].
            continue
        label, query = tokens[0], tokens[1:]
        labels.append(int(label))

        query_vec = np.zeros(vec_length, dtype='float64')
        for word in query:
            # NOTE(review): this membership test is case-sensitive and the
            # scikit-learn stop-word list is lowercase, so capitalised tokens
            # such as 'What', 'How', 'Where', 'Who', 'When', 'Which' are NOT
            # filtered here. Confirm whether that is intended before changing
            # it, since it alters the features and every score below.
            if word in stop_words.ENGLISH_STOP_WORDS:
                continue
            try:
                # Sum all word vectors to get the query vector.
                query_vec += model.word_vec(word)
            except KeyError:
                # Word is not in the embedding vocabulary; it contributes nothing.
                pass
        feature_vecs.append(query_vec)

feature_vecs = np.asarray(feature_vecs)
labels = np.asarray(labels)
print(feature_vecs.shape)
print(labels.shape)

(5452, 300)
(5452,)


In [23]:
# Hold out 10% of the questions for evaluation, with a fixed seed so the
# split is the same on every run.
train_features, test_features, train_labels, test_labels = train_test_split(
    feature_vecs,
    labels,
    test_size=0.10,
    random_state=42,
)

# Report the shape of each partition.
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (4906, 300)
Training Labels Shape: (4906,)
Testing Features Shape: (546, 300)
Testing Labels Shape: (546,)


# Random Forest = 67% accuracy

In [35]:
# Instantiate the model with 100 decision trees (fixed seed, all CPU cores)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# Train the model on the training data
rf.fit(train_features, train_labels)
# Use the forest's predict method on the held-out test data
predictions = rf.predict(test_features)

# Error rate: fraction of test questions whose predicted label differs from
# the true label (accuracy = 1 - error).
error = np.count_nonzero(predictions != test_labels) / test_labels.size
print(error)

0.3315018315018315


# Gradient Boosted Regression Trees = 68% accuracy

In [37]:
# Gradient boosting with 100 boosting stages of depth-3 trees.
# random_state is fixed so the reported score is reproducible, using the same
# seed as the train/test split and the random forest.
# NOTE(review): newer scikit-learn releases renamed criterion='mse' to
# 'squared_error' -- confirm against the installed version.
gbrt = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=3,
    criterion='mse',
    random_state=42,
)
gbrt.fit(train_features, train_labels)
predictions = gbrt.predict(test_features)

# Error rate: fraction of test questions whose predicted label differs from
# the true label (accuracy = 1 - error).
error = np.count_nonzero(predictions != test_labels) / test_labels.size
print(error)

0.31684981684981683


# Logistic Regression trained with SGD = 74% accuracy

In [26]:
from sklearn.linear_model import SGDClassifier

In [38]:
# Logistic regression (loss='log') with L2 regularisation, trained by SGD.
# tol=None disables early stopping, so training runs for max_iter epochs.
# random_state is fixed because shuffle=True reorders the training data each
# epoch; with random_state=None the reported score changes from run to run.
# NOTE(review): newer scikit-learn releases renamed loss='log' to 'log_loss'
# -- confirm against the installed version.
sgd = SGDClassifier(loss='log',
                    penalty='l2',
                    alpha=0.001,
                    l1_ratio=0.15,  # only used with penalty='elasticnet'
                    fit_intercept=True,
                    max_iter=100,
                    tol=None,
                    shuffle=True,
                    verbose=0,
                    n_jobs=-1,
                    random_state=42,
                    learning_rate='optimal',
                    power_t=0.5,
                    warm_start=False,
                    average=False)
sgd.fit(train_features, train_labels)
predictions = sgd.predict(test_features)

# Error rate: fraction of test questions whose predicted label differs from
# the true label (accuracy = 1 - error).
error = np.count_nonzero(predictions != test_labels) / test_labels.size
print(error)

0.2564102564102564
