In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction import stop_words
import numpy as np
from gensim.models import KeyedVectors

In [3]:
# Load the pretrained Google News word2vec embeddings; the file is in
# word2vec's binary format, hence binary=True.
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(fname=filename, binary=True)

In [17]:
# Analogy sanity check: vectors for 'woman' and 'king' are added, 'man' is
# subtracted, and the five nearest words are listed.
result = model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=5,
)
print(result)

[('queen', 0.7118192911148071), ('monarch', 0.6189674735069275), ('princess', 0.5902431011199951), ('crown_prince', 0.549946129322052), ('prince', 0.5377321243286133)]


In [36]:
# Closest single word to 'weekend' once 'fun' is subtracted from it.
result = model.most_similar(
    positive=['weekend'],
    negative=['fun'],
    topn=1,
)
print(result)

[('week', 0.5004594922065735)]


In [8]:
# Single nearest neighbour of the token 'Trump' in the embedding space.
result = model.most_similar(positive=['Trump'], topn=1)
print(result)

[('Donald_Trump', 0.8103919625282288)]


In [6]:
# Shape of one query vector, e.g. (300,). Read from the model itself instead of
# probing an arbitrary vocabulary word; kept as a tuple so it can be passed
# straight to np.zeros as before.
vec_length = (model.vector_size,)


def load_query_vectors(path):
    """Parse one TREC file into per-query feature vectors and integer labels.

    Every non-blank line is read as ``<label> <token> <token> ...``: the first
    whitespace-separated field is converted with ``int`` and the remaining
    fields are the query tokens. A query's feature vector is the SUM of the
    word2vec vectors of its tokens, skipping tokens found in
    ``stop_words.ENGLISH_STOP_WORDS`` and tokens the model does not know.

    Parameters
    ----------
    path : str
        Location of the TREC file to read.

    Returns
    -------
    (list of np.ndarray, list of int)
        One float64 vector of shape ``vec_length`` and one label per non-blank
        line, in file order.

    Raises
    ------
    ValueError
        If the first field of a non-blank line is not an integer.
    """
    vectors, targets = [], []
    with open(path) as f:
        for line in f:
            tokens = line.split()  # split once, reuse for label and query
            if not tokens:
                # Blank line (e.g. a trailing newline): nothing to parse.
                # Previously this raised IndexError on line.split()[0].
                continue
            label, query = tokens[0], tokens[1:]
            targets.append(int(label))

            query_vec = np.zeros(vec_length, dtype='float64')
            for word in query:
                # NOTE(review): this membership test is case-sensitive; if the
                # stop-word list is all lowercase, capitalised tokens such as
                # 'What' are NOT filtered. Confirm that is intended.
                if word in stop_words.ENGLISH_STOP_WORDS:
                    continue
                try:
                    # summing all wordvecs to get queryvec
                    query_vec += model.word_vec(word)
                except KeyError:
                    # Out-of-vocabulary token: deliberately contributes nothing.
                    pass
            vectors.append(query_vec)
    return vectors, targets


# Both files are read into ONE pool (train first, then test); the split into
# training and evaluation data is made later from this combined array.
feature_vecs = []
labels = []
for path in ('data/TREC.train', 'data/TREC.test'):
    vectors, targets = load_query_vectors(path)
    feature_vecs.extend(vectors)
    labels.extend(targets)

feature_vecs = np.asarray(feature_vecs)
labels = np.asarray(labels)
print(feature_vecs.shape)
print(labels.shape)

(5952, 300)
(5952,)


In [7]:
# Hold out 10% of the pooled queries for evaluation. feature_vecs/labels contain
# both TREC files, so this is a fresh random partition with a fixed seed.
train_features, test_features, train_labels, test_labels = train_test_split(
    feature_vecs,
    labels,
    test_size=0.10,
    random_state=42,
)

# Report the shape of each partition as a quick sanity check.
shape_report = (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
)
for caption, split in shape_report:
    print(caption, split.shape)

Training Features Shape: (5356, 300)
Training Labels Shape: (5356,)
Testing Features Shape: (596, 300)
Testing Labels Shape: (596,)


# Random Forest = 71% accuracy

In [8]:
# Fit a random forest at several ensemble sizes and report the held-out
# misclassification rate for each size.
for n in (1, 10, 100, 500):
    rf = RandomForestClassifier(n_estimators=n, random_state=42, n_jobs=-1)
    rf.fit(train_features, train_labels)
    predictions = rf.predict(test_features)
    # Error = (number of test queries whose predicted label differs) / (test size).
    n_wrong = np.count_nonzero(test_labels != predictions)
    error = n_wrong / test_labels.size
    print('Error with n = {} trees: {}'.format(n, error))

Error with n = 1 trees: 0.6073825503355704
Error with n = 10 trees: 0.4714765100671141
Error with n = 100 trees: 0.29194630872483224
Error with n = 500 trees: 0.28859060402684567


# Gradient Boosted Regression Trees = 78% accuracy

In [39]:
# Fit gradient-boosted trees at several ensemble sizes and report the held-out
# misclassification rate for each size.
for n in [1, 10, 100, 500]:
    # random_state is pinned so repeated runs build the same ensemble, in line
    # with the seeded random forest, SGD and MLP runs in this notebook.
    # NOTE(review): criterion='mse' is kept as-is for the scikit-learn version
    # this notebook was written against; newer releases may have renamed it.
    gbrt = GradientBoostingClassifier(n_estimators=n,
                                      max_depth=3,
                                      criterion='mse',
                                      random_state=42)
    gbrt.fit(train_features, train_labels)
    predictions = gbrt.predict(test_features)
    # Error = fraction of test queries whose predicted label is wrong.
    results = test_labels - predictions
    error = results[results != 0].size/results.size
    print('Error with n = {} trees: {}'.format(n, error))

Error with n = 1 trees: 0.5251677852348994
Error with n = 10 trees: 0.436241610738255
Error with n = 100 trees: 0.26677852348993286
Error with n = 500 trees: 0.2214765100671141


# Linear Classifiers with SGD = 79% accuracy

## SVM, Logistic Regression, Quadratic SVM, Perceptron

### Penalties: L2 = sum of squared weights, L1 = sum of absolute values of weights, ElasticNet = combination of both

## Testing all possible combinations of parameters

In [18]:
# Exhaustive sweep over SGD loss function, penalty type, alpha and epoch count.
# Every combination is trained from scratch and its held-out misclassification
# rate is printed.

# Settings shared by every run in the sweep (hoisted out of the loops).
sgd_fixed = dict(l1_ratio=0.15,
                 fit_intercept=True,
                 tol=None,
                 shuffle=True,
                 verbose=0,
                 n_jobs=-1,
                 random_state=70,
                 learning_rate='optimal',
                 power_t=0.5,
                 warm_start=False,
                 average=False)

for alg in ['hinge', 'log', 'squared_hinge', 'perceptron']:
    for penalty in ['none', 'l2', 'l1', 'elasticnet']:
        for alpha in [1.0, 0.1, 0.01, 0.001]:
            for n in [1, 10, 100, 500, 1000]:
                sgd = SGDClassifier(loss=alg,
                                    penalty=penalty,
                                    alpha=alpha,
                                    max_iter=n,
                                    **sgd_fixed)
                sgd.fit(train_features, train_labels)
                predictions = sgd.predict(test_features)
                # Error = fraction of test queries whose predicted label is wrong.
                results = test_labels - predictions
                error = results[results != 0].size/results.size
                # The swept value is SGDClassifier's `alpha` (the regularization
                # multiplier), not a learning rate, so the report names it as such.
                print('Error using {} with {} penalty, alpha {} and n = {} epochs: {}'.format(alg, penalty, alpha, n, error))

Error using hinge with none penality, learning rate 1.0 and n = 1 epochs: 0.49328859060402686
Error using hinge with none penality, learning rate 1.0 and n = 10 epochs: 0.47315436241610737
Error using hinge with none penality, learning rate 1.0 and n = 100 epochs: 0.44798657718120805
Error using hinge with none penality, learning rate 1.0 and n = 500 epochs: 0.41946308724832215
Error using hinge with none penality, learning rate 1.0 and n = 1000 epochs: 0.41946308724832215
Error using hinge with none penality, learning rate 0.1 and n = 1 epochs: 0.40939597315436244
Error using hinge with none penality, learning rate 0.1 and n = 10 epochs: 0.36577181208053694
Error using hinge with none penality, learning rate 0.1 and n = 100 epochs: 0.3271812080536913
Error using hinge with none penality, learning rate 0.1 and n = 500 epochs: 0.3187919463087248
Error using hinge with none penality, learning rate 0.1 and n = 1000 epochs: 0.3187919463087248
Error using hinge with none penality, learning 

Error using log with none penality, learning rate 0.1 and n = 10 epochs: 0.3808724832214765
Error using log with none penality, learning rate 0.1 and n = 100 epochs: 0.34731543624161076
Error using log with none penality, learning rate 0.1 and n = 500 epochs: 0.33053691275167785
Error using log with none penality, learning rate 0.1 and n = 1000 epochs: 0.3271812080536913
Error using log with none penality, learning rate 0.01 and n = 1 epochs: 0.3640939597315436
Error using log with none penality, learning rate 0.01 and n = 10 epochs: 0.32046979865771813
Error using log with none penality, learning rate 0.01 and n = 100 epochs: 0.30033557046979864
Error using log with none penality, learning rate 0.01 and n = 500 epochs: 0.29194630872483224
Error using log with none penality, learning rate 0.01 and n = 1000 epochs: 0.28691275167785235
Error using log with none penality, learning rate 0.001 and n = 1 epochs: 0.2986577181208054
Error using log with none penality, learning rate 0.001 and n

Error using squared_hinge with none penality, learning rate 0.01 and n = 100 epochs: 0.3053691275167785
Error using squared_hinge with none penality, learning rate 0.01 and n = 500 epochs: 0.2986577181208054
Error using squared_hinge with none penality, learning rate 0.01 and n = 1000 epochs: 0.30033557046979864
Error using squared_hinge with none penality, learning rate 0.001 and n = 1 epochs: 0.28187919463087246
Error using squared_hinge with none penality, learning rate 0.001 and n = 10 epochs: 0.2516778523489933
Error using squared_hinge with none penality, learning rate 0.001 and n = 100 epochs: 0.27348993288590606
Error using squared_hinge with none penality, learning rate 0.001 and n = 500 epochs: 0.337248322147651
Error using squared_hinge with none penality, learning rate 0.001 and n = 1000 epochs: 0.34060402684563756
Error using squared_hinge with l2 penality, learning rate 1.0 and n = 1 epochs: 0.7298657718120806
Error using squared_hinge with l2 penality, learning rate 1.0 

Error using perceptron with none penality, learning rate 0.01 and n = 100 epochs: 0.3070469798657718
Error using perceptron with none penality, learning rate 0.01 and n = 500 epochs: 0.3053691275167785
Error using perceptron with none penality, learning rate 0.01 and n = 1000 epochs: 0.30033557046979864
Error using perceptron with none penality, learning rate 0.001 and n = 1 epochs: 0.2986577181208054
Error using perceptron with none penality, learning rate 0.001 and n = 10 epochs: 0.2634228187919463
Error using perceptron with none penality, learning rate 0.001 and n = 100 epochs: 0.26677852348993286
Error using perceptron with none penality, learning rate 0.001 and n = 500 epochs: 0.28859060402684567
Error using perceptron with none penality, learning rate 0.001 and n = 1000 epochs: 0.32550335570469796
Error using perceptron with l2 penality, learning rate 1.0 and n = 1 epochs: 0.5067114093959731
Error using perceptron with l2 penality, learning rate 1.0 and n = 10 epochs: 0.49161073

## Test differing alpha values (regularization strength; also drives the 'optimal' learning-rate schedule)

In [26]:
# Narrower sweep: regularized penalties only, longer training runs, and the two
# smallest alpha values. Prints the held-out misclassification rate per run.

# Settings shared by every run in the sweep (hoisted out of the loops).
sgd_fixed = dict(l1_ratio=0.15,
                 fit_intercept=True,
                 tol=None,
                 shuffle=True,
                 verbose=0,
                 n_jobs=-1,
                 random_state=70,
                 learning_rate='optimal',
                 power_t=0.5,
                 warm_start=False,
                 average=False)

for alg in ['hinge', 'log', 'squared_hinge', 'perceptron']:
    for penalty in ['l2', 'l1', 'elasticnet']:
        for n in [100, 500, 1000]:
            for alpha in [.01, .001]:
                sgd = SGDClassifier(loss=alg,
                                    penalty=penalty,
                                    alpha=alpha,
                                    max_iter=n,
                                    **sgd_fixed)
                sgd.fit(train_features, train_labels)
                predictions = sgd.predict(test_features)
                # Error = fraction of test queries whose predicted label is wrong.
                results = test_labels - predictions
                error = results[results != 0].size/results.size
                # The swept value is SGDClassifier's `alpha` (the regularization
                # multiplier), not a learning rate, so the report names it as such.
                print('Error using {} with {} penalty, alpha {} and n = {} epochs: {}'.format(alg, penalty, alpha, n, error))

Error using hinge with l2 penalty, learning rate 0.01 and n = 100 epochs: 0.2197986577181208
Error using hinge with l2 penalty, learning rate 0.001 and n = 100 epochs: 0.22483221476510068
Error using hinge with l2 penalty, learning rate 0.01 and n = 500 epochs: 0.2181208053691275
Error using hinge with l2 penalty, learning rate 0.001 and n = 500 epochs: 0.21476510067114093
Error using hinge with l2 penalty, learning rate 0.01 and n = 1000 epochs: 0.2214765100671141
Error using hinge with l2 penalty, learning rate 0.001 and n = 1000 epochs: 0.21476510067114093
Error using hinge with l1 penalty, learning rate 0.01 and n = 100 epochs: 0.4463087248322148
Error using hinge with l1 penalty, learning rate 0.001 and n = 100 epochs: 0.23825503355704697
Error using hinge with l1 penalty, learning rate 0.01 and n = 500 epochs: 0.42953020134228187
Error using hinge with l1 penalty, learning rate 0.001 and n = 500 epochs: 0.22986577181208054
Error using hinge with l1 penalty, learning rate 0.01 and

# SVM is best with 79% Accuracy

In [34]:
# Compare every loss/penalty pairing at one fixed setting (500 epochs,
# alpha = 0.001) and print the held-out misclassification rate of each.
n = 500
alpha = 0.001

# Keyword arguments that are identical for every classifier below.
sgd_fixed = dict(
    alpha=alpha,
    l1_ratio=0.15,
    fit_intercept=True,
    max_iter=n,
    tol=None,
    shuffle=True,
    verbose=0,
    n_jobs=-1,
    random_state=70,
    learning_rate='optimal',
    power_t=0.5,
    warm_start=False,
    average=False,
)

for alg in ('hinge', 'log', 'squared_hinge', 'perceptron'):
    for penalty in ('l2', 'l1', 'elasticnet'):
        sgd = SGDClassifier(loss=alg, penalty=penalty, **sgd_fixed)
        sgd.fit(train_features, train_labels)
        predictions = sgd.predict(test_features)
        # Error = (number of mismatched test labels) / (test size).
        n_wrong = np.count_nonzero(test_labels != predictions)
        error = n_wrong / test_labels.size
        print('Error using {} with {} penalty: {}'.format(alg, penalty, error))

Error using hinge with l2 penalty: 0.21476510067114093
Error using hinge with l1 penalty: 0.22986577181208054
Error using hinge with elasticnet penalty: 0.2214765100671141
Error using log with l2 penalty: 0.22651006711409397
Error using log with l1 penalty: 0.23657718120805368
Error using log with elasticnet penalty: 0.2231543624161074
Error using squared_hinge with l2 penalty: 0.4228187919463087
Error using squared_hinge with l1 penalty: 0.348993288590604
Error using squared_hinge with elasticnet penalty: 0.4278523489932886
Error using perceptron with l2 penalty: 0.30033557046979864
Error using perceptron with l1 penalty: 0.3691275167785235
Error using perceptron with elasticnet penalty: 0.3523489932885906


# Neural Network - 79% Accuracy

In [35]:
# Train a single-hidden-layer perceptron (100 ReLU units) on the query vectors
# and print its held-out misclassification rate.
max_epochs = 1000  # cap passed as max_iter; the same number is echoed in the report

nn = MLPClassifier(
    hidden_layer_sizes=(100, ),
    activation='relu',
    alpha=0.001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.0001,
    power_t=0.5,
    max_iter=max_epochs,
    shuffle=True,
    random_state=40,
    verbose=False,
)
nn.fit(train_features, train_labels)
predictions = nn.predict(test_features)

# Error = (number of mismatched test labels) / (test size).
n_wrong = np.count_nonzero(test_labels != predictions)
error = n_wrong / test_labels.size
print('Error using n = {} epochs: {}'.format(max_epochs, error))

Error using n = 1000 epochs: 0.2063758389261745
