In [1]:
import numpy as np 
import pickle 
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Embedding matrix; its rows are addressed by the integer ids stored in `vocab`.
E = np.load('embeddings.npy')

# Word -> row-id lookup for E.
# NOTE(review): pickle.load can execute arbitrary code, so only open a vocab.pkl
# that comes from a trusted source.
with open('vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [2]:
def _read_tokenized_tweets(path):
    """Read one tweet per line from `path` and split each line on whitespace.

    Returns a list with one list of tokens per line of the file.
    """
    # Explicit encoding: tweets routinely contain non-ASCII text and the platform
    # default codec is not UTF-8 everywhere.
    # NOTE(review): assumes the corpus files are UTF-8 encoded — confirm.
    with open(path, encoding='utf-8') as tweet_file:
        return [line.split() for line in tweet_file]


# Load positive tweets
pos_tweets = _read_tokenized_tweets('train_pos.txt')

# Load negative tweets
neg_tweets = _read_tokenized_tweets('train_neg.txt')

In [3]:
def tweet_to_coords(tweet, word_index=None, embeddings=None):
    """Convert a tweet into the average of its embedded words.

    Technique from https://www.sciencedirect.com/science/article/pii/S1877050918301030.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of a single tweet.
    word_index : mapping, optional
        Maps a word to its row in `embeddings`. Defaults to the notebook-level
        `vocab`.
    embeddings : numpy.ndarray, optional
        2-D embedding matrix, one row per known word. Defaults to the
        notebook-level `E`.

    Returns
    -------
    numpy.ndarray
        1-D vector with one entry per embedding column: the mean of the rows of
        the tokens found in `word_index`, or an all-zero vector when none of the
        tokens is known.
    """
    # Fall back to the notebook globals so existing one-argument calls
    # (e.g. map(tweet_to_coords, tweets)) keep working unchanged.
    if word_index is None:
        word_index = vocab
    if embeddings is None:
        embeddings = E

    # Rows of the tokens that have an embedding; unknown words are skipped.
    rows = [word_index[word] for word in tweet if word in word_index]
    if rows:
        return embeddings[rows].mean(axis=0)

    # Size the fallback from the matrix itself rather than hard-coding the width.
    # TODO: find a representation for tweets that have only rare (<5 occurrences) words
    return np.zeros(embeddings.shape[1])
        


In [4]:
SEED = 0              # fixes the shuffle so the train/test split is reproducible
TRAIN_FRACTION = 0.8  # share of the shuffled samples used for training

# One averaged-embedding row per tweet. Building the array directly from the list
# of vectors keeps it 2-D even when a class holds a single tweet (a squeeze()
# would collapse that case to 1-D and break the concatenation below).
pos_tweets_coords = np.array([tweet_to_coords(tweet) for tweet in pos_tweets])
neg_tweets_coords = np.array([tweet_to_coords(tweet) for tweet in neg_tweets])

# Feature matrix and labels: 1 marks a positive tweet, 0 a negative one.
X = np.concatenate((pos_tweets_coords, neg_tweets_coords))
y = np.concatenate((np.ones(len(pos_tweets)), np.zeros(len(neg_tweets))))
assert len(X) == len(y), "features and labels must stay aligned"

# Shuffle features and labels with the same seeded permutation.
random_idxs = np.random.default_rng(SEED).permutation(len(y))
X = X[random_idxs]
y = y[random_idxs]

N_train = int(TRAIN_FRACTION * len(y))

X_train, X_test = X[:N_train], X[N_train:]
y_train, y_test = y[:N_train], y[N_train:]

In [5]:
# Logistic regression fitted with SGD on standardized features.
# random_state pins the SGD shuffling so the reported accuracy is reproducible,
# in line with the MLP and LinearSVC cells of this notebook, which are seeded too.
# NOTE(review): scikit-learn >= 1.1 renames loss='log' to 'log_loss' and 1.3
# drops the old spelling; switch when the environment is upgraded.
clf = make_pipeline(
    StandardScaler(),
    linear_model.SGDClassifier(loss='log', max_iter=1000, tol=1e-4, verbose=True, random_state=0),
)

clf.fit(X_train, y_train)

# Held-out accuracy: fraction of test tweets whose predicted label matches.
(clf.predict(X_test) == y_test).mean()

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier', SGDClassifier(loss='log', tol=0.0001))])

In [9]:
from sklearn.svm import SVC

# Support-vector classifier on standardized features.
svc_steps = (
    StandardScaler(),
    SVC(gamma='auto', tol=1e-3, verbose=True, max_iter=10000),
)
clf = make_pipeline(*svc_steps)

clf.fit(X_train, y_train)

# Held-out accuracy: share of test rows where prediction and label agree.
test_hits = clf.predict(X_test) == y_test
test_hits.mean()

[LibSVM]



0.5829

In [25]:
# Sanity check: dimensions of the training feature matrix.
X_train.shape

(160000, 20)

In [28]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with hidden layers of 10 and 5 units, fitted with the
# 'lbfgs' solver on standardized features.
clf = make_pipeline(
    StandardScaler(),
    MLPClassifier(
        solver='lbfgs',
        alpha=1e-2,
        hidden_layer_sizes=(10, 5),
        random_state=0,
        verbose=True,
        max_iter=1000,
    ),
)

clf.fit(X_train, y_train)

# Held-out accuracy: share of test rows where prediction and label agree.
mlp_test_hits = clf.predict(X_test) == y_test
mlp_test_hits.mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.62475

In [27]:
from sklearn.svm import LinearSVC

# Linear SVM (squared hinge loss, primal formulation) on standardized features.
linear_svc = LinearSVC(random_state=0, tol=1e-5, loss='squared_hinge', dual=False)
clf = make_pipeline(StandardScaler(), linear_svc)

clf.fit(X_train, y_train)

# Held-out accuracy: share of test rows where prediction and label agree.
linear_svc_test_hits = clf.predict(X_test) == y_test
linear_svc_test_hits.mean()

0.601775