In [20]:
import fasttext

# Train an unsupervised fastText model in CBOW mode on the combined tweet corpus.
TRAIN_CORPUS_PATH = 'stage2data/train_all_full_u.txt'
model = fasttext.train_unsupervised(TRAIN_CORPUS_PATH, model='cbow')

In [None]:
# Write the trained model to disk so later cells can reload it without retraining.
CBOW_FULL_MODEL_OUT = "stage2data/fasttext_cbow_train_full_u.bin"
model.save_model(CBOW_FULL_MODEL_OUT)


In [2]:
import fasttext
# Reload the previously saved CBOW model from disk.
CBOW_FULL_MODEL_PATH = "stage2data/fasttext_cbow_train_full_u.bin"
model = fasttext.load_model(CBOW_FULL_MODEL_PATH)



In [3]:
# Pull the loaded model's input matrix so it can be saved separately below.
# NOTE(review): presumably this is the word/subword embedding table — confirm
# against the fastText documentation.
input_matrix = model.get_input_matrix()

In [7]:
import numpy as np 

# Persist the model's input matrix as a .npy file.
# `fix_imports` is intentionally not passed: NumPy ignores it when saving and
# newer releases emit a DeprecationWarning when it is supplied.
np.save('stage2data/cbow_train_all_full_u.npy', input_matrix, allow_pickle=True)

In [8]:
# Embed the positive and negative training tweets: one sentence vector per
# line of each file, stacked into a NumPy array.
#
# Only the trailing newline is removed from each line. Slicing with
# `line[:-1]` would also drop the last real character of a final line that
# has no terminating newline.

# Load positive tweets
with open('stage2data/train_pos_full_u.txt', encoding='utf-8') as f:
    pos_tweets = np.array(
        [model.get_sentence_vector(line.rstrip('\n')) for line in f]
    )

# Load negative tweets
with open('stage2data/train_neg_full_u.txt', encoding='utf-8') as f:
    neg_tweets = np.array(
        [model.get_sentence_vector(line.rstrip('\n')) for line in f]
    )

In [9]:
# Persist the positive / negative sentence-vector arrays as .npy files.
# `fix_imports` is intentionally not passed: NumPy ignores it when saving and
# newer releases emit a DeprecationWarning when it is supplied.
np.save('stage2data/cbow_train_pos_full_u.npy', pos_tweets, allow_pickle=True)
np.save('stage2data/cbow_train_neg_full_u.npy', neg_tweets, allow_pickle=True)

In [10]:
# Load test tweets: embed each line as one sentence vector and save the result.
#
# Only the trailing newline is removed from each line. Slicing with
# `line[:-1]` would also drop the last real character of a final line that
# has no terminating newline.
# NOTE(review): the whole line is embedded as-is; if test_data.txt lines carry
# an "<id>," prefix, that prefix becomes part of the embedded text — confirm
# the file format.
with open('twitter-datasets/test_data.txt', encoding='utf-8') as f:
    test_tweets = np.array(
        [model.get_sentence_vector(line.rstrip('\n')) for line in f]
    )

# `fix_imports` is not passed: NumPy ignores it when saving and newer releases
# deprecate supplying it.
np.save('stage2data/cbow_test.npy', test_tweets, allow_pickle=True)

In [11]:
import fasttext
# Replace `model` with a different saved fastText model; every cell below
# embeds tweets with this one.
CBOW_MODEL_PATH = "models/fasttext_cbow_train.bin"
model = fasttext.load_model(CBOW_MODEL_PATH)



In [12]:
import numpy as np
import pandas as pd

# Embed the positive and negative training tweets with the currently loaded
# model: one sentence vector per line of each file, stacked into a NumPy array.
#
# Only the trailing newline is removed from each line. Slicing with
# `line[:-1]` would also drop the last real character of a final line that
# has no terminating newline.
# NOTE(review): an earlier cell reads files with the same names from
# 'stage2data/' rather than 'twitter-datasets/' — confirm which directory is
# intended here.

# Load positive tweets
with open('twitter-datasets/train_pos_full_u.txt', encoding='utf-8') as f:
    pos_tweets = np.array(
        [model.get_sentence_vector(line.rstrip('\n')) for line in f]
    )

# Load negative tweets
with open('twitter-datasets/train_neg_full_u.txt', encoding='utf-8') as f:
    neg_tweets = np.array(
        [model.get_sentence_vector(line.rstrip('\n')) for line in f]
    )

In [13]:
# Stack the two classes into one feature array: positive rows first, then negative.
all_tweets = np.concatenate([pos_tweets, neg_tweets], axis=0)

In [14]:
# Build labels, shuffle features and labels together, and split 80/20 into
# training and validation sets.

# Labels in class order: 1 for each positive tweet, 0 for each negative tweet.
y = np.concatenate((np.ones(len(pos_tweets)), np.zeros(len(neg_tweets))))

# Rebuild the feature array in the same class order as `y` before shuffling.
# Without this, re-running the cell would permute an already-shuffled
# `all_tweets` against freshly ordered labels and silently misalign them.
all_tweets = np.concatenate((pos_tweets, neg_tweets))

# Seeded generator so the split (and the reported accuracies) are reproducible.
rng = np.random.default_rng(0)
random_idxs = rng.permutation(len(y))

# Apply the same permutation to features and labels to keep rows aligned.
all_tweets = all_tweets[random_idxs]
y = y[random_idxs]

# First 80% of the shuffled rows are used for training, the rest for validation.
N_train = int(0.8*len(y))

train, val = all_tweets[:N_train], all_tweets[N_train:]
y_train, y_val = y[:N_train], y[N_train:]

In [15]:
from sklearn.svm import LinearSVC

# Fit a linear SVM on the training split and report accuracy on both splits.
clf = LinearSVC(
    C=0.03,
    loss='squared_hinge',
    dual=True,
    tol=1e-9,
    random_state=0,
)
clf.fit(train, y_train)

# Accuracy is the fraction of predictions that match the true labels.
train_pred = clf.predict(train)
val_pred = clf.predict(val)
train_acc = (train_pred == y_train).mean()
val_acc = (val_pred == y_val).mean()

report = 'Training set accuracy: {:.2f}% / validation set: {:.2f}%'
print(report.format(100*train_acc, 100*val_acc))

Training set accuracy: 78.42% / validation set: 78.50%
