### Logistic regression and KNN on GloVe sentence embeddings

In [1]:
from utils import *

In [2]:
# Load the precomputed GloVe sentence embeddings and the matching labels for the training split.
# NOTE(review): load_embeddings comes from the `utils` star import; presumably it returns a (features, labels) pair — confirm.
x_train, y_train = load_embeddings('../out/glove_train.npy', '../out/labels_train.npy')

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
# Baseline classifier: scikit-learn logistic regression with all-default hyperparameters.
model = LogisticRegression()

In [6]:
# Fit the baseline on the GloVe training embeddings.
# NOTE(review): the output recorded for this cell is a KeyboardInterrupt, i.e. this fit was aborted in the saved run.
model.fit(x_train, y_train)



KeyboardInterrupt: 

In [24]:
# NOTE(review): X_test is not created anywhere in this section (only x_train / x_val are loaded here);
# the name is only assigned further down by train_test_split, so this cell relies on leftover kernel state.
preds = model.predict(X_test)

In [11]:
# Mean accuracy of the logistic-regression model on (X_test, y_test).
# NOTE(review): X_test / y_test are not defined in this section, and the recorded value is identical to the
# score shown in the later logistic-regression section — this output looks carried over; re-run to confirm.
model.score(X_test, y_test)

0.8041691942866597

In [3]:
# Load the GloVe sentence embeddings and labels of the validation split
# (the '.npy' files saved by the GloVe embedding section of this notebook).
x_val, y_val = load_embeddings('../out/glove_val.npy', '../out/labels_val.npy')

In [39]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier that votes over the 3 closest training points.
knn = KNeighborsClassifier(n_neighbors=3)

In [41]:
# Fit KNN on the same training embeddings that were used for the logistic regression.
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [None]:
# Mean accuracy of KNN on the validation split.
# NOTE(review): the cell has no execution count, so no score was recorded for it.
knn.score(x_val, y_val)

In [None]:
# Predict validation labels with the fitted KNN model.
# (`classifier` is never defined in this notebook; the estimator fitted in this section is `knn`.)
y_pred = knn.predict(x_val)

### Getting sentence embeddings from GloVe model

In [5]:
from utils import *
from glove import Glove

In [6]:
# Load the trained GloVe model from disk; its `word_vectors` matrix and its `dictionary`
# (word -> row index into `word_vectors`) are what the following cells use.
glove = Glove.load('../models/glove.model')

In [33]:
# Tweets to embed: the fastText-formatted validation file.
# NOTE(review): the label cell further down tests for '__label__1' inside each line, so the lines presumably still
# carry their fastText label prefix, which embed_tweet would then treat as an ordinary token — confirm.
tweets = load_tweets('../data/ft_val.txt')

In [8]:
import numpy as np

In [9]:
# Prepend an all-zero row so that index 0 can stand for padding / unknown words.
word_vectors = np.concatenate(
    [np.zeros((1, glove.word_vectors.shape[1])), glove.word_vectors],
    axis=0,
)
# Every GloVe row index moves up by one to make room for the zero row.
vocab = {word: row + 1 for word, row in glove.dictionary.items()}
# Sanity check: a known word must still map to its original GloVe vector.
assert np.all(word_vectors[vocab['lol']] == glove.word_vectors[glove.dictionary['lol']])

In [25]:
DIM = word_vectors.shape[1]
def embed_tweet(tweet):
    """Return the mean word vector of the in-vocabulary tokens of ``tweet``.

    The tweet is split on whitespace. Tokens missing from ``vocab`` are looked
    up at row 0 of ``word_vectors`` and are not counted in the divisor. When no
    token is known, the accumulated (un-averaged) sum is returned as is.
    """
    total = np.zeros((DIM,))
    n_known = 0
    for token in tweet.split():
        # Unknown tokens fall back to row 0 (the row reserved for padding / unknown).
        total += word_vectors[vocab.get(token, 0)]
        if token in vocab:
            n_known += 1
    if n_known > 0:
        return total / n_known  # average over the known words only
    return total

In [34]:
# Embed every tweet and stack the per-tweet vectors into a single array (one row per tweet).
embedded = np.array([embed_tweet(t) for t in tweets])

In [35]:
# Persist the validation embeddings; np.save appends the '.npy' extension to the given path.
np.save('../out/glove_val', embedded)

In [15]:
# Derive a +1 / -1 label for each line from the presence of the '__label__1' marker.
# NOTE(review): this is a substring test over the whole line, not only its prefix — confirm no tweet text can contain '__label__1'.
labels = np.array([1 if '__label__1' in t else -1 for t in tweets])

In [16]:
# Persist the labels next to the embeddings (np.save appends '.npy').
np.save('../out/labels_val', labels)

* * *

In [1]:
from utils import *

In [67]:
# Load the cleaned full training set.
# NOTE(review): the second positional argument (True) presumably makes load_tweets return labels alongside the texts — confirm in utils.
X,y = load_tweets('../data/clean/train_full.txt', True)

In [68]:
# Map numeric labels to fastText label tokens: 1 -> '__label__1', anything else -> '__label__-1'.
labels = ['__label__1' if l == 1 else '__label__-1' for l in y]

In [69]:
# Write one "<label> <tweet>" line per example, the input format of fastText's supervised mode.
# The target lives under ../data/clean/ because that is the path the other cells of this notebook
# read (load_tweets / train_supervised use '../data/clean/train_full_fasttext.txt'); the previous
# target '../data/clean_train_full_fasttext.txt' is not read anywhere in this notebook.
with open('../data/clean/train_full_fasttext.txt', 'w') as f:
    for l,x in zip(labels, X):
        f.write(l + ' ' + x + '\n')

* * *

In [2]:
# Load the full labelled training set in fastText format (per the file name).
all_tweets = load_tweets('../data/clean/train_full_fasttext.txt')

In [27]:
# Split into train / validation parts with a fixed seed.
# NOTE(review): 0.8 is presumably the training fraction — consistent with the example counts that
# model.test reports further down (1816385 for the train file vs 454097 for the validation file).
X_train, X_val = split_data(all_tweets, 0.8, seed=0)

In [28]:
# Write both parts to disk: fastText is later trained and evaluated from these file paths.
TRAIN_PATH = '../data/ft_train.txt'
save_tweets(TRAIN_PATH, X_train)
VAL_PATH = '../data/ft_val.txt'
save_tweets(VAL_PATH, X_val)

In [1]:
import fasttext
MODEL_PATH = '../models/ft_auto.bin' # CHANGE THIS PATH
# Train / validation files in fastText format (the same paths the split cell writes to).
TRAIN_PATH = '../data/ft_train.txt'
VAL_PATH = '../data/ft_val.txt'
# Load a previously trained fastText model from disk instead of retraining.
model = fasttext.load_model(MODEL_PATH)




In [2]:
# Evaluate on the validation file; fastText returns (number of examples, precision@1, recall@1).
model.test(VAL_PATH)

(454097, 0.8488494748919284, 0.8488494748919284)

In [3]:
# Same evaluation on the training file, to compare against the validation figure.
model.test(TRAIN_PATH)

(1816385, 0.8687783702243742, 0.8687783702243742)

#### Get sentence embedding from fasttext

In [20]:
from utils import *

In [23]:
# Reload the lines of both split files so that each line can be embedded.
train = load_tweets(TRAIN_PATH)
val = load_tweets(VAL_PATH)

In [25]:
# One fastText sentence vector per line, stacked into one array per split.
# NOTE(review): `np` is not imported explicitly in this section; it presumably comes from the `utils` star import — confirm.
train_embed = np.array([model.get_sentence_vector(t) for t in train])
val_embed = np.array([model.get_sentence_vector(t) for t in val])

In [28]:
# Persist both embedding arrays (np.save appends '.npy' to each path).
np.save('../out/fasttext_train', train_embed)
np.save('../out/fasttext_val', val_embed)

In [53]:
def predict_all(model, test_data):
    """Predict a label and its probability for every item of ``test_data``.

    ``model.predict`` is called once per item and must return a
    ``(labels, probabilities)`` pair; only the first entry of each is kept.
    The label string is stripped of its 9-character ``__label__`` prefix and
    converted to ``int``.

    Returns a tuple ``(labels, probas)`` of two lists aligned with ``test_data``.
    """
    prefix_len = len('__label__')  # == 9, the offset at which the numeric part starts
    labels, probas = [], []
    # map() is lazy, so each item is predicted and parsed before the next one is touched.
    for top_labels, top_probs in map(model.predict, test_data):
        labels.append(int(top_labels[0][prefix_len:]))
        probas.append(top_probs[0])
    return labels, probas

In [191]:
# Write a submission file from `labels`.
# NOTE(review): `labels` is not assigned in this section — presumably the label list returned by predict_all
# on the test set; confirm before re-running, since other cells bind `labels` to different data.
generate_submission('../out/submission_ft_200.csv', labels)

In [113]:
# Base configuration for the supervised fastText runs.
params = {
    'lr' : 0.1,
    'epoch': 5,
    'loss' : 'softmax',
    'wordNgrams': 3,
    'dim' : 200,
    'minCount' : 5
}

# Three configurations are compared, each trained 5 times to expose run-to-run variance:
# the base settings, then minCount=0, then loss='ns' with minCount back at 5.
# `params` is updated in place, so after the cell it holds the last configuration.
for overrides in ({}, {'minCount': 0}, {'loss': 'ns', 'minCount': 5}):
    params.update(overrides)
    print(params)
    for i in range(5):
        model = fasttext.train_supervised(TRAIN_PATH, **params)
        print(model.test(VAL_PATH))

{'lr': 0.1, 'epoch': 5, 'loss': 'softmax', 'wordNgrams': 3, 'dim': 200, 'minCount': 5}
(454097, 0.8372131945377309, 0.8372131945377309)
(454097, 0.8143414292541021, 0.8143414292541021)
(454097, 0.8169443973424181, 0.8169443973424181)
(454097, 0.8324058516132016, 0.8324058516132016)
(454097, 0.8256980336800287, 0.8256980336800287)
{'lr': 0.1, 'epoch': 5, 'loss': 'softmax', 'wordNgrams': 3, 'dim': 200, 'minCount': 0}
(454097, 0.8369621468540862, 0.8369621468540862)
(454097, 0.8178318729258286, 0.8178318729258286)
(454097, 0.7235987024798667, 0.7235987024798667)
(454097, 0.8331171533835282, 0.8331171533835282)
(454097, 0.8265040288748879, 0.8265040288748879)
{'lr': 0.1, 'epoch': 5, 'loss': 'ns', 'wordNgrams': 3, 'dim': 200, 'minCount': 5}
(454097, 0.8120357544753654, 0.8120357544753654)
(454097, 0.6906696146418057, 0.6906696146418057)
(454097, 0.7327619429328976, 0.7327619429328976)
(454097, 0.8007275978480368, 0.8007275978480368)
(454097, 0.8077305069181254, 0.8077305069181254)


In [223]:
# Same settings as the lr=0.1 base configuration, with the learning rate lowered to 0.05.
params = {
    'lr' : 0.05,
    'epoch': 5,
    'loss' : 'softmax',
    'wordNgrams': 3,
    'dim' : 200,
    'minCount' : 5
}

TRAIN_PATH = '../data/ft_train.txt'
VAL_PATH = '../data/ft_val.txt'

print(params)
# Re-split with a different seed on each iteration; the split files on disk are overwritten every time,
# and `model` ends up bound to the model trained on the last split.
# NOTE(review): the recorded accuracies (~0.909) are far above the lr=0.1 runs (~0.69-0.84) — worth double-checking
# that the only difference is the learning rate.
for i in range(2):
    X_train, X_val = split_data(all_tweets, 0.8, seed=i)
    save_tweets(TRAIN_PATH, X_train)
    save_tweets(VAL_PATH, X_val)
    model = fasttext.train_supervised(TRAIN_PATH, **params)
    print(model.test(VAL_PATH))

{'lr': 0.05, 'epoch': 5, 'loss': 'softmax', 'wordNgrams': 3, 'dim': 200, 'minCount': 5}
(454097, 0.908658282261279, 0.908658282261279)
(454097, 0.9091537711105777, 0.9091537711105777)


In [224]:
# Save whatever `model` currently holds — after the lr=0.05 loop that is the model trained on the last split (seed=1).
model.save_model('../models/ft_lr005')

In [10]:
import fasttext
# Files that the ensemble cell rewrites on every iteration.
TRAIN_PATH = '../data/ft_train.txt'
VAL_PATH = '../data/ft_val.txt'

# Hyperparameters for the ensemble runs: the lr=0.05 configuration plus verbose training output.
params = {
    'lr' : 0.05,
    'epoch': 5,
    'loss' : 'softmax',
    'wordNgrams': 3,
    'dim' : 200,
    'minCount' : 5,
    'verbose' : 2
}

In [9]:
# Test tweets for which the submission predictions are produced.
test_data = load_tweets('../data/clean/test.txt')

In [11]:
# Ensemble over `nb_iter` fastText models, each trained on a differently seeded split.
nb_iter = 1
# Running sum of signed confidences (label * probability) per test tweet; divided by nb_iter after the loop.
final_probas = np.zeros((len(test_data),))
# Number of models that voted positive / negative for each test tweet.
pos_count = np.zeros((len(test_data),))
neg_count = np.zeros((len(test_data),))
for i in range(nb_iter):
    # '\r' with end='' rewrites the same output line instead of printing a new one per iteration.
    print('\rTraining %d / %d' % (i+1, nb_iter), end='')
    
    # only train on part of the data for each iteration (say 80 %) ...
    X_train, X_val = split_data(all_tweets, 0.8, seed=i)
    save_tweets(TRAIN_PATH, X_train)
    save_tweets(VAL_PATH, X_val)
    model = fasttext.train_supervised(TRAIN_PATH, **params)
    
    # ... or train on full dataset
    #model = fasttext.train_supervised('../data/clean/train_full_fasttext.txt', **params)
    
    # we can also check the accuracy on validation data (slows things down)
    # print('\r             Val acc: %g' % (model.test(VAL_PATH)[1]), end='')
    
    ls, ps = predict_all(model, test_data)
    ls = np.array(ls)
    # Boolean masks times 1 give 0/1 increments for the vote counters.
    pos_count += 1 * (ls > 0)
    neg_count += 1 * (ls < 0)
    # The label sign turns each probability into a signed confidence before accumulation.
    final_probas += (ls * np.array(ps))
final_probas /= nb_iter
# final_labels = [1 if x >= 0 else -1 for x in final_probas]
# generate_submission('../out/submission_ft_'+str(nb_iter)+'_iter.csv', final_labels)

Training 1 / 1

In [29]:
# Train a fastText classifier without passing any hyperparameters (library defaults), as a reference point.
model = fasttext.train_supervised(TRAIN_PATH)

In [54]:
# Predicted labels (l) and their probabilities (p) for every test tweet.
l, p = predict_all(model, test_data)

In [59]:
# Write the predictions of the default-parameter model as a submission file.
generate_submission('../out/submission_ft_default.csv', l)

In [38]:
# Evaluate with the top 3 predictions per example (k=3).
# NOTE(review): the recorded (0.5, 1.0) is what one would expect if both of the two labels are returned for every
# example (recall 1.0, precision 0.5), so this figure presumably says nothing about accuracy — confirm.
model.test(VAL_PATH, k=3)

(454097, 0.5, 1.0)

In [None]:
# Top-1 evaluation on the validation file (no result was recorded for this cell).
model.test(VAL_PATH)

In [11]:
# Persist the averaged signed confidences of the ensemble.
# NOTE(review): the file name says 7 iterations, while the ensemble cell shown in this notebook sets nb_iter = 1 —
# check which run actually produced `final_probas` before trusting the name.
np.save('../out/final_probas_7_iter', final_probas)

In [20]:
# Majority vote of the ensemble: +1 where positive votes strictly outnumber negative ones,
# -1 otherwise (ties therefore go to -1). The boolean mask is rescaled from {0, 1} to {-1, +1}.
majority = 2 * (pos_count > neg_count) - 1

array([-1, -1, -1, ..., -1,  1, -1])

In [22]:
# Alternative aggregation: sign of the averaged signed confidence, with an exact 0 counted as positive.
final_labels = [1 if x >= 0 else -1 for x in final_probas]

In [23]:
# Count the test tweets on which the two aggregation schemes disagree
# (the list is compared elementwise against the NumPy array `majority`).
sum(final_labels != majority)

16

In [25]:
# Submit the majority-vote labels.
# NOTE(review): '7_iter' is hard-coded in the file name — confirm it matches the nb_iter actually used for `majority`.
generate_submission('../out/submission_ft_7_iter_majority.csv', majority)

* * *

In [1]:
import sent2vec
import numpy
import pickle
# NOTE: from here on `model` is a sent2vec model; the name was bound to fastText / scikit-learn models in other sections.
model = sent2vec.Sent2vecModel()
model.load_model('../models/embed-model-full.bin') # The model can be sent2vec or cbow-c+w-ngrams
# The cells below pair row i of `uni_embs` with the i-th entry of `vocab`.
uni_embs, vocab = model.get_unigram_embeddings() # Return the full unigram embedding matrix

#with open('full-vocab.pkl', 'wb') as f:
#    pickle.dump(vocab, f)

#numpy.save('full-vocab-matrix', uni_embs)

In [2]:
from utils import *

In [3]:
# Cleaned training tweets without labels (per the file name).
X = load_tweets('../data/clean_train_no_label.txt')

In [4]:
# Number of entries in the sent2vec vocabulary.
len(vocab)

42566

In [5]:
# Shape of the unigram embedding matrix; the first dimension matches len(vocab) in the recorded run.
uni_embs.shape

(42566, 700)

In [6]:
# Look-up table from each vocabulary word to the embedding row at the same position in uni_embs.
word2vec = {w: uni_embs[i] for i, w in enumerate(vocab)}

In [21]:
# Position of every word inside `vocab` (0-based), used below to encode tweets as index sequences.
word2index = {w:i for i,w in enumerate(vocab)}

In [18]:
# Length of the longest tweet; used as the width of the index matrices built below.
# NOTE(review): len() of a tweet string counts characters, whereas the matrices are filled per whitespace-separated
# token, so this is an upper bound on the token count rather than the longest token count — confirm this is intended.
max_len = max(len(tweet) for tweet in X)

In [69]:
# Encode every tweet as a fixed-width row of vocabulary indices; positions after the last token stay 0.
# NOTE(review): out-of-vocabulary words are mapped to index 1 and padding is 0, but word2index is built with
# enumerate starting at 0, so both values coincide with real vocabulary entries unless vocab[0] / vocab[1]
# are reserved tokens — confirm.
X_nums = np.zeros((len(X),max_len), dtype='int')
for i,x in enumerate(X):
    for j,w in enumerate(x.split()):
        idx = word2index.get(w, 1)
        X_nums[i][j] = idx

In [70]:
# Save the index-encoded training tweets.
# NOTE(review): despite 'embedded' in the file name, the array holds vocabulary indices, not embedding vectors.
np.save('../out/clean-train-embedded', X_nums)

In [53]:
# NOTE(review): this loads the *test* tweets (per the file name) but binds them to the name X_train — easy to misread.
X_train = load_tweets('../data/clean_test.txt')

In [63]:
# Same index encoding as for X_nums, applied to the tweets loaded from clean_test.txt and reusing max_len as width.
# NOTE(review): a tweet with more than max_len tokens would raise an IndexError in the inner loop.
X_train_nums = np.zeros((len(X_train),max_len), dtype='int')
for i,x in enumerate(X_train):
    for j,w in enumerate(x.split()):
        idx = word2index.get(w, 1)
        X_train_nums[i][j] = idx

In [71]:
# Save the index-encoded test tweets (np.save appends '.npy').
np.save('../out/clean-test-embedded', X_train_nums)

* * *

In [56]:
from utils import *
import pickle

In [2]:
# Load precomputed sentence embeddings and the labels of the cleaned training set.
X,y = load_embeddings('../out/embeddings_train.npy', '../out/clean_train_labels.npy')

In [3]:
# Re-encode the labels; per its name the helper maps {-1, 1} labels to {0, 1} — confirm in utils.
# NOTE: the cell overwrites `y`, so running it twice applies the conversion to already converted labels.
y = np.array(one_minus_one_labels_to_one_zero(y))

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 10 % of the data for evaluation; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Logistic regression with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [8]:
# Fit on the 90 % training part of the split.
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
# Predictions on the hold-out part.
# NOTE: `preds` is reassigned by the later test-set prediction cells.
preds = model.predict(X_test)

In [11]:
# Mean accuracy on the 10 % hold-out.
model.score(X_test, y_test)

0.8041691942866597

In [12]:
# Embeddings of the test set; load_embeddings is called with a single path here (no labels file).
TEST = load_embeddings('../out/embeddings_test.npy')

In [13]:
# Predict the test set with the logistic-regression model.
preds = model.predict(TEST)

In [15]:
# Convert the predictions back for the submission; per its name the helper maps {0, 1} to {-1, 1} — confirm in utils.
preds = one_zero_labels_to_one_minus_one(preds)

In [17]:
# Write the logistic-regression submission file.
generate_submission('../out/sub_lr.csv', preds)

In [57]:
# Serialize the fitted logistic-regression model with pickle.
with open('../models/logistic_regression.pkl', 'wb') as f:
    pickle.dump(model, f)

* * *

In [59]:
# Depth limit of the decision tree; also embedded in the name of the pickled model file.
max_depth = 10

In [49]:
from sklearn.tree import DecisionTreeClassifier
# Depth-limited decision tree with a fixed random_state for reproducibility.
tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)

In [50]:
# Fit on the same train split as the logistic regression.
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [51]:
# Hold-out predictions of the tree (y_pred is not used by the later cells shown in this notebook).
y_pred = tree.predict(X_test)

In [52]:
# Mean accuracy of the tree on the hold-out part.
tree.score(X_test, y_test)

0.6817404731704627

In [53]:
# Predict the test set with the tree, then convert the labels for the submission
# (per its name the helper maps {0, 1} to {-1, 1} — confirm in utils).
preds = tree.predict(TEST)
preds = one_zero_labels_to_one_minus_one(preds)

In [55]:
# Write the decision-tree submission file.
generate_submission('../out/sub_dt.csv', preds)

In [60]:
# Serialize the fitted tree; the depth limit is part of the file name.
with open('../models/decision_tree_'+str(max_depth)+'.pkl', 'wb') as f:
    pickle.dump(tree, f)

* * *

In [64]:
from sklearn.svm import SVC
# Support-vector classifier with default settings apart from a fixed random_state.
svm = SVC(random_state=42)

In [None]:
# Fit the SVM on the train split (this cell has no execution count, so no completed run is recorded).
svm.fit(X_train, y_train)



In [None]:
# Mean accuracy of the SVM on the hold-out part (no result recorded).
svm.score(X_test, y_test)

In [None]:
# Predict the test set with the SVM, then convert the labels for the submission
# (per its name the helper maps {0, 1} to {-1, 1} — confirm in utils).
preds = svm.predict(TEST)
preds = one_zero_labels_to_one_minus_one(preds)

In [None]:
# Write the SVM submission file.
generate_submission('../out/sub_svm.csv', preds)

In [None]:
# Serialize the fitted SVM with pickle.
with open('../models/svm.pkl', 'wb') as f:
    pickle.dump(svm, f)