# Logistic regression (using GloVe)
The idea behind this approach is to average the word vectors of every tweet and to use these averaged vectors as features to train a logistic regression classifier. <br> Cells that need user input have a <font color='blue'> blue title</font>

## I) Imports

In [1]:
import numpy as np
import pickle

from logreg import *
from helpers import *

%load_ext autoreload
%autoreload 2

## II) Prepare features

### <font color='blue'> Define file paths</font>

In [2]:
# embedding matrix, read with np.load further down
path_embeddings = 'embeddings/embeddings200.npy'

# training tweets: one file for the positive class, one for the negative class
path_train_pos, path_train_neg = 'data/train_pos.txt', 'data/train_neg.txt'

# name of the csv file the predictions are written to
submission_filename = 'logreg_submission.csv'

### <font color='blue'> Define hyperparameters</font>

In [3]:
# Optimisation settings; all of them are passed to reg_logistic_regression
# in the training cell.
epochs = 60
batch_size = 1000
gamma = 1e-4      # presumably the SGD step size -- confirm in logreg.py
lambda_ = 1e-2    # presumably the regularisation strength -- confirm in logreg.py
# 50000 divided by the batch size, truncated to an integer
print_every = int(50000 / batch_size)

# Choose whether to apply add_offset and whether to standardize the features.
# NOTE(review): the flag `standardize` has the same name as the function that
# the training cell calls as `standardize(x_train_log)`, so it shadows that
# function once this cell has run. Consider renaming the flag in all cells.
offset = True
standardize = False

### Load our GloVe word embeddings from file ...

In [4]:
# vocabulary: used below to look up, per word, a row index into `embeddings`
# (pickle executes code on load -- only open a vocab.pkl you created yourself)
with open('vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# embedding matrix; its second dimension is the feature size used below
embeddings = np.load(path_embeddings)

### Average word vectors over tweets

In [None]:
# Growing x_train one row at a time is impractically slow, and the number of
# usable tweets is not known in advance because tweets without any embedded
# word are skipped. We therefore over-allocate x_train / y_train and trim
# them to the number of rows that were actually filled.


def _mean_embedding(text):
    """Return the mean embedding of the in-vocabulary words of `text`.

    Words without an entry in `vocab` are ignored. Returns None when no
    word of `text` has an embedding.
    """
    rows = [vocab[word] for word in text.strip().split() if word in vocab]
    if not rows:
        return None
    return embeddings[rows].mean(axis=0)


def _fill_from_file(path, label, x, y, start):
    """Write one averaged row per usable tweet of `path` into `x` / `y`.

    Rows are written contiguously beginning at index `start`, and `label`
    is stored in `y` for each of them. Returns the index one past the last
    row written, i.e. the total number of rows filled so far.
    """
    row = start
    with open(path) as f:
        for line in f:
            mean = _mean_embedding(line)
            if mean is None:
                # skip tweets for which we have no embedding at all
                continue
            x[row, :] = mean
            y[row] = label
            row += 1
            # report only right after a row was added, so a skipped tweet
            # cannot repeat the previous message (or print "0")
            if row % 100000 == 0:
                print(str(row), " tweets processed")
    return row


# Process training tweets
allocate_columns = 3000000  # number of rows reserved (upper bound on tweets)
x_train = np.zeros((allocate_columns, embeddings.shape[1]))
y_train = np.zeros(allocate_columns)

counter = _fill_from_file(path_train_pos, 1, x_train, y_train, 0)
counter = _fill_from_file(path_train_neg, -1, x_train, y_train, counter)

# Keep exactly the rows that were filled. Copying (instead of keeping a view)
# lets the oversized buffers be garbage-collected.
x_train = x_train[:counter].copy()
y_train = y_train[:counter].copy()

# Shuffle tweets
x_train, y_train = shuffle(x_train, y_train)

100000  tweets processed


In [None]:
# Process test tweets
allocate_columns = 100000  # number of rows reserved; trimmed after the loop
x_submission = np.zeros((allocate_columns, embeddings.shape[1]))
# fallback features: mean over all rows of the embedding matrix, kept 2-D
embeddings_mean = np.expand_dims(np.mean(embeddings, axis=0), axis=0)
counter = 0

with open('test_data.txt') as f:
    for line in f:
        # filter out the ID and the first comma
        # (raises ValueError on a line that contains no comma)
        tweet = line[(line.index(",") + 1):]
        # row indices of the words for which we have an embedding
        rows = [vocab[word] for word in tweet.strip().split() if word in vocab]
        if rows:
            x_submission[counter, :] = embeddings[rows].mean(axis=0)
        else:
            # in case that we have no embedding for any word of the tweet
            # just use the overall mean of the embeddings
            x_submission[counter, :] = embeddings_mean
        counter += 1
        if counter % 5000 == 0:
            print(str(counter), " tweets processed")

# Cut the unused rows. Every tweet wrote exactly one row, so the first
# `counter` rows are the data; filtering on "feature != 0" instead could drop
# a real tweet and shift all following submission IDs.
x_submission = x_submission[:counter].copy()

5000  tweets processed
10000  tweets processed


## III) Train the model

In [None]:
# set aside a small portion for validation
testset = 10000

x_test = x_train[0:testset, :]
y_test = y_train[0:testset]
x_train_log = x_train[testset + 1:, :]
y_train_log = y_train[testset + 1:]

if standardize == True:
    x_train_log, mean, std = standardize(x_train_log)

if offset == True:
    x_train_log = add_offset(x_train_log)

# train using logistic regression (SGD)
initial_w = np.random.rand(x_train_log.shape[1])
weights, loss = reg_logistic_regression(y_train_log, x_train_log, initial_w, epochs, batch_size, gamma, lambda_, print_every)

# free up memory
del x_train_log
del y_train_log

epoch	 1 	loss:  853.4164276410547
epoch	 1 	loss:  795.5189160441892
epoch	 1 	loss:  789.7773493339978
epoch	 2 	loss:  779.3013834510737
epoch	 2 	loss:  778.2169460365299
epoch	 2 	loss:  768.5239878244789
epoch	 3 	loss:  759.6393048285314
epoch	 3 	loss:  758.4506480348911
epoch	 3 	loss:  754.7865378106121
epoch	 4 	loss:  745.6170809548775
epoch	 4 	loss:  741.1574477338334
epoch	 4 	loss:  741.8580200421977
epoch	 5 	loss:  732.6044712091044
epoch	 5 	loss:  731.2572481762014
epoch	 5 	loss:  728.2809328913614
epoch	 6 	loss:  721.42103260051
epoch	 6 	loss:  718.0030269014238
epoch	 6 	loss:  717.3540687766836
epoch	 7 	loss:  712.257444169504
epoch	 7 	loss:  707.3484109267555
epoch	 7 	loss:  708.3399215215265
epoch	 8 	loss:  701.8345628840065
epoch	 8 	loss:  702.3227469942924
epoch	 8 	loss:  701.4123006237901
epoch	 9 	loss:  695.2529400678187
epoch	 9 	loss:  693.3851979820482
epoch	 9 	loss:  692.8900604718871
epoch	 10 	loss:  689.367144046254
epoch	 10 	loss:  684.3

## IV) Test predictions

### Tests on a local validation set

In [None]:
# Start from the raw validation features so that x_test_log is defined for
# every combination of the two flags, and so that the offset is added on top
# of the standardized features instead of replacing them.
x_test_log = x_test

if standardize == True:
    # reuse the statistics (mean, std) obtained on the training split
    x_test_log = standardize_test(x_test_log, mean, std)

if offset == True:
    x_test_log = add_offset(x_test_log)

y_pred = predict_logistic_labels(weights, x_test_log)
accuracy = get_accuracy(y_test, y_pred)
print("Accuracy on validation set: {:.4f}".format(accuracy))

### Predict labels for the test dataset, prepare submission csv file

In [None]:
# Start from the raw submission features so that x_submission_log exists
# before it is first read, is defined for every combination of the two flags,
# and keeps the standardization when the offset is added afterwards.
x_submission_log = x_submission

if standardize == True:
    # reuse the statistics (mean, std) obtained on the training split
    x_submission_log = standardize_test(x_submission_log, mean, std)

if offset == True:
    x_submission_log = add_offset(x_submission_log)

y_submission = predict_logistic_labels(weights, x_submission_log)

# we need to add IDs to meet the submission interface requirements
# (IDs are 1, 2, ..., one per predicted label, in file order)
ids = np.arange(len(y_submission)) + 1
create_csv_submission(ids, y_submission, submission_filename)