# GloVe embedding method

In [1]:
import csv
import numpy as np
import re
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from sklearn import svm
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn import metrics, model_selection
from nltk.tokenize import WordPunctTokenizer



First we load the pre-trained GloVe Twitter word vectors, stored in word2vec text format so that gensim can read them. The first set has 200-dimensional vectors, the second has 100-dimensional vectors and the last one has 50-dimensional vectors.

In [None]:
# 200-dimensional GloVe Twitter vectors, read from a word2vec-format text file.
wv_from_text200 = KeyedVectors.load_word2vec_format(
    'pretrained_word2vec/glove.twitter.27B.200d.txt',
    binary=False,
)

In [2]:
# 100-dimensional GloVe Twitter vectors, read from a word2vec-format text file.
wv_from_text100 = KeyedVectors.load_word2vec_format(
    'pretrained_word2vec/glove.twitter.27B.100d.txt',
    binary=False,
)

In [None]:
# 50-dimensional GloVe Twitter vectors, read from a word2vec-format text file.
wv_from_text50 = KeyedVectors.load_word2vec_format(
    'pretrained_word2vec/glove.twitter.27B.50d.txt',
    binary=False,
)

In [3]:
def getVecFromWord(word, wv_from_text):
    """Look up the embedding of ``word`` in a set of pre-trained word vectors.

    Args:
        word: Token to look up.
        wv_from_text: Object indexable by word (e.g. a gensim ``KeyedVectors``)
            that raises ``KeyError`` for words it does not know.

    Returns:
        The value stored for ``word``, or ``None`` if the word is not in the
        vocabulary.
    """
    try:
        return wv_from_text[word]
    except KeyError:
        # Only an out-of-vocabulary word is an expected miss. Anything else
        # (KeyboardInterrupt, MemoryError, a wrong argument type, ...) must
        # propagate instead of being silently turned into "unknown word".
        return None

In [4]:
def getEmbeddedMatrix(input_file, wv_from_text, feature_number):
    """Build the averaged word-vector matrix and the label vector for a CSV file.

    The first CSV record is treated as a header and skipped. For every other
    record, column 1 is split on spaces and then on apostrophes; each
    resulting token that ``getVecFromWord`` finds in ``wv_from_text``
    contributes its vector, and the record's features are the mean of those
    vectors. Records without any known token keep an all-zero feature row.
    Column 2 of each record is stored in ``y``.

    Args:
        input_file: Path to the UTF-8 encoded, comma-delimited CSV file.
        wv_from_text: Word-vector lookup handed to ``getVecFromWord``.
        feature_number: Length of each word vector (number of matrix columns).

    Returns:
        A tuple ``(tweet_matrix, y)`` where ``tweet_matrix`` has shape
        ``(n_records, feature_number)`` and ``y`` has shape ``(n_records,)``,
        ``n_records`` being the number of records after the header.
    """
    print('Filename {}'.format(input_file))

    # First pass: count CSV *records* rather than physical lines, so that the
    # preallocated arrays match exactly what the second pass iterates over
    # (a quoted field may legally contain a newline).
    with open(input_file, 'r', encoding='utf-8', newline='') as csv_file:
        num_records = sum(1 for _ in csv.reader(csv_file, delimiter=','))

    # Drop the header from the count; an empty file yields empty outputs.
    num_lines = max(num_records - 1, 0)

    tweet_matrix = np.zeros((num_lines, feature_number))
    y = np.zeros((num_lines,))

    # Second pass: same encoding as the first one, and the handle is closed
    # deterministically by the context manager.
    with open(input_file, 'r', encoding='utf-8', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        next(reader, None)  # skip the header record

        # `num` is the 1-based record number including the header, which keeps
        # the progress log identical to counting from the top of the file.
        for num, line in enumerate(reader, 2):
            row = num - 2
            y[row] = line[2]  # NumPy parses the label string on assignment

            # Log the lines
            if not (num % 500000):
                print(num)

            # Collect the vectors of all in-vocabulary tokens (one lookup each).
            line_array = []
            for word in line[1].split(" "):
                for word_split in word.split("'"):
                    vector = getVecFromWord(word_split, wv_from_text)
                    if vector is not None:
                        line_array.append(vector)

            # Rows with no known token stay at their initial zero vector.
            if line_array:
                tweet_matrix[row][:] = np.mean(np.asarray(line_array), 0)

    return tweet_matrix, y

## Embed vectors

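We need to build the input matrix for our classifiers. To do so, we convert each word of a tweet to its corresponding vector and average these word vectors over the tweet. Words that are not in the vocabulary are skipped, and a tweet without any known word is represented by the zero vector. We work directly with our cleaned Twitter data.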

In [5]:
input_path = 'data/crowdai_cleaned_train.csv'

# Choose the pre-trained vectors to embed with. The feature count is read from
# the vectors themselves, so it cannot get out of sync when switching to
# another set (e.g. wv_from_text50 or wv_from_text200).
word_vectors = wv_from_text100
tx, y = getEmbeddedMatrix(input_path, word_vectors, word_vectors.vector_size)

Filename data/crowdai_cleaned_train.csv
500000
1000000
1500000
2000000


## Validation of the model

To validate our model we will perform a cross-validation with $k_{fold} = 6$.

In [7]:
# Logistic regression with the L-BFGS solver, scored by 6-fold cross-validation
# (test scores only).
logreg = LogisticRegression(solver='lbfgs')
cv_results_logreg = model_selection.cross_validate(
    logreg,
    tx,
    y,
    cv=6,
    return_train_score=False,
)

In [8]:
# Ridge classifier with default settings, scored by 6-fold cross-validation
# (test scores only).
ridge = RidgeClassifier()
cv_results_ridge = model_selection.cross_validate(
    ridge,
    tx,
    y,
    cv=6,
    return_train_score=False,
)

In [6]:
# SGD shuffles the training data randomly; fixing the seed makes the
# cross-validation scores reproducible from one run to the next.
sgd = SGDClassifier(random_state=0)
cv_results_sgd = model_selection.cross_validate(sgd, tx, y, cv = 6, return_train_score = False)



In [10]:
# Pull out the per-fold test scores once, then report mean and spread per model.
logreg_scores = cv_results_logreg['test_score']
ridge_scores = cv_results_ridge['test_score']
sgd_scores = cv_results_sgd['test_score']

print('Mean accuracy for logistic regression {}'.format(np.mean(logreg_scores)))
print('Standard deviation of accuracy for logistic regression {}'.format(np.std(logreg_scores)))

print('Mean accuracy for ridge classifier {}'.format(np.mean(ridge_scores)))
print('Standard deviation of accuracy for ridge classifier {}'.format(np.std(ridge_scores)))

print('Mean accuracy for SGD classifier {}'.format(np.mean(sgd_scores)))
print('Standard deviation of accuracy for SGD classifier {}'.format(np.std(sgd_scores)))

Mean accuracy for logistic regression 0.7446576421648238
Standard deviation of accuracy for logistic regression 0.00141245625617569
Mean accuracy for ridge classifier 0.743555240260417
Standard deviation of accuracy for ridge classifier 0.0013717822841246485
Mean accuracy for SGD classifier 0.7428794258933974
Standard deviation of accuracy for SGD classifier 0.003125925847066541
