In [1]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn
import re
import statsmodels.formula.api

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import gensim
from gensim.models import Word2Vec

In [3]:
# Configure how graphs will show up in this notebook
%matplotlib inline
seaborn.set_context('notebook', rc={'figure.figsize': (10, 6)}, font_scale=1.5)


### Define useful functions

In [5]:
def get_embedding_dataframe(model_asia_2500):
    '''Build a DataFrame of word vectors from a trained word2vec model.

    Parameters
    ----------
    model_asia_2500 : object
        Model exposing its keyed vectors as ``.wv``. The vocabulary is read
        from ``.wv.key_to_index`` when that attribute exists (gensim >= 4)
        and from ``.wv.vocab`` otherwise (gensim 3.x).

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary word (the word is the index) and one column
        per vector component.
    '''
    keyed_vectors = model_asia_2500.wv

    # gensim 4 replaced `wv.vocab` with `wv.key_to_index`; support both.
    vocabulary = getattr(keyed_vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab

    vectors = {}
    for word in vocabulary:
        # Index the keyed vectors, not the model: `model[word]` was
        # deprecated in gensim 3.x and removed in gensim 4.
        vectors[word] = keyed_vectors[word]
        # Progress report every 100k words for large vocabularies.
        if len(vectors) % 100000 == 0:
            print(len(vectors))

    embedding_a2500 = pd.DataFrame.from_dict(vectors, orient='index')
    # Announce completion only once the frame actually exists.
    print('Dataframe Done!')
    return embedding_a2500

def load_lexicon(filename):
    """
    Read one word list from Bing Liu's sentiment lexicon
    (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html).

    The file is decoded as Latin-1. Trailing whitespace is stripped from
    every line; lines that are then empty, or that start with ';'
    (comments), are skipped.

    Returns the remaining lines as a list of strings, in file order.
    """
    with open(filename, encoding='latin-1') as handle:
        stripped_lines = (raw.rstrip() for raw in handle)
        return [
            entry
            for entry in stripped_lines
            if entry and not entry.startswith(';')
        ]

def check_in_vocab(pos_words, embedding_a2500):
    '''Keep only the words that are present in the embedding's index.

    Parameters
    ----------
    pos_words : iterable of str
        Candidate words (positive or negative lexicon entries).
    embedding_a2500 : pandas.DataFrame
        Embedding whose index holds the vocabulary words.

    Returns
    -------
    list of str
        The words found in the index, in their original order. Each word
        that is missing is reported on stdout as '<word> not in list'.
    '''
    # Build the lookup once: re-creating a list of the whole index for every
    # word made this quadratic in practice for large vocabularies.
    vocabulary = set(embedding_a2500.index)

    checked_words = []
    for word in pos_words:
        if word in vocabulary:
            checked_words.append(word)
        else:
            print('{} not in list'.format(word))
    return checked_words

def _logistic_loss_name():
    '''Return the SGDClassifier loss name for logistic regression on the installed scikit-learn.'''
    import re
    import sklearn

    # 'log' was renamed to 'log_loss' in scikit-learn 1.1 and the old
    # spelling was removed in 1.3; older releases only know 'log'.
    match = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    if match and (int(match.group(1)), int(match.group(2))) >= (1, 1):
        return 'log_loss'
    return 'log'


def get_sa_accuracy(pos_words, neg_words, embedding_a2500):
    '''Train a logistic sentiment classifier on word vectors and return its test accuracy.

    Parameters
    ----------
    pos_words, neg_words : list of str
        Positive and negative lexicon words. Words missing from the
        embedding are discarded (and reported by ``check_in_vocab``).
    embedding_a2500 : pandas.DataFrame
        Embedding indexed by word, one row per word vector.

    Returns
    -------
    float
        Accuracy on a held-out 10% split (``random_state=0``), with
        positive words labelled 1 and negative words labelled -1.
    '''
    # Keep positive and negative words that are in the embedding
    pos_words = check_in_vocab(pos_words, embedding_a2500)
    neg_words = check_in_vocab(neg_words, embedding_a2500)
    print("After check: {} positive words, {} negative words".format(len(pos_words), len(neg_words)))

    # Get positive and negative vectors, dropping rows with missing values
    pos_vectors = embedding_a2500.loc[pos_words].dropna()
    neg_vectors = embedding_a2500.loc[neg_words].dropna()

    vectors = pd.concat([pos_vectors, neg_vectors])
    # Targets follow the concat order: positives first, then negatives.
    targets = np.array([1] * len(pos_vectors) + [-1] * len(neg_vectors))

    train_vectors, test_vectors, train_targets, test_targets = \
        train_test_split(vectors, targets, test_size=0.1, random_state=0)

    sentiment_classifier = SGDClassifier(loss=_logistic_loss_name(), random_state=0)
    sentiment_classifier.fit(train_vectors, train_targets)

    # accuracy_score expects (y_true, y_pred)
    predictions = sentiment_classifier.predict(test_vectors)
    accuracy = accuracy_score(test_targets, predictions)
    return accuracy

In [None]:
### Load the embedding to be evaluated

# Load the saved word2vec model from disk and turn its vectors into a
# DataFrame indexed by word.
model_black_2500 = Word2Vec.load("../wikipedia_corpus/modelos_vectores/debiased_black/wiki_cds_bw_2500.txt.model")
embedding_black_2500 = get_embedding_dataframe(model_black_2500)

# Positive / negative word lists used as the labelled data for the classifier.
pos_words = load_lexicon('data/positive-words.txt')
neg_words = load_lexicon('data/negative-words.txt')

# Train the sentiment classifier on this embedding and report its test accuracy.
accuracy_b25 = get_sa_accuracy(pos_words, neg_words, embedding_black_2500)
print('Debiased black 2500 accuracy: {}'.format(accuracy_b25))