In [1]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn
import re
import statsmodels.formula.api

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import gensim
from gensim.models import Word2Vec

In [3]:
# Configure how graphs will show up in this notebook
%matplotlib inline
seaborn.set_context('notebook', rc={'figure.figsize': (10, 6)}, font_scale=1.5)


### Define useful functions

In [5]:
def get_embedding_dataframe(model_asia_2500):
    '''Build a DataFrame of word vectors from a word2vec model.

    Parameters
    ----------
    model_asia_2500 : word2vec-style model
        Any object that exposes its keyed vectors as ``.wv``. Despite the
        parameter name, nothing here is specific to one particular model.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary word (the word is the index label) holding
        that word's vector.
    '''
    keyed_vectors = model_asia_2500.wv

    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases
    # call it `vocab`. Prefer the new attribute and fall back to the old one
    # so the function works with either.
    vocabulary = getattr(keyed_vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab

    vectors = {}
    for word in vocabulary:
        # Index the keyed vectors rather than the model itself: `model[word]`
        # is deprecated in gensim 3.x and removed in gensim 4.
        vectors[word] = keyed_vectors[word]
        # Progress report for large vocabularies
        if len(vectors) % 100000 == 0:
            print(len(vectors))
    print('Dataframe Done!')

    embedding_a2500 = pd.DataFrame.from_dict(vectors, orient='index')
    return embedding_a2500

def load_lexicon(filename):
    """
    Read one word list from Bing Liu's sentiment lexicon
    (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html).

    The file is decoded as Latin-1. Trailing whitespace is removed from
    every line; lines that are then empty, and comment lines beginning
    with ';', are left out. The remaining lines are returned as a list
    of words in file order.
    """
    with open(filename, encoding='latin-1') as handle:
        stripped_lines = (raw_line.rstrip() for raw_line in handle)
        return [
            entry
            for entry in stripped_lines
            if entry and not entry.startswith(';')
        ]

def check_in_vocab(pos_words, embedding_a2500):
    '''Keep only the words that appear in the embedding's vocabulary.

    Parameters
    ----------
    pos_words : iterable of str
        Candidate words (positive or negative, despite the parameter name).
    embedding_a2500 : pandas.DataFrame
        Embedding whose index labels are the vocabulary words.

    Returns
    -------
    list of str
        The candidates found in the index, in their original order. Every
        candidate that is missing is reported on stdout.
    '''
    # Build the lookup once. Converting the index to a list for every word
    # made each membership test a full scan of the whole vocabulary.
    vocabulary = set(embedding_a2500.index)

    checked_words = []
    for word in pos_words:
        if word in vocabulary:
            checked_words.append(word)
        else:
            print('{} not in list'.format(word))
    return checked_words

def get_sa_accuracy(pos_words, neg_words, embedding_a2500):
    '''Score an embedding by how well it supports word-level sentiment classification.

    An SGD-trained classifier with logistic loss is fitted on the vectors of
    the lexicon words (target 1 for positive, -1 for negative) and evaluated
    on a held-out 10% split.

    Parameters
    ----------
    pos_words, neg_words : list of str
        Positive and negative lexicon words.
    embedding_a2500 : pandas.DataFrame
        Embedding with one row per word, indexed by the word.

    Returns
    -------
    float
        Accuracy of the classifier on the held-out split.
    '''
    # Keep positive and negative words that are in the embedding
    pos_words = check_in_vocab(pos_words, embedding_a2500)
    neg_words = check_in_vocab(neg_words, embedding_a2500)
    print("After check: {} positive words, {} negative words".format(len(pos_words), len(neg_words)))

    # Get positive and negative vectors; rows containing NaN are discarded
    pos_vectors = embedding_a2500.loc[pos_words].dropna()
    neg_vectors = embedding_a2500.loc[neg_words].dropna()

    vectors = pd.concat([pos_vectors, neg_vectors])
    targets = np.array([1] * len(pos_vectors) + [-1] * len(neg_vectors))

    # Fixed random_state keeps the 90/10 split reproducible between runs.
    train_vectors, test_vectors, train_targets, test_targets = train_test_split(
        vectors, targets, test_size=0.1, random_state=0)

    # NOTE(review): recent scikit-learn releases rename this loss to
    # 'log_loss'; update the string when the environment is upgraded.
    sentiment_classifier = SGDClassifier(loss='log', random_state=0)
    sentiment_classifier.fit(train_vectors, train_targets)

    # accuracy_score expects (y_true, y_pred), in that order.
    predictions = sentiment_classifier.predict(test_vectors)
    accuracy = accuracy_score(test_targets, predictions)
    return accuracy

In [6]:
### Load the embedding to be evaluated and score it

# Trained word2vec model -> DataFrame with one row per vocabulary word
model_black_2500 = Word2Vec.load(
    "../wikipedia_corpus/modelos_vectores/debiased_black/wiki_cds_bw_2500.txt.model"
)
embedding_black_2500 = get_embedding_dataframe(model_black_2500)

# Positive and negative word lists of the sentiment lexicon
pos_words, neg_words = (
    load_lexicon('data/positive-words.txt'),
    load_lexicon('data/negative-words.txt'),
)

# Accuracy of the sentiment classifier trained on this embedding
accuracy_b25 = get_sa_accuracy(pos_words, neg_words, embedding_black_2500)
print('Debiased black 2500 accuracy: {}'.format(accuracy_b25))

  import sys


100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
Dataframe Done!
a+ not in list
accessable not in list
accomodative not in list
achievible not in list
afordable not in list
all-around not in list
amiabily not in list
awsome not in list
beautifullly not in list
believeable not in list
benifits not in list
best-known not in list
best-performing not in list
best-selling not in list
better-known not in list
better-than-expected not in list
beutifully not in list
brand-new not in list
cashbacks not in list
clear-cut not in list
convienient not in list
cost-effective not in list
cost-saving not in list
counter-attack not in list
counter-attacks not in list
cure-all not in list
dead-cheap not in list
dead-on not in list
deginified not in list
dirt-cheap not in list
dummy-proof not in list
easy-to-use not in list
ecenomical not in list
energy-efficient not in li

less-developed not in list
lesser-known not in list
life-threatening not in list
little-known not in list
lividly not in list
long-time not in list
long-winded not in list
low-rated not in list
lunaticism not in list
martyrdom-seeking not in list
melodramatically not in list
misbecome not in list
misbecoming not in list
miserableness not in list
misrepresentation not in list
mistified not in list
mistrustfully not in list
misunderstanding not in list
misunderstandings not in list
multi-polarization not in list
muscle-flexing not in list
non-confidence not in list
one-sided not in list
onerously not in list
over-acted not in list
over-awe not in list
over-balanced not in list
over-hyped not in list
over-priced not in list
over-valuation not in list
overpayed not in list
oversimplification not in list
overzelous not in list
paralize not in list
perfidity not in list
polemize not in list
polution not in list
quarrellous not in list
quarrellously not in list
rantingly not in list
regreted 