In [1]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn
import re
import statsmodels.formula.api

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import gensim
from gensim.models import Word2Vec

In [2]:
# Configure how graphs will show up in this notebook
%matplotlib inline
seaborn.set_context('notebook', rc={'figure.figsize': (10, 6)}, font_scale=1.5)


### Define useful functions

In [3]:
def get_embedding_dataframe(model_asia_2500):
    '''Build a word-indexed DataFrame of embedding vectors from a word2vec model.

    Parameters
    ----------
    model_asia_2500 : object exposing ``.wv`` (e.g. a gensim Word2Vec model)
        The name is historical; any trained model whose ``.wv`` supports
        ``wv[word]`` and exposes its vocabulary works.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary word (the word is the index label) and one
        column per vector component. Empty if the vocabulary is empty.

    Prints a running count every 100000 words and 'Dataframe Done!' once all
    vectors have been collected.
    '''
    keyed_vectors = model_asia_2500.wv

    # gensim >= 4 exposes the vocabulary as ``key_to_index``; older releases
    # only have ``vocab``. Support both so the notebook survives an upgrade.
    vocabulary = getattr(keyed_vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab

    vectors = {}
    for word in vocabulary:
        # Look the vector up on ``.wv``: indexing the model object itself
        # (``model[word]``) is deprecated in gensim 3 and removed in gensim 4.
        vectors[word] = keyed_vectors[word]
        if len(vectors) % 100000 == 0:
            print(len(vectors))
    print('Dataframe Done!')

    embedding_a2500 = pd.DataFrame.from_dict(vectors, orient='index')
    return embedding_a2500

def load_lexicon(filename):
    """
    Read one word list from Bing Liu's sentiment lexicon
    (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html).

    The lexicon ships as two Latin-1 encoded files of English words, one
    with positive words and one with negative words. Each file mixes the
    entries with blank lines and comment lines that begin with ';'.

    Returns the entries in file order as a list of strings, with trailing
    whitespace removed and the blank/comment lines left out.
    """
    with open(filename, encoding='latin-1') as handle:
        # Strip only the right-hand side, then filter on the stripped text.
        stripped_lines = (raw.rstrip() for raw in handle)
        return [
            entry
            for entry in stripped_lines
            if entry and not entry.startswith(';')
        ]

def check_in_vocab(pos_words, embedding_a2500):
    '''Keep only the words that are present in the embedding's index.

    Parameters
    ----------
    pos_words : iterable of str
        Candidate words (positive or negative; the name is historical).
    embedding_a2500 : pandas.DataFrame
        Embedding table whose index labels are the vocabulary words.

    Returns
    -------
    list of str
        The words found in the index, in their original order (duplicates
        are kept). Each word that is missing is reported on stdout as
        '<word> not in list'.
    '''
    # Build the lookup once. The previous version rebuilt a list of the whole
    # index for every word, i.e. O(len(words) * len(vocabulary)) work.
    known_words = set(embedding_a2500.index)

    checked_words = []
    for word in pos_words:
        if word in known_words:
            checked_words.append(word)
        else:
            print('{} not in list'.format(word))
    return checked_words

def _logistic_loss_name():
    '''Return the SGDClassifier loss name for logistic regression that the installed scikit-learn accepts.'''
    import sklearn

    version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    # 'log' was renamed to 'log_loss' in scikit-learn 1.1 and the old
    # spelling was removed in 1.3; older releases only know 'log'.
    if version and (int(version.group(1)), int(version.group(2))) >= (1, 1):
        return 'log_loss'
    return 'log'


def get_sa_accuracy(pos_words, neg_words, embedding_a2500):
    '''Train a logistic sentiment classifier on word vectors and return its held-out accuracy.

    Parameters
    ----------
    pos_words, neg_words : list of str
        Positive and negative lexicon words. Words that are not in the
        embedding's index are dropped (and reported by ``check_in_vocab``).
    embedding_a2500 : pandas.DataFrame
        Embedding table indexed by word, one column per vector component.

    Returns
    -------
    float
        Accuracy on a fixed 10% test split (``random_state=0``) of an
        ``SGDClassifier`` with logistic loss, where positive words are
        labelled 1 and negative words -1.
    '''
    # Keep positive and negative words that are in the embedding
    pos_words = check_in_vocab(pos_words, embedding_a2500)
    neg_words = check_in_vocab(neg_words, embedding_a2500)
    print("After check: {} positive words, {} negative words".format(len(pos_words), len(neg_words)))

    # Get positive and negative vectors; rows containing NaN are discarded
    pos_vectors = embedding_a2500.loc[pos_words].dropna()
    neg_vectors = embedding_a2500.loc[neg_words].dropna()

    vectors = pd.concat([pos_vectors, neg_vectors])
    # Targets are built after dropna() so they stay aligned with the rows kept.
    targets = np.array([1] * len(pos_vectors) + [-1] * len(neg_vectors))

    # The split depends only on the sample count and random_state, so dropping
    # the previously unused word-label array leaves the partition unchanged.
    train_vectors, test_vectors, train_targets, test_targets = \
        train_test_split(vectors, targets, test_size=0.1, random_state=0)

    sentiment_classifier = SGDClassifier(loss=_logistic_loss_name(), random_state=0)
    sentiment_classifier.fit(train_vectors, train_targets)

    # accuracy_score expects (y_true, y_pred) in that order.
    predictions = sentiment_classifier.predict(test_vectors)
    accuracy = accuracy_score(test_targets, predictions)
    return accuracy

In [5]:
### Get all embeddings to be evaluated

# Load the saved Word2Vec model from disk and flatten it into a DataFrame
# indexed by word (one row per vocabulary entry).
model_hispanic = Word2Vec.load("../wikipedia_corpus/modelos_vectores/debiased_hispanic/cds/wiki_cds_hw.txt.model")
embedding_hispanic = get_embedding_dataframe(model_hispanic)

# Read the positive and negative word lists; paths are relative to the
# notebook's working directory.
pos_words = load_lexicon('data/positive-words.txt')
neg_words = load_lexicon('data/negative-words.txt')

# Train the sentiment classifier on this embedding and report its accuracy
# on the held-out split.
accuracy_h = get_sa_accuracy(pos_words, neg_words, embedding_hispanic)
print('Debiased hispanic accuracy: {}'.format(accuracy_h))

  import sys


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370000
1380000
1390

328 checked words
329 checked words
330 checked words
331 checked words
332 checked words
333 checked words
334 checked words
335 checked words
convienient not in list
336 checked words
337 checked words
338 checked words
339 checked words
340 checked words
341 checked words
342 checked words
343 checked words
344 checked words
345 checked words
cost-effective not in list
cost-saving not in list
counter-attack not in list
counter-attacks not in list
346 checked words
347 checked words
348 checked words
349 checked words
350 checked words
351 checked words
352 checked words
353 checked words
354 checked words
355 checked words
356 checked words
357 checked words
358 checked words
359 checked words
cure-all not in list
360 checked words
361 checked words
362 checked words
363 checked words
364 checked words
365 checked words
366 checked words
367 checked words
368 checked words
369 checked words
370 checked words
371 checked words
372 checked words
373 checked words
dead-cheap not in lis

734 checked words
735 checked words
736 checked words
737 checked words
738 checked words
739 checked words
740 checked words
741 checked words
742 checked words
743 checked words
744 checked words
745 checked words
746 checked words
747 checked words
748 checked words
749 checked words
750 checked words
751 checked words
752 checked words
753 checked words
754 checked words
755 checked words
756 checked words
757 checked words
758 checked words
759 checked words
760 checked words
761 checked words
762 checked words
763 checked words
764 checked words
765 checked words
766 checked words
767 checked words
god-given not in list
god-send not in list
768 checked words
769 checked words
770 checked words
771 checked words
772 checked words
773 checked words
774 checked words
775 checked words
776 checked words
777 checked words
778 checked words
779 checked words
780 checked words
781 checked words
782 checked words
783 checked words
784 checked words
785 checked words
786 checked words
787

1142 checked words
1143 checked words
1144 checked words
1145 checked words
1146 checked words
1147 checked words
1148 checked words
1149 checked words
1150 checked words
1151 checked words
1152 checked words
1153 checked words
1154 checked words
1155 checked words
1156 checked words
1157 checked words
1158 checked words
1159 checked words
1160 checked words
1161 checked words
pain-free not in list
1162 checked words
1163 checked words
1164 checked words
1165 checked words
1166 checked words
pamperedly not in list
pamperedness not in list
1167 checked words
1168 checked words
1169 checked words
1170 checked words
1171 checked words
1172 checked words
1173 checked words
1174 checked words
1175 checked words
1176 checked words
1177 checked words
1178 checked words
1179 checked words
1180 checked words
1181 checked words
1182 checked words
1183 checked words
1184 checked words
1185 checked words
1186 checked words
1187 checked words
1188 checked words
1189 checked words
1190 checked words

1544 checked words
1545 checked words
1546 checked words
1547 checked words
1548 checked words
1549 checked words
1550 checked words
1551 checked words
1552 checked words
1553 checked words
1554 checked words
1555 checked words
1556 checked words
1557 checked words
1558 checked words
1559 checked words
1560 checked words
1561 checked words
1562 checked words
1563 checked words
state-of-the-art not in list
1564 checked words
1565 checked words
1566 checked words
1567 checked words
1568 checked words
1569 checked words
1570 checked words
1571 checked words
1572 checked words
1573 checked words
1574 checked words
1575 checked words
stellarly not in list
1576 checked words
1577 checked words
1578 checked words
1579 checked words
1580 checked words
1581 checked words
1582 checked words
1583 checked words
1584 checked words
1585 checked words
1586 checked words
1587 checked words
1588 checked words
1589 checked words
1590 checked words
1591 checked words
1592 checked words
1593 checked words

69 checked words
70 checked words
71 checked words
72 checked words
73 checked words
74 checked words
75 checked words
76 checked words
77 checked words
78 checked words
aggrivation not in list
79 checked words
80 checked words
81 checked words
82 checked words
83 checked words
84 checked words
85 checked words
86 checked words
87 checked words
88 checked words
89 checked words
90 checked words
91 checked words
92 checked words
93 checked words
94 checked words
95 checked words
96 checked words
97 checked words
98 checked words
99 checked words
100 checked words
101 checked words
102 checked words
103 checked words
104 checked words
105 checked words
106 checked words
107 checked words
108 checked words
109 checked words
110 checked words
111 checked words
112 checked words
113 checked words
114 checked words
115 checked words
116 checked words
117 checked words
118 checked words
angriness not in list
119 checked words
120 checked words
121 checked words
122 checked words
123 checked w

484 checked words
485 checked words
486 checked words
cash-strapped not in list
487 checked words
488 checked words
489 checked words
490 checked words
cataclysmal not in list
491 checked words
492 checked words
493 checked words
494 checked words
495 checked words
catastrophically not in list
catastrophies not in list
496 checked words
497 checked words
498 checked words
499 checked words
500 checked words
501 checked words
502 checked words
503 checked words
504 checked words
505 checked words
506 checked words
507 checked words
508 checked words
509 checked words
510 checked words
511 checked words
512 checked words
513 checked words
514 checked words
515 checked words
516 checked words
517 checked words
518 checked words
519 checked words
520 checked words
521 checked words
522 checked words
523 checked words
524 checked words
525 checked words
526 checked words
527 checked words
528 checked words
529 checked words
530 checked words
531 checked words
532 checked words
533 checked w

918 checked words
919 checked words
920 checked words
921 checked words
922 checked words
923 checked words
924 checked words
925 checked words
926 checked words
927 checked words
928 checked words
929 checked words
930 checked words
desititute not in list
931 checked words
932 checked words
933 checked words
934 checked words
935 checked words
936 checked words
937 checked words
938 checked words
939 checked words
940 checked words
941 checked words
942 checked words
943 checked words
944 checked words
945 checked words
946 checked words
947 checked words
948 checked words
949 checked words
950 checked words
951 checked words
952 checked words
953 checked words
destains not in list
954 checked words
955 checked words
956 checked words
957 checked words
958 checked words
959 checked words
960 checked words
961 checked words
962 checked words
963 checked words
964 checked words
965 checked words
966 checked words
967 checked words
detestably not in list
968 checked words
969 checked wor

1315 checked words
1316 checked words
1317 checked words
1318 checked words
1319 checked words
1320 checked words
1321 checked words
1322 checked words
1323 checked words
1324 checked words
1325 checked words
1326 checked words
1327 checked words
1328 checked words
1329 checked words
1330 checked words
1331 checked words
1332 checked words
1333 checked words
1334 checked words
1335 checked words
1336 checked words
1337 checked words
1338 checked words
1339 checked words
1340 checked words
1341 checked words
1342 checked words
1343 checked words
1344 checked words
1345 checked words
1346 checked words
1347 checked words
1348 checked words
1349 checked words
1350 checked words
1351 checked words
election-rigger not in list
1352 checked words
1353 checked words
1354 checked words
1355 checked words
1356 checked words
1357 checked words
1358 checked words
1359 checked words
1360 checked words
1361 checked words
1362 checked words
1363 checked words
1364 checked words
1365 checked words
136

1720 checked words
1721 checked words
1722 checked words
1723 checked words
1724 checked words
1725 checked words
1726 checked words
1727 checked words
get-rich not in list
1728 checked words
1729 checked words
1730 checked words
1731 checked words
1732 checked words
1733 checked words
1734 checked words
1735 checked words
1736 checked words
gimmicking not in list
1737 checked words
1738 checked words
1739 checked words
1740 checked words
1741 checked words
1742 checked words
1743 checked words
1744 checked words
1745 checked words
1746 checked words
1747 checked words
1748 checked words
1749 checked words
1750 checked words
1751 checked words
1752 checked words
1753 checked words
god-awful not in list
1754 checked words
1755 checked words
1756 checked words
1757 checked words
1758 checked words
1759 checked words
1760 checked words
1761 checked words
1762 checked words
1763 checked words
1764 checked words
1765 checked words
1766 checked words
1767 checked words
1768 checked words
176

2105 checked words
incapably not in list
2106 checked words
2107 checked words
2108 checked words
2109 checked words
2110 checked words
2111 checked words
2112 checked words
2113 checked words
2114 checked words
2115 checked words
2116 checked words
2117 checked words
2118 checked words
2119 checked words
2120 checked words
2121 checked words
incompatability not in list
2122 checked words
2123 checked words
2124 checked words
2125 checked words
2126 checked words
2127 checked words
2128 checked words
incomprehensible not in list
2129 checked words
2130 checked words
2131 checked words
2132 checked words
2133 checked words
2134 checked words
2135 checked words
inconsequentially not in list
inconsequently not in list
2136 checked words
2137 checked words
2138 checked words
2139 checked words
2140 checked words
2141 checked words
2142 checked words
2143 checked words
2144 checked words
2145 checked words
2146 checked words
2147 checked words
2148 checked words
2149 checked words
2150 chec

2499 checked words
2500 checked words
2501 checked words
2502 checked words
2503 checked words
2504 checked words
2505 checked words
2506 checked words
2507 checked words
2508 checked words
2509 checked words
2510 checked words
2511 checked words
2512 checked words
left-leaning not in list
2513 checked words
2514 checked words
less-developed not in list
lesser-known not in list
2515 checked words
2516 checked words
2517 checked words
2518 checked words
2519 checked words
2520 checked words
2521 checked words
2522 checked words
2523 checked words
2524 checked words
2525 checked words
2526 checked words
2527 checked words
2528 checked words
2529 checked words
2530 checked words
2531 checked words
2532 checked words
life-threatening not in list
2533 checked words
2534 checked words
2535 checked words
2536 checked words
2537 checked words
2538 checked words
2539 checked words
2540 checked words
2541 checked words
little-known not in list
2542 checked words
lividly not in list
2543 checked 

2900 checked words
2901 checked words
2902 checked words
2903 checked words
2904 checked words
2905 checked words
2906 checked words
2907 checked words
one-sided not in list
2908 checked words
onerously not in list
2909 checked words
2910 checked words
2911 checked words
2912 checked words
2913 checked words
2914 checked words
2915 checked words
2916 checked words
2917 checked words
2918 checked words
2919 checked words
2920 checked words
2921 checked words
2922 checked words
2923 checked words
2924 checked words
2925 checked words
2926 checked words
2927 checked words
2928 checked words
2929 checked words
2930 checked words
2931 checked words
2932 checked words
2933 checked words
2934 checked words
2935 checked words
2936 checked words
2937 checked words
2938 checked words
over-acted not in list
over-awe not in list
over-balanced not in list
over-hyped not in list
over-priced not in list
over-valuation not in list
2939 checked words
2940 checked words
2941 checked words
2942 checked w

3310 checked words
3311 checked words
3312 checked words
3313 checked words
3314 checked words
3315 checked words
repugn not in list
3316 checked words
3317 checked words
repugnantly not in list
3318 checked words
3319 checked words
3320 checked words
3321 checked words
3322 checked words
3323 checked words
3324 checked words
3325 checked words
3326 checked words
3327 checked words
3328 checked words
3329 checked words
3330 checked words
3331 checked words
3332 checked words
3333 checked words
3334 checked words
3335 checked words
3336 checked words
3337 checked words
3338 checked words
3339 checked words
3340 checked words
retardedness not in list
3341 checked words
3342 checked words
3343 checked words
3344 checked words
3345 checked words
3346 checked words
3347 checked words
revengefully not in list
3348 checked words
3349 checked words
3350 checked words
3351 checked words
3352 checked words
3353 checked words
3354 checked words
3355 checked words
revulsive not in list
3356 checke

3693 checked words
3694 checked words
3695 checked words
3696 checked words
3697 checked words
3698 checked words
3699 checked words
3700 checked words
3701 checked words
3702 checked words
3703 checked words
3704 checked words
3705 checked words
3706 checked words
3707 checked words
3708 checked words
spoilages not in list
3709 checked words
spoilled not in list
3710 checked words
3711 checked words
3712 checked words
3713 checked words
3714 checked words
3715 checked words
spoon-fed not in list
spoon-feed not in list
3716 checked words
3717 checked words
3718 checked words
3719 checked words
3720 checked words
3721 checked words
3722 checked words
3723 checked words
3724 checked words
3725 checked words
3726 checked words
3727 checked words
3728 checked words
3729 checked words
3730 checked words
3731 checked words
3732 checked words
3733 checked words
3734 checked words
3735 checked words
3736 checked words
3737 checked words
3738 checked words
3739 checked words
3740 checked words


4100 checked words
uncomfy not in list
4101 checked words
4102 checked words
uncompromisingly not in list
4103 checked words
unconstitutional not in list
4104 checked words
4105 checked words
4106 checked words
4107 checked words
4108 checked words
4109 checked words
4110 checked words
4111 checked words
undependability not in list
4112 checked words
4113 checked words
4114 checked words
4115 checked words
4116 checked words
4117 checked words
4118 checked words
4119 checked words
4120 checked words
4121 checked words
4122 checked words
4123 checked words
4124 checked words
4125 checked words
4126 checked words
4127 checked words
4128 checked words
4129 checked words
4130 checked words
4131 checked words
4132 checked words
4133 checked words
4134 checked words
4135 checked words
4136 checked words
4137 checked words
4138 checked words
4139 checked words
4140 checked words
4141 checked words
4142 checked words
4143 checked words
4144 checked words
4145 checked words
4146 checked words
4