# Word2Vec

Word2vec produces one vector per word, whereas BoW produces one number (a wordcount). Word2vec is great for digging into documents and identifying content and subsets of content.

In [3]:
import pandas as pd

articles = pd.read_excel("data/OpArticles.xlsx")

In [None]:
!pip install gensim

In [6]:
import re
documents = [re.sub("[^a-zA-Z]", " ", body.lower()) for body in articles["body"]]

## Pre-trained embeddings


In [1]:
from gensim.models import KeyedVectors

model_pt = KeyedVectors.load_word2vec_format('./word_vectors/skip_s100.txt')

In [2]:
model_pt.most_similar(positive=["cão"])

[('cachorro', 0.8545767068862915),
 ('gato', 0.8402905464172363),
 ('monstro', 0.8336718082427979),
 ('pássaro', 0.8299859166145325),
 ('ogro', 0.8261020183563232),
 ('gorila', 0.819529116153717),
 ('furão', 0.813050389289856),
 ('cãozinho', 0.8127977848052979),
 ('felino', 0.8087738752365112),
 ('filhote', 0.8069430589675903)]

In [19]:
model_pt.most_similar(positive=["rei", "mulher"], negative=["homem"])

[('rainha-consorte', 0.7912216186523438),
 ('primogénita', 0.7738461494445801),
 ('imperatriz-mãe', 0.7646884322166443),
 ('paleóloga', 0.752788245677948),
 ('dama-de-companhia', 0.7478024363517761),
 ('consorte', 0.7475903630256653),
 ('princesa-eleitora', 0.7472771406173706),
 ('piroska', 0.7468665838241577),
 ('ulrica', 0.7454056143760681),
 ('ranavalona', 0.7441917657852173)]

### Loading text spans dataset

In [4]:
dataset = pd.read_excel('data/OpArticles_ADUs.xlsx')

## Cleanup

In [23]:
import pt_core_news_sm
nlp = pt_core_news_sm.load()
corpus = []

for i in range(0, dataset['tokens'].size):
    # get review, remove non alpha chars and convert to lower-case
    review = re.sub('[^a-zA-Z]', ' ', dataset['tokens'][i]).lower()
    review = ' '.join([word.lemma_.lower().strip() + word.pos_ for word in nlp(review)])
    # add review to corpus
    corpus.append(review)

## Fixing the length of the input
The reviews in our corpus have variable length. However, we need to represent them with a fixed-length vector of features. One way to do it is to impose a limit on the number of word embeddings we want to include.

To convert words into their vector representations (embeddings), let's create an auxiliary function that takes in the number of embeddings we wish to include in the representation:

In [21]:
import numpy as np

def text_to_vector(embeddings, text, sequence_len):
    
    # split text into tokens
    tokens = text.split()
    
    # convert tokens to embedding vectors, up to sequence_len tokens
    vec = []
    n = 0
    i = 0
    while i < len(tokens) and n < sequence_len:   # while there are tokens and did not reach desired sequence length
        try:
            vec.extend(embeddings.get_vector(tokens[i]))
            n += 1
        except KeyError:
            True   # simply ignore out-of-vocabulary tokens
        finally:
            i += 1
    
    # add blanks up to sequence_len, if needed
    for j in range(sequence_len - n):
        vec.extend(np.zeros(embeddings.vector_size,))
    
    return vec

In [22]:
import numpy as np
from scipy import stats

lens = [len(c.split()) for c in corpus]
print(np.min(lens), np.max(lens), np.mean(lens), np.std(lens), stats.mode(lens))

1 92 16.132174640148122 10.750637692983176 ModeResult(mode=array([10]), count=array([916]))


The average length of the text spans is 16.1 tokens with standard deviation of 10.75.

In [16]:
# convert corpus into dataset with appended embeddings representation
embeddings_corpus = []
for c in corpus:
    embeddings_corpus.append(text_to_vector(model_pt, c, 20))

X = np.array(embeddings_corpus)
y = dataset['label']

print(X.shape, y.shape)

(16743, 2000) (16743,)


In [18]:
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0, stratify=y)
max_iterations = 10**6/np.ceil(X_train.shape[0] / 10) #Empirically,the model converges after 10^6/n where n 
                                        # the size of the test set

sgd_model = SGDClassifier(penalty="l2", 
                          max_iter=np.ceil(max_iterations/10), average=True)
sgd_gs = GridSearchCV(sgd_model, n_jobs=-1, param_grid={}, cv=10, scoring="accuracy", verbose = 1)
sgd_gs.fit(X_train, y_train)
y_pred = sgd_gs.predict(X_test)

print("\nConfusion matrix:\n", metrics.confusion_matrix(y_test, y_pred))
print("Classification report:\n", metrics.classification_report(y_test, y_pred))

Fitting 10 folds for each of 1 candidates, totalling 10 fits

Confusion matrix:
 [[   0    0  733    0    0]
 [   0    0  133    0    0]
 [   0    0 1621    0    0]
 [   0    0  282    0    0]
 [   0    0  580    0    0]]
Classification report:
               precision    recall  f1-score   support

        Fact       0.00      0.00      0.00       733
      Policy       0.00      0.00      0.00       133
       Value       0.48      1.00      0.65      1621
    Value(+)       0.00      0.00      0.00       282
    Value(-)       0.00      0.00      0.00       580

    accuracy                           0.48      3349
   macro avg       0.10      0.20      0.13      3349
weighted avg       0.23      0.48      0.32      3349



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
