# Word2Vec model with Logistic Regression

In [42]:
import os
import io
import spacy
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

from NLPex3 import *
from utils import *

In [12]:
# Locations of the input files, relative to the notebook's working directory.
# Dialogue file parsed by extract_dataset_as_text to build the training data.
PATH_TO_TRAINING_SET = 'data/train_both_original.txt'
# Validation counterpart of the training file.
PATH_TO_VALIDATION_SET = 'data/valid_both_original.txt'
# Text file of pre-trained word vectors, read by PreTrainedWord2Vec.load_wordvec.
PATH_TO_WEIGHTS = 'data/crawl-300d-200k.vec'

In [9]:
# Parse the training file into five parallel collections. Only `utterances`
# and `answers` are used further down, both indexed as [dialogue][utterance].
# extract_dataset_as_text comes from one of the star imports (NLPex3 / utils).
# NOTE(review): the meaning of the second positional argument (True) is not
# visible from this notebook - confirm against the helper's signature.
my_personae, other_personae, line_indices, utterances, answers = extract_dataset_as_text(PATH_TO_TRAINING_SET, True)

Loaded 17878 dialogues


In [10]:
class PreTrainedWord2Vec:
    """Sentence encoder built on pre-trained word vectors read from a text file.

    A sentence embedding is the mean of the vectors of its "important" words,
    selected through the part-of-speech tags produced by a spaCy pipeline.

    Attributes are documented where they are assigned in ``__init__``.
    """

    # Part-of-speech tags of the tokens that contribute to a sentence embedding.
    KEPT_POS = frozenset({'ADJ', 'VERB', 'NOUN', 'PROPN', 'INTJ'})

    def __init__(self, fname: str, nmax=150000, parser='en'):
        """
        Load the word vectors and the spaCy pipeline.

        :param fname: path of the text file holding the word vectors.
        :param nmax: maximum number of data lines read from ``fname``.
        :param parser: name of the spaCy pipeline handed to ``spacy.load``.
        """
        # word -> vector mapping, filled by load_wordvec.
        self.word2vec = {}
        # Placeholder only: load_wordvec stores the real vector size. It must be
        # set *before* the call so the loaded value is not overwritten.
        self.dimension = -1
        self.load_wordvec(fname, nmax)
        # Bidirectional word <-> integer id lookup, in file order.
        self.word2id = {w: i for i, w in enumerate(self.word2vec.keys())}
        self.id2word = {v: k for k, v in self.word2id.items()}
        # All vectors stacked in id order.
        self.embeddings = np.array(list(self.word2vec.values()))
        self.parser = spacy.load(parser)

    def load_wordvec(self, fname: str, nmax: int):
        """
        Load the word vectors of the given file into ``self.word2vec`` and set ``self.dimension``.

        The first line of the file is skipped unconditionally (presumably the
        ``<count> <dim>`` header of the .vec format - confirm for other files).
        Every following line is expected to be ``word v1 v2 ... vn``.

        :param fname: path of the UTF-8 text file to read.
        :param nmax: stop after this many lines following the header.
        :raises ValueError: if the file contains no word vector at all.
        """
        self.word2vec = {}
        vector = None
        with io.open(fname, encoding='utf-8') as file:
            next(file, None)  # skip the first line; tolerate an empty file
            for i, line in enumerate(file):
                if line.strip():  # ignore blank lines (e.g. trailing newline)
                    word, vec = line.rstrip().split(' ', 1)
                    vector = np.fromstring(vec, sep=' ')
                    self.word2vec[word] = vector
                if i == (nmax - 1):
                    break
        if vector is None:
            raise ValueError('No word vector found in %s' % fname)
        # As before, the size is taken from the last vector that was read.
        self.dimension = vector.shape[0]
        print('Loaded %s pretrained word vectors of dimension %s' % (len(self.word2vec),  self.dimension))

    @staticmethod
    def sentence_treated(sentence: str) -> str:
        """
        Normalise a sentence before parsing.

        Lower-cases the text, expands common English contractions, turns
        hyphens into spaces and surrounds '!', '.' and ':' with spaces.
        The replacements are plain substring substitutions applied in order,
        so e.g. a possessive "'s" also becomes " is".
        """
        sentence = sentence.lower()
        sentence = sentence.replace("'s", ' is')
        sentence = sentence.replace("n't", ' not')
        sentence = sentence.replace("'re", ' are')
        sentence = sentence.replace("'m", ' am')
        sentence = sentence.replace("'ve", ' have')
        sentence = sentence.replace("'ll", ' will')
        sentence = sentence.replace("'d", ' would')
        sentence = sentence.replace("-", ' ')
        sentence = sentence.replace("!", ' ! ')
        sentence = sentence.replace(".", ' . ')
        sentence = sentence.replace(":", ' : ')
        return sentence

    def encode_parse(self, sentences: list) -> np.array:
        """
        Take a list of sentences and return a numpy array of sentence embeddings (one row per sentence).

        Each sentence is normalised with ``sentence_treated`` and parsed; only
        tokens tagged as adjective, verb, common or proper noun, or
        interjection are kept. The embedding is the mean of the vectors of the
        kept words. Unknown words are ignored; if no kept word of a sentence is
        known, the sentence gets a random vector drawn uniformly from
        [-0.1, 0.1) with ``self.dimension`` components.
        """
        sentences_embedded = []
        for sent in sentences:
            tokens = self.parser(self.sentence_treated(sent))
            # Vectors of the important words that exist in the vocabulary.
            words_embedded = [
                self.word2vec[str(token)]
                for token in tokens
                if token.pos_ in self.KEPT_POS and str(token) in self.word2vec
            ]
            if words_embedded:
                sentences_embedded.append(np.mean(words_embedded, axis=0))
            else:
                # Same size as the real embeddings so the rows stay stackable.
                sentences_embedded.append(0.2 * np.random.rand(self.dimension) - 0.1)
        return np.array(sentences_embedded)

In [14]:
# Sentence encoder: reads at most 150000 vectors (the nmax argument) from
# PATH_TO_WEIGHTS and loads the default spaCy pipeline for POS filtering.
w2v = PreTrainedWord2Vec(PATH_TO_WEIGHTS, 150000)
# Classifier that is fitted further down on the (utterance, answer) feature rows.
clf = LogisticRegression(C=1, solver='liblinear', multi_class='ovr')

Loaded 150000 pretrained word vectors of dimension 300


In [56]:
def get_training_matrices(utterances, answers, is_training=True, nb_neg_examples=3):
    """
    Build (or reload from disk) the feature matrix and label vector for the classifier.

    For every utterance, the first ``nb_neg_examples + 1`` candidate answers are
    encoded with the module-level ``w2v`` encoder. Each candidate yields one row
    made of the utterance embedding followed by the candidate embedding; the
    row of the first candidate is labelled 1 and the others 0.
    NOTE(review): this assumes the first candidate is the true answer - confirm
    against the helper that produces ``answers``.

    Caching: if the pair of .npy files for the requested split already exists
    under ``data/``, it is returned as-is and ``utterances``, ``answers`` and
    ``nb_neg_examples`` are ignored. Delete the files to force a rebuild.

    :param utterances: per-dialogue lists of utterance strings.
    :param answers: per-dialogue, per-utterance lists of candidate answer strings.
    :param is_training: selects the ``*_tr.npy`` (True) or ``*_val.npy`` (False) cache files.
    :param nb_neg_examples: number of candidates kept after the first one.
    :return: tuple ``(X, Y)``; both are empty arrays when no row was produced.
    """
    if is_training:
        x_path, y_path = 'data/X_tr.npy', 'data/Y_tr.npy'
    else:
        x_path, y_path = 'data/X_val.npy', 'data/Y_val.npy'

    if os.path.exists(x_path) and os.path.exists(y_path):
        return np.load(x_path), np.load(y_path)

    # Collect per-utterance blocks and stack them once at the end: stacking
    # inside the loop would copy the whole matrix at every iteration.
    feature_blocks = []
    label_blocks = []
    for dialogue_utterances, dialogue_answers in zip(utterances, answers):
        for utterance, candidates in zip(dialogue_utterances, dialogue_answers):
            kept_candidates = candidates[:nb_neg_examples + 1]
            if len(kept_candidates) == 0:
                continue  # nothing to pair this utterance with
            x_utter = w2v.encode_parse([utterance])
            x_answer = w2v.encode_parse(kept_candidates)
            # Repeat the single utterance row so it lines up with every candidate.
            x_context = np.repeat(x_utter, len(x_answer), axis=0)
            feature_blocks.append(np.hstack((x_context, x_answer)))
            y = np.zeros(len(x_answer))
            y[0] = 1
            label_blocks.append(y)

    if feature_blocks:
        X = np.vstack(feature_blocks)
        Y = np.concatenate(label_blocks)
    else:
        X = np.array([])
        Y = np.array([])

    np.save(x_path, X)
    np.save(y_path, Y)

    return X, Y

In [54]:
# Build the training matrices from the first 1000 dialogues only. The split
# flag is passed explicitly: it selects the data/X_tr.npy / data/Y_tr.npy cache.
X_tr, Y_tr = get_training_matrices(utterances[:1000], answers[:1000], is_training=True)

In [55]:
# Sanity check: X_tr has one row per entry of Y_tr.
print(X_tr.shape, Y_tr.shape)

(29576, 600) (29576,)


In [57]:
# Fit the logistic regression on the (utterance, candidate) rows and their 0/1 labels.
clf.fit(X_tr, Y_tr)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [58]:
from sklearn.metrics import f1_score

In [64]:
# F1 of the positive class, computed on the same X_tr / Y_tr that were passed
# to clf.fit above - this is a training-set score, not a held-out one.
f1_score(Y_tr, clf.predict(X_tr))

0.0026939655172413795