In [75]:
import warnings
warnings.filterwarnings("ignore")
from gensim.models import KeyedVectors, FastText
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pickle
import operator

# import natural language toolkit
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from nltk.corpus   import stopwords
from nltk.tokenize import word_tokenize

import string

In [76]:
from modules.library.postgresql import PostgresQL
from modules.library.word_models import WordModels
from modules.library.document_similarity import DocumentSimilarity
from modules.library.document_models import DocumentModels

# Accessing the database

Using the module PostgresQL we will load the documents from our database and store them in a list 'documents'.

In [77]:
# connect to the database
# Connection settings are read from the standard libpq environment variables so
# that no credential is stored in the notebook. Database and user fall back to
# the local defaults; the password is prompted for when PGPASSWORD is unset.
import os
from getpass import getpass

pg = PostgresQL()
pg.connect(
    database=os.environ.get("PGDATABASE", "eurlex_environment_only"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
)

In [78]:
# Select every column of every row in the `documents` table; later cells read
# the 'document_text' field of each returned row.
documents_query = """
    SELECT * FROM documents;
"""
documents = pg.execute(documents_query)

# Word embedding model

In this section we will show how to use the module WordModels to load or train a word embedding model. To avoid running time-consuming commands, we will comment them out and use pickle instead to store and access already loaded or trained models.

## Training a word embedding model

In case we want to use pre-trained models to train another model on our data, we can use a method in module WordModels called 'train' in the following way:

In [79]:
# Tiny toy corpus, used only to demonstrate the training API.
texts = [
    "dogs like chasing cats",
    "cats like chasing mice",
    "mice eat cheese",
    "cheese has holes",
    "earth flat",
    "nobody knows working",
]

# texts is a list of stripped strings
# NOTE(review): the vocabulary printed by the next cell consists of single
# characters rather than words -- confirm whether `train` expects
# pre-tokenized sentences instead of raw strings.
sample_model = WordModels()
sample_model.train(
    texts,
    size=300,
    window=1,
    min_count=1,
    epochs=10,
)

In [80]:
# Inspect the vocabulary keys of the model trained on the toy corpus.
sample_model.embedding.vocab.keys()

dict_keys(['d', 'o', 'g', 's', ' ', 'l', 'i', 'k', 'e', 'c', 'h', 'a', 'n', 't', 'm', 'r', 'f', 'b', 'y', 'w'])

In [81]:
# The object we trained is the project's WordModels wrapper ...
type(sample_model)

modules.library.word_models.WordModels

In [82]:
# ... and its `embedding` attribute is the underlying gensim keyed-vectors object.
type(sample_model.embedding)

gensim.models.keyedvectors.FastTextKeyedVectors

## Loading a word embedding model

If a model has already been trained by the user, or has been provided by another source, the module WordModels enables us to load it and use it.

In [83]:
# Load the pre-trained aligned English fastText vectors into `wiki_en_model`,
# which the document-embedding cells below require. Loading the .vec file is
# time consuming, so the loaded model is cached as a pickle and reused on
# subsequent runs instead of being re-parsed.
import os

wiki_en_path = '../data/fasttext/wiki.en.align.vec'
wiki_en_cache_path = 'wwe.pkl'

if os.path.exists(wiki_en_cache_path):
    # NOTE: unpickling can execute arbitrary code -- only load a cache file
    # that was produced locally by this notebook.
    with open(wiki_en_cache_path, "rb") as f:
        wiki_en_model = pickle.load(f)
else:
    wiki_en_model = WordModels()
    wiki_en_model.load(wiki_en_path)
    with open(wiki_en_cache_path, "wb") as f:
        pickle.dump(wiki_en_model, f, protocol=-1)

In [84]:
# with open("wwe.pkl", "wb") as f:
#     pickle.dump(wiki_en_model, f, protocol=-1)

In [85]:
# with open("wwe.pkl", "rb") as f:
#     wiki_en_model = pickle.load(f)

## Embedding documents

We have some documents saved in a list 'documents'. Let's embed them using DocumentModels module.

In [86]:
# Pull the 'document_text' field out of every fetched row, preserving the
# order of `documents`.
document_texts = list(map(operator.itemgetter('document_text'), documents))

In [87]:
# Stop-word list handed to DocumentModels: NLTK's English stop words followed
# by each character of string.punctuation.
stop_words = [*stopwords.words('english'), *string.punctuation]

# Document model built from the loaded word embedding, the raw document texts
# and the stop-word list.
document_model = DocumentModels(
    wiki_en_model.embedding,
    document_texts,
    stop_words,
)

In [105]:
# Embed the documents held by document_model; presumably this populates
# document_model.embedding, which the following cells read.
document_model.embed_documents()

In [111]:
# Peek at the first five components of the first entry in the embedding.
document_model.embedding[0][:5]

array([-0.01304537, -0.00972957, -0.04939726,  0.02375953, -0.03552775],
      dtype=float32)

In [112]:
# Remove the first stored document from the model (this mutates document_model
# in place; the next cell shows that index 0 now holds a different vector).
document_model.remove_documents([document_model.documents[0]])

In [113]:
# Same peek as before the removal, to compare the first entry of the embedding.
document_model.embedding[0][:5]

array([-0.02488105, -0.00911847, -0.0197291 ,  0.0392039 , -0.01784342],
      dtype=float32)

In [114]:
# Add the text of the original first document back to the model (the next cell
# checks the last entry of the embedding for it).
document_model.add_documents([document_texts[0]])

In [115]:
# First five components of the last entry in the embedding.
document_model.embedding[-1][:5]

array([-0.01304537, -0.00972957, -0.04939726,  0.02375953, -0.03552775],
      dtype=float32)

## Document Similarity Analysis

In order to do analysis on a corpus of documents we will use module 'DocumentSimilarity'. Below are some examples of use:

In [94]:
# Build the similarity helper from the document embeddings.
# NOTE(review): this cell's execution count (94) is lower than those of the
# remove/add cells above (112-115), so the outputs recorded in this section
# were produced before those mutations -- re-run top to bottom to confirm.
ds = DocumentSimilarity(document_model.embedding)

In [95]:
# Apply the euclid_similarity measure to the first two entries of the embedding.
ds.euclid_similarity(document_model.embedding[0], document_model.embedding[1])

0.28714463

In [96]:
# Nearest-neighbour query for the first entry of the embedding.
list_of_neighbors = ds.k_nearest_neighbors(
    document_model.embedding[0],  # query vector
    10,                           # presumably k, the number of neighbours -- confirm
    ds.euclid_similarity,         # measure used to compare vectors
)

In [97]:
# Show the query result (presumably indices into the embedding -- confirm).
list_of_neighbors

[0, 123156, 53428, 43728, 45106, 48419, 50157, 52568, 47395, 53516]