In [17]:
import pandas as pd
import numpy as np

In [18]:
def load_glove_words(filename):
    """Load word vectors from a GloVe-style text file into a dict.

    Every non-blank line is split on whitespace: the first field is used as
    the word and the remaining fields are parsed as floats to form its vector.

    use case: data/glove.6B.50d.txt

    :param str filename: path/name to file to load
    :returns: mapping of word -> embedding vector (1D ``numpy.ndarray``)
    :rtype: dict
    :raises ValueError: if a vector component cannot be parsed as a float
    """

    print("Loading Glove Model")
    gloveModel = {}
    # The context manager closes the file handle even if parsing fails midway.
    with open(filename, 'r', encoding='utf8') as f:
        for line in f:
            splitLines = line.split()
            # Blank lines (e.g. trailing newlines at the end of the file)
            # carry no word; skip them instead of failing on splitLines[0].
            if not splitLines:
                continue
            word = splitLines[0]
            wordEmbedding = np.array([float(value) for value in splitLines[1:]])
            gloveModel[word] = wordEmbedding
    return gloveModel

In [60]:
from approject.cosinesimilarity import cosine_similarity

In [116]:
import hashlib

In [131]:
!ls data/results

3b5842d322-embeddings.pkl
3b5842d322-embeddings.pkl-luigi-tmp-9210656682
article_title
e3e0fcca75-embeddings.pkl
e3e0fcca75-embeddings.pkl-luigi-tmp-7912043872
text


In [133]:
pd.read_pickle('data/results/article_title/top_10/e3e0fcca75-embeddings.pkl')

Unnamed: 0,article_title,article_date,text
151,"No debate, no download: why I won't be taking ...",April 27 2020,COVIDSafe is built on legally shaky foundation...
65,Illinois Stay-at-Home Order Modified and Exten...,April 27 2020,Illinois has been under a “Stay-at-Home” Execu...
24,[FCRA] A Bridge Too Far: Ninth Circuit Rejects...,April 28 2020,"Last week, in Luna v. Hansen & Adkins Auto Tra..."
127,What Expenses Can You Claim For Your Home Office?,April 27 2020,"Among the many impacts of COVID-19, non-essent..."
83,What You Should Know About COVID-19 and the AD...,April 28 2020,"The EEOC recently updated its guidance, What Y..."
9,U.S. Department of Labor Updates Q&A on the Fa...,March 31 2020,"Last week, we published a client alert on the ..."
108,Episode 313: Is the international law of cyber...,April 28 2020,Click here to listen to the audio.\nIn today’s...
40,COVID-19: Read This Before You Take the Temper...,April 28 2020,"You want to reopen your place of business, and..."
93,"To Record or Not To Record, That is the Questi...",April 28 2020,The federal Occupational Safety and Health Act...
34,U.S. Department of Labor Issues Q&A on the Fam...,March 25 2020,"Last week, we published a client alert on the ..."


In [105]:
# NOTE: this cell relies on `emb`, `raw` and `embedings`, which are created in
# cells located further down in this notebook; run those cells first.

# Embed the query title once, outside the lambda: it is the same for every
# item, so there is no need to re-tokenize and re-embed it on each call.
query_vec = emb.embed_document(raw.article_title[0])
# Cosine similarity between the query vector and each item passed by `apply`.
cos_similarity = embedings.apply(lambda x: cosine_similarity(query_vec, x))
# Turn similarity into a distance, so smaller values mean "more similar".
distances = 1 - cos_similarity

In [114]:
# Display the rows of `raw` that correspond to the 5 smallest `distances`.
# NOTE(review): `.iloc` selects by integer position, while
# `nsmallest(5).index` yields index labels. The two only agree if `distances`
# carries a default 0..n-1 index aligned with the row order of `raw` —
# TODO confirm, or switch to label-based selection.
raw.iloc[distances.nsmallest(5).index]

Unnamed: 0,article_title,article_date,text
0,Open for business: how 'essential' businesses ...,April 29 2020,Introduction\nMost states have issued some for...
56,Georgia Allows Most Businesses to Reopen to th...,April 28 2020,Reopening the doors of your business can also ...
51,What Businesses Can Do to Ease the Transition ...,April 28 2020,As governments start easing stay-at-home order...
95,Developing Leave Policies to Keep Up with the ...,April 17 2020,Many employers that did not previously have a ...
58,America Reopens: What Employers Need To Be Thi...,April 20 2020,"On April 16, 2020, President Trump unveiled br..."


In [95]:
# Load the pickled object used as `embedings` by the cosine-similarity cell
# (presumably precomputed article-title embeddings, judging by the file
# name — TODO confirm). That cell appears earlier in the notebook, so this one
# has to be executed before it.
embedings = pd.read_pickle('data/clean/article_title-embeddings.pkl')

In [86]:
# Load the pickled articles into `raw`. Cells that appear earlier in the
# notebook read `raw.article_title` and `raw.iloc[...]`, so this cell has to
# be executed before them.
raw = pd.read_pickle('data/raw/raw_articles.pkl')

In [89]:
raw.iloc[11].article_title    

'Class Notice Interference On The Defense: Court Penalizes Defendants And Attorney\r Blog  Workplace Class Action Blog'

In [94]:
# Build the embedding helper from `a`, the word -> vector dict returned by
# `load_glove_words`. Both `a` and the `WordEmbedding` class are defined in
# cells located further down in this notebook; run those cells first.
emb = WordEmbedding(a)
# Embed a single article title; the cell output is the resulting vector.
emb.embed_document(raw.iloc[11].article_title)

array([ -2.86015   ,  -0.85998   ,  -2.370395  ,   2.525222  ,
         0.730721  ,   2.432129  ,  -0.627184  ,  -0.638371  ,
        -1.18056123,  -2.43121   ,  -2.981668  ,   2.0320584 ,
        -5.16771   ,  -0.2786595 ,   4.5014385 ,  -3.755069  ,
        -2.360868  ,  -2.77337   ,  -0.1688858 ,  -4.079119  ,
         3.592811  ,   3.063085  ,   0.13261   ,   3.026829  ,
        -2.9788621 , -23.66595   ,  -0.16837   ,  -4.702991  ,
        -1.1091335 ,  -2.338515  ,  36.99166   ,  -1.83837   ,
        -4.191136  , -10.13997   ,  -0.66598807,  -1.1639621 ,
         3.540493  ,  -3.6271557 ,  -3.030733  ,  -0.48614   ,
         1.135703  ,   1.5535399 ,   4.2336233 ,   7.533709  ,
        -1.268103  ,  -2.5334956 ,  -3.4231041 ,   3.126377  ,
         2.265587  ,   4.805361  ])

In [92]:
import numpy as np 
import re



### fix for new data 

class WordEmbedding(object):
    """Word-vector lookup table with a simple bag-of-words document embedder.

    :param dict words: mapping of word -> vector (all vectors are expected to
        have the same length so they stack into a 2D array)
    """

    def __init__(self, words):
        # Vocabulary, in the iteration order of the input mapping
        self.words = list(words.keys())
        # Matrix of vectors; row i belongs to self.words[i]
        self.vecs = np.array(list(words.values()))
        # word -> row index. Gives constant-time lookups instead of scanning
        # the whole vocabulary list for every token.
        self._index = {word: idx for idx, word in enumerate(self.words)}

    def __call__(self, word):
        """Embed a word

        :param word: the token to look up
        :returns: vector, or None if the word is outside of the vocabulary
        :rtype: ndarray
        """
        try:
            idx = self._index.get(word)
        except TypeError:
            # Unhashable input can never be a vocabulary entry
            return None
        if idx is None:
            return None
        # Return the row of the matrix that belongs to this word
        return self.vecs[idx, :]

    @classmethod
    def from_files(cls, word_file):
        """Instantiate an embedding from a GloVe-style text file

        Example::

            embedding = WordEmbedding.from_files('data/glove.6B.50d.txt')

        :param str word_file: path to the text file, passed to
            ``load_glove_words``
        :rtype: cls
        """
        # Parse the file into a word -> vector dict and wrap it
        return cls(load_glove_words(word_file))

    def tokenize(self, sentence):
        """Lower-case a string and split it into tokens

        Tokens are runs of word characters (optionally joined by
        apostrophes, e.g. ``don't``) and the punctuation marks ``.,!?;``.

        :param str sentence: text to tokenize
        :returns: list of tokens
        :rtype: list
        """
        tok_sentences = re.findall(r"[\w]+[']*[\w]+|[\w]+|[.,!?;]", sentence.lower())

        return tok_sentences

    def embed_document(self, text):
        """Convert text to vector, by finding vectors for each word and combining

        The document vector is the element-wise sum of the vectors of all
        in-vocabulary tokens; unknown tokens are ignored.

        :param str text: the document (one or more words) to get a vector
            representation for

        :return: vector representation of document; an all-zero vector when
            none of the tokens is in the vocabulary
        :rtype: ndarray (1D)
        """
        # Split the text into tokens
        tokens = self.tokenize(text)
        # Look up each token, dropping those outside of the vocabulary
        known = [vec for vec in map(self.__call__, tokens) if vec is not None]
        if not known:
            # Summing an empty list would yield the scalar 0.0; return zeros
            # shaped like a single word vector so the result stays a vector.
            return np.zeros(self.vecs.shape[1:], dtype=self.vecs.dtype)
        # Combine the word vectors into one document vector
        return np.sum(known, axis=0)
  

In [19]:

# Path to the GloVe text file to load.
emb_path = 'data/glove.6B.50d.txt'
# Parse the file into a word -> vector dict. `a` is what the
# `WordEmbedding(a)` cell consumes.
a = load_glove_words(emb_path)

Loading Glove Model


In [22]:
a.get('the')

0.24968