In [81]:
%%time
from allennlp.commands.elmo import ElmoEmbedder
import boto3
import pandas as pd
from datetime import datetime, timedelta
from io import BytesIO
from autolocal.parsers.nlp import Tokenizer
import pickle
import numpy as np
from  tqdm import tqdm

CPU times: user 38 µs, sys: 1 µs, total: 39 µs
Wall time: 43.9 µs


In [None]:
%%time
# Build the ELMo embedder with its default arguments; the loop and demo cells
# below call `elmo.embed_sentence(...)` on this single shared instance.
elmo = ElmoEmbedder()

In [119]:
# Shared S3 handles used by the read/write helpers in this notebook.
# NOTE(review): `s3` is a boto3 *resource* (object-oriented API), not a client,
# so client-only calls such as `get_object` should not be made on it — confirm
# against the boto3 docs before using `s3` directly.
s3 = boto3.resource('s3')
autolocal_docs_bucket = s3.Bucket('autolocal-documents')
def read_doc(s3_path):
    """Fetch one text document from the `autolocal-documents` bucket.

    Args:
        s3_path: Object key inside the bucket.

    Returns:
        The object body decoded as ASCII with undecodable bytes silently
        dropped, or None when the object cannot be fetched or read.
    """
    try:
        return autolocal_docs_bucket.Object(s3_path).get()['Body'].read().decode("ascii", "ignore")
    except Exception:
        # Best-effort read: any fetch/decode failure maps to None so callers can
        # skip the document. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit propagate, so a long-running loop over
        # many documents can still be stopped from the keyboard.
        return None

In [97]:
def read_metadata():
    """Load every item of the `autolocal-documents` DynamoDB table into a DataFrame.

    A single `scan` call returns at most one page of results, so the scan is
    repeated with `ExclusiveStartKey` until DynamoDB stops returning
    `LastEvaluatedKey`; otherwise documents beyond the first page would be
    silently missing.

    Returns:
        pandas.DataFrame with one row per table item, where
        - `date` is parsed from its 'YYYY-MM-DD' string form, and
        - `local_path_npy` is `local_path_txt` with its last three characters
          replaced by "npy".
    """
    table = boto3.resource('dynamodb', region_name='us-west-1').Table('autolocal-documents')

    items = []
    scan_kwargs = {}
    while True:
        page = table.scan(**scan_kwargs)
        items.extend(page["Items"])
        last_key = page.get("LastEvaluatedKey")
        if not last_key:
            break
        # Resume the scan right after the last item of the previous page.
        scan_kwargs["ExclusiveStartKey"] = last_key

    metadata = pd.DataFrame(items)
    metadata["date"] = [datetime.strptime(d, '%Y-%m-%d') for d in metadata["date"]]
    # Assumes every text path ends in a three-character extension such as "txt".
    metadata['local_path_npy'] = metadata['local_path_txt'].apply(lambda x: x[:-3]+"npy")
    return metadata
# Load the document metadata once; later cells filter this frame by date and
# read the `local_path_txt` / `local_path_npy` columns from it.
metadata = read_metadata()

In [71]:
def read_npy(s3_path):
    """Download a `.npy` object from the `autolocal-documents` bucket.

    Args:
        s3_path: Object key inside the bucket.

    Returns:
        The numpy array stored under that key.
    """
    # The notebook's `s3` is a boto3 resource, which does not expose the
    # client-level `get_object`; go through the bucket handle instead, the same
    # way `read_doc` does.
    payload = autolocal_docs_bucket.Object(s3_path).get()['Body'].read()
    return np.load(BytesIO(payload))
    
def write_npy(array, s3_path):
    """Serialize `array` in `.npy` format and upload it to the `autolocal-documents` bucket.

    Args:
        array: numpy array (or array-like accepted by `np.save`) to store.
        s3_path: Destination object key inside the bucket.
    """
    # Serialize in memory: no scratch file is left in the working directory and
    # the uploaded body is the array's bytes rather than a file name.
    buffer = BytesIO()
    np.save(buffer, array)
    # boto3 actions accept keyword arguments only; `Body` carries the payload.
    autolocal_docs_bucket.put_object(Body=buffer.getvalue(), Key=s3_path)

In [38]:
# Project tokenizer (autolocal.parsers.nlp) built with default arguments; the
# embedding cells call `simple_tokenizer.tokenize(text)` before running ELMo.
simple_tokenizer = Tokenizer()

In [98]:
# Earliest document date admitted by each named filter, expressed as an offset
# from the moment this cell runs. A value of None means "no lower bound".
# Note that 'this_month' reaches back five weeks.
starting_dates_for_filtering = {
    label: (datetime.now() + offset if offset is not None else None)
    for label, offset in (
        ('upcoming_only', timedelta(days=0.5)),
        ('upcoming', timedelta(days=0.5)),
        ('this_week', timedelta(weeks=-1)),
        ('this_year', timedelta(days=-365)),
        ('this_month', timedelta(weeks=-5)),
        ('past_six_months', timedelta(days=-183)),
        ('past', None),
        ('all', None),
    )
}

In [101]:
# Keep only documents dated on or after the 'this_month' cutoff.
metadata_this_month = metadata.loc[metadata["date"].ge(starting_dates_for_filtering['this_month'])]

(38, 10)

In [122]:
# Embed every recent document with ELMo and upload the vectors next to the text.
# The first 15 rows are skipped — presumably they were already embedded by an
# earlier, interrupted run (TODO confirm before re-running from scratch).
count = 0
for i, row in tqdm(metadata_this_month.iterrows()):
    count += 1
    if count <= 15:
        continue

    txt_filename = row['local_path_txt']
    npy_filename = row['local_path_npy']
    doc_string = read_doc(txt_filename)
    if not doc_string:
        # Missing or empty document: nothing to embed.
        continue

    doc_tokens = simple_tokenizer.tokenize(doc_string)
    # The whole document is passed to ELMo as a single "sentence".
    doc_vectors = elmo.embed_sentence(doc_tokens)
    write_npy(doc_vectors, npy_filename)
    print(i)

16it [00:00, 85.49it/s]

792
921


19it [01:35,  9.58s/it]

961


20it [01:55, 12.70s/it]

972


21it [02:28, 18.56s/it]

1134


22it [07:19, 100.41s/it]

1177


24it [07:50, 74.95s/it] 

1411


25it [08:07, 57.68s/it]

1422


26it [08:19, 43.70s/it]

1479


27it [09:40, 55.17s/it]

1505


28it [10:04, 45.66s/it]

1569


29it [10:14, 35.01s/it]

1710


30it [11:00, 38.13s/it]

1911


31it [11:55, 43.39s/it]

1953


32it [12:17, 36.79s/it]

1965


33it [12:25, 28.16s/it]

1969


34it [12:26, 20.15s/it]

2021


35it [12:40, 18.33s/it]

2047


36it [12:42, 13.38s/it]

2156


37it [13:17, 19.87s/it]

2273


38it [13:40, 21.59s/it]

2398





In [3]:

# Smoke test: embed one short, pre-tokenized sentence to inspect the output layout.
tokens = ["I", "ate", "an", "apple", "for", "breakfast"]
vectors = elmo.embed_sentence(tokens)
# Expected dims for this input: (LAYERS(3), TOKENS(6), DIMENSIONS(1024)) —
# one 1024-d vector per token for each of the three ELMo layers described below.
"""
https://towardsdatascience.com/elmo-helps-to-further-improve-your-word-embeddings-c6ed2c9df95f
In the ELMo paper, there are 3 layers of word embedding,
layer zero is the character-based context independent layer,
followed by two Bi-LSTM layers. The authors have empirically
shown that the word vectors generated from the first Bi-LSTM
layer can better capture the syntax, and the second layer can
capture the semantics better.
"""

100%|██████████| 336/336 [00:00<00:00, 770522.77B/s]
100%|██████████| 374434792/374434792 [00:33<00:00, 11035846.17B/s]


In [55]:
%%time
# Time the tokenize + embed path on a single document (the row labelled 0).
text_filename = metadata['local_path_txt'][0]
npy_filename = metadata['local_path_npy'][0]
# NOTE(review): read_doc returns None on failure, which is passed straight to
# the tokenizer here — confirm the document exists before relying on this cell.
doc_tokens = simple_tokenizer.tokenize(read_doc(text_filename))
doc_vectors = elmo.embed_sentence(doc_tokens)

CPU times: user 57.2 s, sys: 1.03 s, total: 58.3 s
Wall time: 32.3 s


In [74]:
# Upload the vectors computed for the single test document above.
write_npy(doc_vectors, npy_filename)

In [73]:
# Display the key of the document that was just embedded.
text_filename

'docs/cupertino/Cupertino_2019-07-30_Planning-Commission_Agenda.txt'

In [None]:
class DocTextReader():
    """Reads document text from the `autolocal-documents` S3 bucket and tokenizes it."""

    def __init__(self, log_every=100):
        """Open the bucket handle.

        Args:
            log_every: Print a progress line every `log_every` documents
                attempted. A falsy value (0 / None) disables progress output.
        """
        self.log_every = log_every
        s3 = boto3.resource('s3', region_name='us-west-1')
        self.bucket = s3.Bucket('autolocal-documents')

    def read_document_string(self, s3_path):
        """Return the text stored under `s3_path`.

        The body is decoded as ASCII with undecodable bytes dropped, matching
        the notebook-level `read_doc` helper. Unlike `read_doc`, errors are not
        swallowed here; `read_docs` handles them.
        """
        return self.bucket.Object(s3_path).get()['Body'].read().decode("ascii", "ignore")

    def read_docs(self, s3_paths):
        """Read and tokenize every document in `s3_paths`.

        Args:
            s3_paths: Sized iterable of object keys.

        Returns:
            Dict mapping each successfully read key to
            `{"original_text": str, "tokens": list}`. Keys that fail are left
            out; on the eleventh failure reading stops early and the documents
            gathered so far are returned.
        """
        # NOTE(review): `preprocess_string` and `preprocess_filters` are not
        # defined in the visible part of this notebook — confirm they are
        # imported/defined before this class is used, otherwise every document
        # fails with a NameError that is reported below as "Key not found".
        documents = {}
        n_docs_total = len(s3_paths)
        n_failures = 0

        for n_docs_read, s3_path in enumerate(s3_paths):
            try:
                doc_string = self.read_document_string(s3_path)
                doc_tokens = preprocess_string(doc_string, filters=preprocess_filters)
                documents[s3_path] = {
                    "original_text": doc_string,
                    "tokens": doc_tokens
                }
            except Exception as e:
                if n_failures < 10:
                    print("Key not found: {}".format(s3_path))
                elif n_failures == 10:
                    # Too many failures: report the latest error and give up.
                    print("More than 10 keys not found")
                    print(e)
                    break
                n_failures += 1
            # Guard against log_every == 0, which would otherwise raise
            # ZeroDivisionError in the modulo below.
            if self.log_every and n_docs_read % self.log_every == 0:
                print("{} of {} documents read".format(n_docs_read, n_docs_total))

        return documents