In [1]:
import re

from allennlp.commands.elmo import ElmoEmbedder
import boto3
import pandas as pd
from datetime import datetime, timedelta
from io import BytesIO
from autolocal.parsers.nlp import Tokenizer
import pickle
import numpy as np
from  tqdm import tqdm
import os
import sys

# Module-level ELMo embedder built with default arguments; the document
# processing loop later in the notebook calls `elmo.embed_sentence` on it.
elmo = ElmoEmbedder()

In [45]:
def read_doc(s3_path):
    """Fetch a text document from the ``autolocal-documents`` S3 bucket.

    Args:
        s3_path: Key of the object inside the bucket.

    Returns:
        The object body decoded as ASCII with undecodable bytes dropped,
        or ``None`` if the object could not be fetched.
    """
    s3 = boto3.resource('s3')
    autolocal_docs_bucket = s3.Bucket('autolocal-documents')
    try:
        body = autolocal_docs_bucket.Object(s3_path).get()['Body'].read()
    except Exception:
        # Best-effort read: any fetch failure means "no document". Catching
        # `Exception` rather than using a bare `except:` lets KeyboardInterrupt
        # and SystemExit propagate, so a long run can still be interrupted.
        return None
    # errors="ignore" drops non-ASCII bytes, so decoding itself cannot fail.
    return body.decode("ascii", "ignore")

In [36]:
def read_metadata():
    """Load the ``autolocal-documents`` DynamoDB table into a DataFrame.

    Scans the whole table (following pagination), parses the ``date`` column
    from ``YYYY-MM-DD`` strings into ``datetime`` objects, and adds a
    ``local_path_pkl`` column derived from ``local_path_txt``.

    Returns:
        pandas.DataFrame: One row per table item, with the parsed ``date``
        column and the added ``local_path_pkl`` column.
    """
    table = boto3.resource('dynamodb', region_name='us-west-1').Table('autolocal-documents')

    response = table.scan()
    data = response['Items']

    # A scan is paginated: keep requesting pages for as long as the previous
    # response carries a `LastEvaluatedKey` to resume from.
    while 'LastEvaluatedKey' in response:
        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
        data.extend(response['Items'])

    metadata = pd.DataFrame(data)
    metadata["date"] = [datetime.strptime(d, '%Y-%m-%d') for d in metadata["date"]]
    # Replace the first 4 characters with "vectors" and the last 3 with "pkl".
    # NOTE(review): this presumably maps "docs/<...>.txt" to
    # "vectors/<...>.pkl"; the slicing is positional, so confirm every
    # `local_path_txt` really has a 4-character prefix and a "txt" suffix.
    metadata['local_path_pkl'] = metadata['local_path_txt'].apply(lambda x: "vectors"+x[4:-3]+"pkl")
    return metadata
# Load the full metadata table once; later cells filter this frame.
metadata = read_metadata()
# Sanity check: show the derived pickle path for the row with index label 0.
print(metadata["local_path_pkl"][0])

vectors/cupertino/Cupertino_2019-07-30_Planning-Commission_Agenda.pkl


In [41]:
# s3.Object('autolocal-documents', 'cities.csv').load()

In [15]:
def read_local(s3_path):
    """Load the locally cached pickle that corresponds to an S3 key.

    Only the basename of ``s3_path`` is used; the file is looked up under
    ``../data/pkls/`` relative to the current working directory.

    Args:
        s3_path: S3 key (or any path) whose basename names the cached pickle.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the cached file does not exist or cannot be opened.
        pickle.UnpicklingError, EOFError: If the file is not a complete pickle.
    """
    local_path = os.path.join("../data/pkls/", os.path.basename(s3_path))
    # SECURITY: pickle.load can execute arbitrary code; only use this on cache
    # files that were written by this notebook, never on untrusted data.
    # The `with` block closes the handle promptly (the original leaked it).
    with open(local_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)

In [18]:
def get_local_pkl(s3_path):
    """Return the local cache path for the pickle named by ``s3_path``.

    Directory components of ``s3_path`` are discarded; only its basename is
    placed under ``../data/pkls/``.
    """
    pkl_name = os.path.basename(s3_path)
    local_path = os.path.join("../data/pkls/", pkl_name)
    return local_path

In [38]:
# Keep only San Jose documents dated on or after 2019-09-01, selected with a
# single combined boolean mask.
metadata_subset = metadata[
    (metadata["date"] >= datetime.strptime('2019-09-01', '%Y-%m-%d'))
    & (metadata["city"] == "San Jose")
]
print(metadata_subset.shape)

(169, 11)


In [27]:
# Module-level S3 resource handle; the processing loop further down reads
# this `s3` name.
s3 = boto3.resource('s3')
# Download `cities.csv` from the documents bucket into the local file `tmp.txt`.
s3.meta.client.download_file(
    Bucket='autolocal-documents',
    Key='cities.csv',
    Filename='tmp.txt',
)

In [50]:

# In[36]:
def write_local(array, s3_path):
    """Pickle ``array`` into the local cache under the basename of ``s3_path``.

    The file is written to ``../data/pkls/<basename>`` relative to the current
    working directory; the cache directory is created if it does not exist.

    Args:
        array: Any picklable object (the processing loop passes a dict with
            ``"sentences"`` and ``"vectors"`` keys).
        s3_path: S3 key (or any path) whose basename names the cache file.
    """
    cache_dir = "../data/pkls/"
    # Create the cache directory on first use instead of failing with
    # FileNotFoundError after an expensive embedding run.
    os.makedirs(cache_dir, exist_ok=True)
    local_path = os.path.join(cache_dir, os.path.basename(s3_path))
    # The `with` block guarantees the file is flushed and closed before the
    # caller reads it back or uploads it (the original never closed it).
    with open(local_path, 'wb') as pkl_file:
        pickle.dump(array, pkl_file)


In [47]:

def sentence_split(s):
    """Split text into sentence-like chunks.

    The text is cut at every ``.``, newline, ``!``, ``?``, double quote or
    form feed. Chunks that are empty or whitespace-only are dropped; the
    remaining chunks are returned unstripped, in order.
    """
    kept = []
    for chunk in re.split('[.\n!?"\f]', s):
        if chunk.strip():
            kept.append(chunk)
    return kept

def tokenize(s):
    """Return every maximal run of word characters (``\\w+``) in ``s``, in order."""
    return re.findall(r'\w+', s)

In [None]:
# s3_client.upload_file('autolocal-documents', , )

In [52]:
# Embed every document in `metadata_subset` with ELMo and upload the result.
#
# For each row:
#   1. skip it if the pickle already exists in the S3 bucket;
#   2. otherwise reuse a readable local pickle, or build one from the text;
#   3. upload the local pickle to S3.
#
# Every handler catches `Exception` rather than using a bare `except:` so that
# Ctrl-C (KeyboardInterrupt) always stops this long-running cell.
print("processing docs")
# One S3 resource for the whole run instead of a new one per document.
s3 = boto3.resource('s3')
# `total` lets tqdm show a real progress bar and ETA for the iterator.
for _, row in tqdm(metadata_subset.iterrows(), total=len(metadata_subset)):
    txt_filename = row['local_path_txt']
    pkl_filename = row['local_path_pkl']

    # 1. `load()` raises if the object cannot be fetched; any failure is
    #    treated as "not uploaded yet" (best effort, as before).
    try:
        s3.Object('autolocal-documents', pkl_filename).load()
    except Exception:
        pass
    else:
        print("already uploaded: {}".format(pkl_filename))
        continue

    # 2. A local pickle counts as done only if it loads cleanly, so a
    #    truncated file left by an interrupted run is regenerated.
    try:
        read_local(pkl_filename)
    except Exception:
        print("processing doc")
        doc_string = read_doc(txt_filename)
        if not doc_string:
            # No text means no pickle to upload; say so instead of failing
            # silently in the upload step.
            print("no text available, skipping: {}".format(txt_filename))
            continue
        sentences = sentence_split(doc_string)
        vectors = [elmo.embed_sentence(tokenize(sentence)) for sentence in sentences]
        write_local({"sentences": sentences, "vectors": vectors}, pkl_filename)
    else:
        print("already processed: {}".format(pkl_filename))

    # 3. The local pickle is known to exist here, so it is not re-loaded just
    #    to check for it. Upload failures are reported, not swallowed.
    print("uploading doc")
    try:
        s3.meta.client.upload_file(get_local_pkl(pkl_filename), 'autolocal-documents', pkl_filename)
    except Exception as err:
        print("upload failed for {}: {}".format(pkl_filename, err))

0it [00:00, ?it/s]

processing docs


5it [00:00,  5.26it/s]

already uploaded: vectors/san-jose/San-Jose_2019-11-07_Miscellaneous-Agendas_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-09-18_Library-And-Early-Education-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-12-04_Planning-Director'S-Hearing_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-02_Historic-Landmarks-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-09_Joint-Meeting-For-The-Rules-And-Open-Government-Committee-And-Committee-Of-The-Whole_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-09-24_City-Council_Minutes.pkl


11it [00:00,  8.89it/s]

already uploaded: vectors/san-jose/San-Jose_2019-10-23_Miscellaneous-Agendas_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-23_Joint-Meeting-For-The-Rules-And-Open-Government-Committee-And-Committee-Of-The-Whole_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-12-10_Clean-Energy-Community-Advisory-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-12-12_Public-Safety,-Finance-And-Strategic-Support-Committee-(Psfss)_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-06_Joint-Meeting-For-The-Rules-And-Open-Government-Committee-And-Committee-Of-The-Whole_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-03_Civil-Service-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-22_City-Council_Minutes.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-09_Board-Of-Fair-Campaign-And-Political-Practices-(Bfcpp)_Agenda.pkl


18it [00:00, 14.02it/s]

already uploaded: vectors/san-jose/San-Jose_2019-11-06_Neighborhoods-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-22_Miscellaneous-Agendas_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-12-11_Miscellaneous-Agendas_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-13_Joint-Meeting-For-The-Rules-And-Open-Government-Committee-And-Committee-Of-The-Whole_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-13_Planning-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-09-19_Public-Safety,-Finance-And-Strategic-Support-Committee-(Psfss)_Agenda.pkl


25it [00:01, 17.68it/s]

already uploaded: vectors/san-jose/San-Jose_2019-11-21_Human-Services-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-11_Historic-Landmarks-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-28_Miscellaneous-Agendas_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-27_Miscellaneous-Agendas_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-04_Miscellaneous-Agendas_Agenda.pkl


29it [00:01, 20.90it/s]

already uploaded: vectors/san-jose/San-Jose_2019-09-25_Planning-Director'S-Hearing_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-09-23_Community-&-Economic-Development-Committee-(Ced)_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-17_Human-Services-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-03_Smart-Cities-And-Service-Improvements-Committee_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-26_Joint-Meeting-For-The-Rules-And-Open-Government-Committee-And-Committee-Of-The-Whole_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-12-12_Appeals-Hearing-Board_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-09-18_Miscellaneous-Agendas_Agenda.pkl


36it [00:01, 22.11it/s]

already uploaded: vectors/san-jose/San-Jose_2019-09-11_Planning-Director'S-Hearing_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-17_Miscellaneous-Agendas_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-22_Planning-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-09-10_City-Council_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-23_Planning-Commission_Agenda.pkl


42it [00:01, 24.38it/s]

already uploaded: vectors/san-jose/San-Jose_2019-10-28_Community-&-Economic-Development-Committee-(Ced)_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-12-16_Youth-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-20_Library-And-Early-Education-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-09-05_Clean-Energy-Community-Advisory-Commission_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-21_Public-Safety,-Finance-And-Strategic-Support-Committee-(Psfss)_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-11-04_Transportation-And-Environment-Committee-(T&E)_Agenda.pkl


45it [00:01, 25.30it/s]

already uploaded: vectors/san-jose/San-Jose_2019-09-19_Miscellaneous-Agendas_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-09_Planning-Director'S-Hearing_Agenda.pkl
processing doc
uploading doc
processing doc
uploading doc
processing doc
uploading doc


48it [01:31,  8.97s/it]

processing doc
uploading doc


49it [03:00, 33.01s/it]

processing doc
uploading doc


50it [03:54, 39.32s/it]

processing doc
uploading doc


51it [04:20, 35.40s/it]

processing doc
uploading doc


52it [05:56, 53.46s/it]

processing doc
uploading doc


53it [10:01, 110.94s/it]

processing doc
uploading doc


54it [19:27, 247.38s/it]

processing doc
uploading doc


55it [20:58, 200.71s/it]

processing doc
uploading doc


56it [20:59, 140.77s/it]

processing doc
uploading doc


57it [22:15, 121.16s/it]

processing doc
uploading doc


58it [23:49, 113.21s/it]

processing doc
uploading doc


59it [24:51, 97.71s/it] 

processing doc
uploading doc


60it [26:08, 91.56s/it]

processing doc
uploading doc


61it [28:41, 109.96s/it]

processing doc
uploading doc


62it [29:11, 86.02s/it] 

processing doc
uploading doc


63it [30:52, 90.46s/it]

processing doc
uploading doc


64it [31:46, 79.44s/it]

already uploaded: vectors/san-jose/San-Jose_2019-11-13_Miscellaneous-Agendas_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-09-18_Planning-Director'S-Hearing_Agenda.pkl
already uploaded: vectors/san-jose/San-Jose_2019-10-31_Miscellaneous-Agendas_Agenda.pkl
processing doc


64it [33:41, 31.59s/it]


KeyboardInterrupt: 