In [107]:
from autolocal.aws import aws_config
import boto3
import pandas as pd
from datetime import datetime, timedelta
import re
import os
import pickle
from allennlp.commands.elmo import ElmoEmbedder
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import editdistance

In [91]:
elmo = ElmoEmbedder()

In [13]:
def read_metadata():
    """Load the whole document-metadata table from DynamoDB into a DataFrame.

    The table name and region come from ``aws_config``. Every page of the
    scan is fetched, so the returned frame holds all items in the table.

    Returns:
        pandas.DataFrame built from the scanned items, with two derived
        columns:
          * ``date`` -- the original ``'%Y-%m-%d'`` strings parsed to datetimes;
          * ``local_path_pkl`` -- ``local_path_txt`` with its first four
            characters replaced by ``"vectors"`` and its last three by
            ``"pkl"`` (e.g. ``docs/x/y.txt`` -> ``vectors/x/y.pkl``).
    """
    table = boto3.resource(
        'dynamodb',
        region_name=aws_config.region_name,
        ).Table(aws_config.db_document_table_name)

    # A scan call returns one page at a time; keep following
    # LastEvaluatedKey until the response no longer carries one.
    response = table.scan()
    data = response['Items']
    while 'LastEvaluatedKey' in response:
        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
        data.extend(response['Items'])

    metadata = pd.DataFrame(data)
    metadata["date"] = [datetime.strptime(d, '%Y-%m-%d') for d in metadata["date"]]
    metadata['local_path_pkl'] = metadata['local_path_txt'].apply(lambda x: "vectors"+x[4:-3]+"pkl")
    return metadata

In [15]:
metadata = read_metadata()

In [133]:
def find_relevant_filenames(queries, metadata, start_date=None, end_date=None, agenda_only=False):
    """Return the text-file paths of documents that any of the queries could match.

    Args:
        queries: iterable of dicts, each with a ``"municipalities"`` collection
            of city names.
        metadata: DataFrame with ``date``, ``city``, ``doc_type`` and
            ``local_path_txt`` columns.
        start_date: optional inclusive lower bound on ``date``; ``None``
            disables the bound.
        end_date: optional inclusive upper bound on ``date``; ``None``
            disables the bound.
        agenda_only: when true, keep only rows whose ``doc_type`` is ``"Agenda"``.

    Returns:
        list of ``local_path_txt`` values, in metadata row order.
    """
    # Union of the cities requested by all queries.
    cities = set()
    for query in queries:
        cities.update(query["municipalities"])

    potential_documents = metadata

    # Both bounds are optional; comparing the column against None would raise.
    if start_date is not None:
        potential_documents = potential_documents[potential_documents["date"] >= start_date]
    if end_date is not None:
        potential_documents = potential_documents[potential_documents["date"] <= end_date]

    # isin() always yields a boolean mask, even when no rows are left. A plain
    # Python list of booleans degenerates to `df[[]]` (a column selection) on
    # an empty frame and drops every column.
    potential_documents = potential_documents[potential_documents["city"].isin(cities)]

    if agenda_only:
        potential_documents = potential_documents[potential_documents["doc_type"] == "Agenda"]

    return list(potential_documents['local_path_txt'])

In [58]:
def sentence_split(s):
    """Break text into fragments at sentence-like delimiters.

    The text is cut at every period, newline, exclamation mark, question
    mark, double quote and form feed. Fragments that are empty or only
    whitespace are dropped; surviving fragments are returned unstripped.
    """
    kept = []
    for fragment in re.split('[.\n!?"\f]', s):
        if fragment.strip():
            kept.append(fragment)
    return kept

def tokenize(s):
    """Return every maximal run of word characters in ``s``, in order."""
    return [match.group(0) for match in re.finditer(r'\w+', s)]

In [66]:
def get_local_pkl_path(s3_path):
    """Map an S3 key to its local cache path: ``../data/pkls/<basename of key>``."""
    filename = os.path.basename(s3_path)
    local_path = os.path.join("../data/pkls/", filename)
    return local_path

In [64]:
def read_vectors(pkl_filename):
    """Load the pickled vectors for a document, downloading them on a cache miss.

    The local cache copy (see ``get_local_pkl_path``) is tried first. If it
    cannot be read for any reason, the failure is printed and the object is
    downloaded from the document bucket named in ``aws_config`` to the cache
    location, then loaded from there.

    Args:
        pkl_filename: S3 key of the pickle inside the document bucket.

    Returns:
        Whatever object the pickle contains.

    Raises:
        Any exception from the S3 download or from loading the downloaded
        file; only the first (local) attempt is best-effort.

    NOTE(review): ``pickle.load`` executes code embedded in the file. This is
    only safe while the bucket contents are trusted.
    """
    local_pkl_path = get_local_pkl_path(pkl_filename)
    try:
        # Context manager so the handle is closed even if unpickling fails.
        with open(local_pkl_path, 'rb') as pkl_file:
            return pickle.load(pkl_file)
    except Exception as e:
        print("failed to load local vectors")
        print(e)

    # Cache miss (or unreadable cache file): fetch a fresh copy from S3.
    # download_file needs the target directory to exist already.
    os.makedirs(os.path.dirname(local_pkl_path), exist_ok=True)
    s3 = boto3.resource('s3')
    s3.meta.client.download_file(
        aws_config.s3_document_bucket_name,
        pkl_filename,
        local_pkl_path)
    with open(local_pkl_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)

In [59]:
def read_doc(s3_path):
    """Fetch a document from the S3 document bucket and return its text.

    Args:
        s3_path: key of the object inside the bucket named by
            ``aws_config.s3_document_bucket_name``.

    Returns:
        The object body decoded as ASCII with undecodable bytes dropped, or
        ``None`` if the object could not be fetched or decoded.
    """
    s3 = boto3.resource(
        's3',
        region_name=aws_config.region_name
        )
    autolocal_docs_bucket = s3.Bucket(
        aws_config.s3_document_bucket_name
        )
    try:
        return autolocal_docs_bucket.Object(s3_path).get()['Body'].read().decode("ascii", "ignore")
    except Exception:
        # Best-effort read: callers treat None as "document unavailable".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit still stop a long-running cell.
        return None

In [132]:
def read_docs(s3_paths):
    """Read each document's text and its precomputed vectors.

    For every path the text is fetched with ``read_doc``, split into
    sentences with ``sentence_split``, and paired with the vectors loaded by
    ``read_vectors``. A document is included only when both the text and the
    vectors are available: paths for which ``read_doc`` returns nothing are
    skipped silently, and a failed vector load is printed and skipped.

    Args:
        s3_paths: iterable of text-file keys; the vector key is derived by
            replacing the first four characters with ``"vectors"`` and the
            last three with ``"pkl"``.

    Returns:
        dict mapping each successfully read path to a dict with keys
        ``"original_text"``, ``"sentences"`` and ``"vectors"``.
    """
    documents = {}
    n_failures = 0

    for s3_path in s3_paths:
        pkl_path = "vectors" + s3_path[4:-3] + "pkl"
        try:
            doc_string = read_doc(s3_path)
            if doc_string:
                doc_sentences = sentence_split(doc_string)
                try:
                    vectors = read_vectors(pkl_path)
                    documents[s3_path] = {
                        "original_text": doc_string,
                        "sentences": doc_sentences,
                        "vectors": vectors
                    }
                except Exception as e:
                    print('missing vectors for: {}'.format(pkl_path))
                    print(e)
        except Exception as e:
            # Report the first ten unexpected failures, then give up on the
            # remaining paths once an eleventh occurs.
            if n_failures < 10:
                print("Key not found in S3: {}".format(s3_path))
                print(e)
            elif n_failures == 10:
                print("More than 10 keys not found")
                print(e)
                break
            n_failures += 1

    return documents

In [93]:
def select_relevant_docs(municipalities, all_docs, metadata, start_date=None, end_date=None):
    """Pick the already-loaded documents that belong to the given municipalities.

    Args:
        municipalities: collection (list/set/tuple) of city names to keep.
        all_docs: dict mapping ``local_path_txt`` to a loaded document dict.
        metadata: DataFrame with ``date``, ``city``, ``local_path_txt`` and
            ``url`` columns.
        start_date: optional inclusive lower bound on ``date``.
        end_date: optional inclusive upper bound on ``date``.

    Returns:
        list of dicts, one per metadata row that passes the filters and whose
        path is present in ``all_docs``. Each is a copy of the loaded document
        extended with ``'filename'`` and ``'url'``.
    """
    potential_documents = metadata
    if start_date is not None:
        potential_documents = potential_documents[potential_documents["date"] >= start_date]
    if end_date is not None:
        # Upper bound: keep documents dated on or before end_date.
        potential_documents = potential_documents[potential_documents["date"] <= end_date]

    # isin() gives a proper boolean mask even when no rows remain.
    potential_documents = potential_documents[
        potential_documents["city"].isin(list(municipalities))
    ]

    # Keep only the rows whose text was actually loaded.
    docs_to_return = []
    for filename, url in zip(potential_documents['local_path_txt'], potential_documents['url']):
        if filename in all_docs:
            docs_to_return.append({**all_docs[filename], 'filename': filename, 'url': url})
    return docs_to_return

In [94]:
def _build_section(section_sentences, doc):
    """Package a run of sentence records as one section dict for ``doc``."""
    section_text = ". ".join([s["sentence"].strip() for s in section_sentences])
    return {
        "sentences": section_sentences,
        "section_text": section_text,
        "filename": doc["filename"],
        "url": doc["url"]
    }


def segment_docs(relevant_docs, min_section_length=50, include_trailing=False):
    """Cut each document into consecutive sections of at least N tokens.

    Sentences are accumulated in order; as soon as the running token count
    (per ``tokenize``) reaches ``min_section_length`` the accumulated
    sentences are emitted as a section and accumulation restarts.

    Documents whose sentence count does not match the number of stored
    vectors for non-blank sentences are skipped without any message.

    Args:
        relevant_docs: iterable of dicts with keys ``"original_text"``,
            ``"sentences"``, ``"vectors"`` (itself a dict with ``"sentences"``
            and ``"vectors"``), ``"filename"`` and ``"url"``.
        min_section_length: minimum number of tokens per emitted section.
        include_trailing: when true, the sentences left over at the end of a
            document (fewer than ``min_section_length`` tokens) are emitted as
            a final short section. The default ``False`` keeps the historical
            behaviour of dropping that remainder.

    Returns:
        list of section dicts with keys ``"sentences"`` (list of per-sentence
        records holding ``"sentence"``, ``"page"``, ``"sentence_vectors"``,
        ``"sentence_tokens"``), ``"section_text"``, ``"filename"`` and ``"url"``.
    """
    sections = []
    for doc in relevant_docs:
        # Pages are separated by form feeds; record the 1-based page number
        # of every sentence, in document order.
        page_numbers = []
        for page_number, page in enumerate(doc["original_text"].split('\f'), start=1):
            page_numbers.extend([page_number] * len(sentence_split(page)))

        doc_sentences = doc["sentences"]
        stored_sentences = doc["vectors"]["sentences"]
        stored_vectors = doc["vectors"]["vectors"]
        # The stored vectors may include entries for blank sentences; drop
        # those so the vectors line up with doc_sentences.
        doc_vectors = [
            stored_vectors[i]
            for i, stored in enumerate(stored_sentences)
            if len(stored.strip()) > 0
        ]
        if len(doc_sentences) != len(doc_vectors):
            # Text and vectors are out of sync; the pairing would be wrong.
            continue

        section = []
        section_tokens = 0
        for i, sentence in enumerate(doc_sentences):
            sentence_tokens = tokenize(sentence)
            section.append({
                "sentence": sentence,
                "page": page_numbers[i],
                "sentence_vectors": doc_vectors[i],
                "sentence_tokens": sentence_tokens
            })
            section_tokens += len(sentence_tokens)
            if section_tokens >= min_section_length:
                sections.append(_build_section(section, doc))
                section = []
                section_tokens = 0

        if include_trailing and section:
            sections.append(_build_section(section, doc))
    return sections

In [135]:
def text_is_too_similar(a, b, max_distance=50):
    """Tell whether two texts are near-duplicates of each other.

    Args:
        a, b: the texts to compare.
        max_distance: texts count as too similar when the edit distance
            reported by ``editdistance.eval`` is strictly below this value.

    Returns:
        bool -- True when the texts are closer than ``max_distance`` edits.
    """
    return editdistance.eval(a, b) < max_distance

# Guard against near-duplicate excerpts among the top k (e.g. the same item
# appearing in both the minutes and the agenda).
def check_repeated_text(top_k, section_text):
    """Return True if ``section_text`` is too similar to any entry already in ``top_k``.

    ``top_k`` holds (score, section) pairs; only each section's
    ``"section_text"`` is compared.
    """
    return any(
        text_is_too_similar(chosen[1]["section_text"], section_text)
        for chosen in top_k
    )

def select_top_k(doc_sections, doc_sections_scores, k):
    """Choose up to ``k`` highest-scoring sections, skipping near-duplicates.

    Args:
        doc_sections: list of section dicts (each with ``"section_text"``).
        doc_sections_scores: scores aligned with ``doc_sections``.
        k: maximum number of sections to return; ``k <= 0`` yields ``[]``.

    Returns:
        list of ``(score, section)`` pairs in descending score order. Only
        sections with a strictly positive score are eligible, and a section
        is skipped when ``check_repeated_text`` reports it as too similar to
        one already chosen.
    """
    if k <= 0:
        return []

    sorted_sections = sorted(zip(doc_sections_scores, doc_sections), key=lambda pair: pair[0], reverse=True)
    top_k = []
    for candidate in sorted_sections:
        score, section = candidate[0], candidate[1]
        if not score > 0:
            continue
        if not check_repeated_text(top_k, section["section_text"]):
            top_k.append(candidate)
        if len(top_k) >= k:
            break
    return top_k

def update_with_top_k(results, top_k_sections, query):
    """Append one result record per selected section to ``results``.

    Each record is a new dict holding the section's fields merged with the
    query's fields (query values win on key clashes), plus ``"start_page"``
    (the page of the section's first sentence) and an emptied
    ``"sentences"`` list. The section dicts passed in are left untouched, so
    the segmented sections can safely be reused afterwards.

    Args:
        results: list to extend; it is modified in place and also returned.
        top_k_sections: iterable of ``(score, section)`` pairs.
        query: dict of query fields to attach to every record.

    Returns:
        The same ``results`` list.
    """
    for pair in top_k_sections:
        # Copy so that the caller's section dict is not modified.
        record = dict(pair[1])
        record.update(query)
        record["start_page"] = record["sentences"][0]["page"]
        record["sentences"] = []
        results.append(record)
    return results

In [115]:
def single_vector_per_doc(vectors):
    """Stack the last-layer token embeddings of several items into one array.

    ``vectors`` is a list of arrays laid out as
    (LAYERS(3), TOKENS(varies), DIMENSIONS(1024)). Index 2 along the first
    axis is taken from each and the pieces are concatenated along the token
    axis.

    Why index 2 -- per
    https://towardsdatascience.com/elmo-helps-to-further-improve-your-word-embeddings-c6ed2c9df95f
    ELMo has three embedding layers: layer zero is the character-based,
    context-independent layer, followed by two Bi-LSTM layers. The authors
    showed empirically that the first Bi-LSTM layer captures syntax better
    and the second captures semantics better.
    """
    top_layers = []
    for item in vectors:
        top_layers.append(item[2])
    return np.concatenate(top_layers, axis=0)

def score_doc_sections(doc_sections, orig_keywords, elmo):
    """Score every section by embedding similarity to the query keywords.

    A section scores 0 unless at least one keyword occurs in its text as a
    whole word/phrase (bounded by non-word characters or the text edges).
    Keywords are matched case-sensitively; an all-lowercase keyword is also
    matched against the lowercased text. For sections that pass, the score is
    the mean cosine similarity between all section token vectors and all
    keyword token vectors.

    Args:
        doc_sections: list of section dicts with ``'section_text'`` and
            ``"sentences"`` (records carrying ``"sentence_vectors"``).
        orig_keywords: list of keyword strings; multi-word phrases allowed.
            Blank entries are ignored.
        elmo: embedder exposing ``embed_sentence(list_of_tokens)``.

    Returns:
        list of scores aligned with ``doc_sections``. If no non-blank keyword
        is given, every score is 0 and the embedder is not called.
    """
    orig_keywords = [k.strip() for k in orig_keywords]
    orig_keywords = [k for k in orig_keywords if k]
    if not orig_keywords:
        return [0 for _ in doc_sections]

    # The embedder gets single words: phrases are broken on whitespace.
    # split() with no argument never yields empty tokens.
    keywords = [word for k in orig_keywords for word in k.split()]
    keyword_vectors = single_vector_per_doc([elmo.embed_sentence(keywords)])

    # Compile one whole-word pattern per keyword, once. re.escape keeps
    # keywords containing regex metacharacters (e.g. "C++") literal.
    keyword_patterns = [
        (re.compile(r"([^\w]|^)" + re.escape(k) + r"([^\w]|$)"), k.islower())
        for k in orig_keywords
    ]

    doc_sections_scores = []
    for section in doc_sections:
        section_text = section['section_text']
        lowered_text = section_text.lower()
        keyword_found = any(
            pattern.search(section_text) or (is_lower and pattern.search(lowered_text))
            for pattern, is_lower in keyword_patterns
        )
        if not keyword_found:
            doc_sections_scores.append(0)
            continue

        # Build the section's vectors only when they are actually needed.
        section_vectors = single_vector_per_doc(
            [sentence["sentence_vectors"] for sentence in section["sentences"]]
        )
        if section_vectors.shape[0] > 0:
            similarities = cosine_similarity(section_vectors, keyword_vectors)
            keyword_similarities = np.mean(similarities, axis=0)
            score = np.mean(keyword_similarities)
        else:
            score = 0
        doc_sections_scores.append(score)

    return doc_sections_scores

In [177]:
# Number of sections kept per query; recommender_stats reads this
# module-level name when it calls select_top_k.
k = 5
# Default query set, read as a module-level name by recommender_stats.
# Each query lists the keywords to look for and the municipalities whose
# documents are searched. Later cells reassign both `k` and `queries`.
queries = [
    {
        "keywords": ["housing", "affordable housing", "homelessness", "auxiliary dwelling unit", "ADU"],
        "municipalities": ["San Jose"]
    }
]

def recommender_stats(start_date, end_date, expected_filenames=None):
    """Run the recommender over one date window and summarise what it returned.

    Relies on the module-level names ``queries``, ``metadata``, ``k`` and
    ``elmo`` as they are set when the function is called. Only agendas dated
    within ``[start_date, end_date]`` are considered.

    Args:
        start_date: inclusive start of the window.
        end_date: inclusive end of the window.
        expected_filenames: optional list of document paths the recommender
            is hoped to surface; defaults to the three San Jose City Council
            agendas used as the reference set in this notebook.

    Returns:
        ``(n_unique_docs, n_recalled_docs, results)`` where ``results`` is
        the list of result records for all queries, ``n_unique_docs`` is the
        number of distinct filenames among them and ``n_recalled_docs`` is
        how many expected filenames appear among them. Both counts are
        ``None`` when ``queries`` is empty.

    Side effects:
        Prints each expected filename that was found.
    """
    if expected_filenames is None:
        expected_filenames = [
            "docs/san-jose/San-Jose_2019-11-05_City-Council_Agenda.txt",
            "docs/san-jose/San-Jose_2019-09-24_City-Council_Agenda.txt",
            "docs/san-jose/San-Jose_2019-10-01_City-Council_Agenda.txt"
        ]

    relevant_filenames = find_relevant_filenames(
        queries,
        metadata,
        start_date=start_date,
        end_date=end_date,
        agenda_only=True)
    all_docs = read_docs(relevant_filenames)

    results = []
    for query in queries:
        # all_docs is already restricted to the date window, so no dates are
        # passed here.
        relevant_docs = select_relevant_docs(query["municipalities"], all_docs, metadata)
        doc_sections = segment_docs(relevant_docs)
        doc_sections_scores = score_doc_sections(
            doc_sections,
            query["keywords"],
            elmo
        )
        top_k_sections = select_top_k(doc_sections, doc_sections_scores, k)
        results = update_with_top_k(results, top_k_sections, query)

    if not queries:
        return None, None, results

    # Summarise once, over the results of all queries together.
    documents = set(x["filename"] for x in results)
    n_unique_docs = len(documents)
    n_recalled_docs = 0
    for filename in expected_filenames:
        if filename in documents:
            print(filename)
            n_recalled_docs += 1

    return n_unique_docs, n_recalled_docs, results

In [149]:
# Single-window experiment setup: a two-week window starting 2019-09-02.
# Reassigns the module-level `k` and `queries` that recommender_stats reads.
k = 5
start_date = datetime.strptime("2019-09-02", '%Y-%m-%d')
end_date = start_date + timedelta(weeks=2)
queries = [
    {
        # Alternative keyword set kept for reference:
#         "keywords": ["housing", "affordable housing", "homelessness", "accessory dwelling unit", "ADU"],
        "keywords": ["affordable housing", "homelessness", "ADU"],
        "municipalities": ["San Jose"]
    }
]
# Agenda documents for the queried cities inside the window.
relevant_filenames = find_relevant_filenames(
    queries,
    metadata,
    start_date = start_date,
    end_date = end_date,
    agenda_only=True)
print("{} relevant documents found.".format(len(relevant_filenames)))
print("reading relevant documents")
# Fetches text and vectors for each file; entries that fail to load are omitted.
all_docs = read_docs(relevant_filenames)
print("{} documents read".format(len(all_docs)))

18 relevant documents found.
reading relevant documents
18 documents read


In [150]:
# Run every query against the documents loaded in `all_docs`, collecting the
# top-k sections per query into `results`.
# NOTE: the loop leaves `keywords`, `municipalities`, `doc_sections`, etc. as
# module-level variables holding the values from the last query.
results = []
for q, query in enumerate(queries):
    # q is zero-based, so the first query prints as "0 of N".
    print("running query {} of {}".format(q, len(queries)))
#     user_id = query["id"]
#     email_address = query["email_address"]
#     print("email address: {}".format(email_address))
    keywords = query["keywords"]
    print("keywords: {}".format(keywords))
    municipalities = query["municipalities"]
    print("municipalities: {}".format(municipalities))
    # No dates passed: all_docs only contains documents that were loaded.
    relevant_docs = select_relevant_docs(municipalities, all_docs, metadata)
    print("{} relevant documents identified for this query".format(len(relevant_docs)))
    print("segmenting documents")
    doc_sections = segment_docs(relevant_docs)
    print("{} document sections to choose from".format(len(doc_sections)))
    print("scoring documents")
    doc_sections_scores = score_doc_sections(
        doc_sections,
        keywords,
        elmo
    )
    top_k_sections = select_top_k(doc_sections, doc_sections_scores, k)
    results = update_with_top_k(results, top_k_sections, query)
    # Prints the full accumulated result list after each query.
    print(results)

running query 0 of 1
keywords: ['housing', 'affordable housing', 'homelessness', 'accessory dwelling unit', 'ADU']
municipalities: ['San Jose']
18 relevant documents identified for this query
segmenting documents
405 document sections to choose from
scoring documents
[{'sentences': [], 'section_text': 'housing opportunities. Neighborhood Services - Serve, foster, and strengthen community by providing access to lifelong. learning and opportunities to enjoy life. Transportation & Aviation Services - A safe and efficient transportation system that contributes to. the livability and economic health of the City; and provide for the air transportation needs of the. community and the region at levels that is acceptable to the community', 'filename': 'docs/san-jose/San-Jose_2019-09-10_City-Council_Agenda.txt', 'url': 'https://sanjose.legistar.com/View.ashx?M=A&ID=717966&GUID=D9B9B02A-F925-487C-9CD4-C874ED77625A', 'keywords': ['housing', 'affordable housing', 'homelessness', 'accessory dwelling

In [151]:
# Reference set: agendas the recommender is hoped to surface.
relevant_filenames_for_emily = [
    "docs/san-jose/San-Jose_2019-11-05_City-Council_Agenda.txt",
    "docs/san-jose/San-Jose_2019-09-24_City-Council_Agenda.txt",
    "docs/san-jose/San-Jose_2019-10-01_City-Council_Agenda.txt"
]
# Filenames of the documents behind the current `results`.
documents = [x["filename"] for x in results]
# Keep the distinct-document count in a name; as a bare mid-cell expression
# its value was computed and thrown away.
n_unique_docs = len(set(documents))
# Count (and print) the reference agendas that appear among the results.
docs_found = 0
for filename in relevant_filenames_for_emily:
    if filename in documents:
        print(filename)
        docs_found += 1
docs_found

0

In [175]:
# Sliding-window evaluation: run the recommender over two-week windows whose
# start dates alternate between a Monday and the following Thursday, from
# 2019-09-02 until the start date passes 2019-10-17. One LaTeX table row is
# printed per window.
start_date = datetime.strptime("2019-09-02", '%Y-%m-%d')
# Tracks which weekday the current start_date falls on (drives the stepping below).
dow = "Monday"
last_start_date = datetime.strptime("2019-10-17", '%Y-%m-%d')

emily_story_dates = [datetime.strptime(x, '%Y-%m-%d') for x in [
    "2019-11-05",
    "2019-09-24",
    "2019-10-01"
]]

relevant_filenames_for_emily = [
    "docs/san-jose/San-Jose_2019-11-05_City-Council_Agenda.txt",
    "docs/san-jose/San-Jose_2019-09-24_City-Council_Agenda.txt",
    "docs/san-jose/San-Jose_2019-10-01_City-Council_Agenda.txt"
]

# Reassigns the module-level `queries` read by recommender_stats.
# Earlier keyword variants are kept below for reference.
queries = [
    {
#         "keywords": ["housing", "affordable housing", "homelessness", "accessory dwelling unit", "ADU"],
#         "keywords": ["housing", "affordable housing", "homelessness", "ADU"],
#         "keywords": ["housing", "affordable housing", "homeless", "homelessness" "accessory dwelling unit", "ADU"],
        "keywords": ["housing", "affordable housing", "homelessness", "ADU"],
        "municipalities": ["San Jose"]
    }
]

all_results = []
while start_date <= last_start_date:
    end_date = start_date + timedelta(weeks=2)
    
    # Story dates inside this window; only used by the commented-out
    # wider table format below.
    story_dates_in_this_time_window = [x for x in emily_story_dates if x >= start_date and x <= end_date]
    num_articles = len(story_dates_in_this_time_window)
    
    # San Jose agendas dated within the window, for the table's agenda and
    # committee counts.
    subset_metadata = metadata
    subset_metadata = subset_metadata[subset_metadata["date"] >= start_date]
    subset_metadata = subset_metadata[subset_metadata["date"] <= end_date]
    subset_metadata = subset_metadata[subset_metadata["doc_type"] == "Agenda"]
    subset_metadata = subset_metadata[subset_metadata["city"] == "San Jose"]
    committees = set(subset_metadata["committee"])
    
    
    # recommender_stats also prints any reference agenda it finds, so those
    # filenames appear interleaved with the table rows in the output.
    total, found, results = recommender_stats(start_date, end_date)
    all_results.append({
        "results": results,
        "start_date": start_date,
        "end_date": end_date
    })
#     print(committees)
    # Row: window start & window end & number of agendas & number of committees & unique documents returned.
    print(
        "{} & {} & {} & {} & {} \\\\".format(
#         "{} & {} & {} & {} & {} & {} & {} \\\\".format(
#         "start: {}, end: {}, num_agendas: {}, num_articles: {}".format(
            start_date.strftime("%m/%d"),
            end_date.strftime("%m/%d"),
            subset_metadata.shape[0],
            len(committees),
#             num_articles,
#             found,
            total
        )
    )
    
    # Step Monday -> Thursday (+3 days), then Thursday -> next Monday
    # (-3 days + 1 week).
    if dow == "Monday":
        start_date = start_date + timedelta(days=3)
        dow = "Thursday"
    else:
        start_date = start_date - timedelta(days=3) + timedelta(weeks=1)
        dow = "Monday"

09/02 & 09/16 & 18 & 15 & 2 \\
09/05 & 09/19 & 22 & 16 & 2 \\
09/09 & 09/23 & 20 & 15 & 2 \\
docs/san-jose/San-Jose_2019-09-24_City-Council_Agenda.txt
09/12 & 09/26 & 18 & 14 & 2 \\
docs/san-jose/San-Jose_2019-09-24_City-Council_Agenda.txt
09/16 & 09/30 & 17 & 13 & 2 \\
docs/san-jose/San-Jose_2019-09-24_City-Council_Agenda.txt
09/19 & 10/03 & 22 & 17 & 2 \\
docs/san-jose/San-Jose_2019-09-24_City-Council_Agenda.txt
09/23 & 10/07 & 24 & 19 & 2 \\
09/26 & 10/10 & 25 & 18 & 3 \\
09/30 & 10/14 & 25 & 17 & 3 \\
10/03 & 10/17 & 26 & 17 & 5 \\
10/07 & 10/21 & 23 & 14 & 5 \\
10/10 & 10/24 & 20 & 13 & 3 \\
10/14 & 10/28 & 19 & 13 & 2 \\
10/17 & 10/31 & 18 & 11 & 3 \\


In [176]:
# Inspect the results of the sixth and seventh windows (indices 5 and 6).
all_results[5:7]

[{'results': [{'sentences': [],
    'section_text': 'housing opportunities. Neighborhood Services - Serve, foster, and strengthen community by providing access to lifelong. learning and opportunities to enjoy life. Transportation & Aviation Services - A safe and efficient transportation system that contributes to. the livability and economic health of the City; and provide for the air transportation needs of the. community and the region at levels that is acceptable to the community',
    'filename': 'docs/san-jose/San-Jose_2019-10-01_City-Council_Agenda.txt',
    'url': 'https://sanjose.legistar.com/View.ashx?M=A&ID=709097&GUID=45DA4900-8361-4625-9640-0A7C36E69022',
    'keywords': ['housing', 'affordable housing', 'homelessness', 'ADU'],
    'municipalities': ['San Jose'],
    'start_page': 2},
   {'sentences': [],
    'section_text': 'and Changes to Existing Loan and Grant Terms for the Markham Plaza I. Project. (a) Adopt a resolution:. (1) Authorizing the issuance of (a) tax-exempt

In [173]:
# `keywords` is the module-level name assigned inside the query-loop cell
# (keywords = query["keywords"]); its value depends on which cells ran last.
keywords

['housing',
 'affordable housing',
 'homelessness',
 'accessory dwelling unit',
 'ADU']

In [174]:
# One-off run of the recommender for the two-week window starting
# 2019-09-28 (i.e. ending 2019-10-12), printing a single LaTeX table row.
start_date = datetime.strptime("2019-09-28", '%Y-%m-%d')
end_date = start_date + timedelta(weeks=2)

emily_story_dates = [datetime.strptime(x, '%Y-%m-%d') for x in [
    "2019-11-05",
    "2019-09-24",
    "2019-10-01"
]]

relevant_filenames_for_emily = [
    "docs/san-jose/San-Jose_2019-11-05_City-Council_Agenda.txt",
    "docs/san-jose/San-Jose_2019-09-24_City-Council_Agenda.txt",
    "docs/san-jose/San-Jose_2019-10-01_City-Council_Agenda.txt"
]

# Reassigns the module-level `queries` read by recommender_stats.
# Keyword variants tried earlier are kept for reference:
#         "keywords": ["housing", "affordable housing", "homelessness", "accessory dwelling unit", "ADU"],
#         "keywords": ["housing", "affordable housing", "homelessness", "affordable dwelling unit", "ADU"],
#         "keywords": ["affordable housing", "homelessness", "ADU"],
queries = [
    {
        "keywords": ["housing", "affordable housing", "homelessness", "ADU"],
        "municipalities": ["San Jose"]
    }
]

# Story dates inside the window; only used by the commented-out wider
# table format below.
story_dates_in_this_time_window = [x for x in emily_story_dates if x >= start_date and x <= end_date]
num_articles = len(story_dates_in_this_time_window)

# San Jose agendas dated within the window, for the agenda/committee counts.
subset_metadata = metadata
subset_metadata = subset_metadata[subset_metadata["date"] >= start_date]
subset_metadata = subset_metadata[subset_metadata["date"] <= end_date]
subset_metadata = subset_metadata[subset_metadata["doc_type"] == "Agenda"]
subset_metadata = subset_metadata[subset_metadata["city"] == "San Jose"]
committees = set(subset_metadata["committee"])


total, found, results = recommender_stats(start_date, end_date)
# Row: window start & window end & number of agendas & number of committees & unique documents returned.
print(
    "{} & {} & {} & {} & {} \\\\".format(
#         "{} & {} & {} & {} & {} & {} & {} \\\\".format(
#         "start: {}, end: {}, num_agendas: {}, num_articles: {}".format(
        start_date.strftime("%m/%d"),
        end_date.strftime("%m/%d"),
        subset_metadata.shape[0],
        len(committees),
#             num_articles,
#             found,
        total
    )
)

09/28 & 10/12 & 25 & 17 & 3 \\
