In [1]:
# Import statements

import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
import gensim.downloader as gensim
from sklearn.metrics.pairwise import cosine_similarity
import json

from datetime import *
import os
from autolocal.parsers.nlp import Tokenizer
from gensim.parsing.preprocessing import *

from autolocal.databases import S3DocumentManager
import boto3
from decimal import *

In [2]:
# set up word vectors
# (this takes a loooong time)
def setup_word_vectors():
  print("loading language model")
  # load language model (this takes a few minutes)
  model = gensim.load('word2vec-google-news-300')
  print("model loaded")

  vectors = model.wv
  del model
  vectors.init_sims(True) # normalize the vectors (!), so we can use the dot product as similarity measure

  print('embeddings loaded ')
  print('loading docs ... ')
  return vectors

In [3]:
def read_metadata():
    table = boto3.resource('dynamodb', region_name='us-west-1').Table('autolocal-documents')
    s3_client = boto3.client('s3')
    metadata = pd.DataFrame(table.scan()["Items"])
    metadata["date"] = [datetime.strptime(d, '%Y-%m-%d') for d in metadata["date"]]
    return metadata

In [4]:
# TODO: is lowercasing necessary?
preprocess_filters = [
    lambda x: x.lower(),
    strip_punctuation,
    strip_numeric,
    strip_non_alphanum,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short
        ]

class DocTextReader():
    def __init__(self, log_every=100):
        self.log_every = log_every
        s3 = boto3.resource('s3', region_name='us-west-1')
        self.bucket = s3.Bucket('autolocal-documents')

    def read_document_string(self, s3_path):
        return self.bucket.Object(s3_path).get()['Body'].read()

    def read_docs(self, s3_paths):
        # read all documents that we know about
        # tokenize each document
        # return list of documents

        documents = {}
        n_docs_total = len(s3_paths)

        i = 0
        n_docs_read = 0
        for s3_path in s3_paths:
            try:
                doc_string = self.read_document_string(s3_path)
                doc_tokens = preprocess_string(doc_string, filters=preprocess_filters)
                documents[s3_path] = {
                    "original_text": doc_string,
                    "tokens": doc_tokens
                }
            except Exception as e:
                if i < 10:
                    print("Key not found: {}".format(s3_path))
                elif i == 10:
                    print("More than 10 keys not found")
                    print(e)
                    break
                i+=1
            if n_docs_read % self.log_every == 0:
                print("{} of {} documents read".format(n_docs_read, n_docs_total))
            n_docs_read+=1

        return documents

In [5]:
# r = DocTextReader()
# all_docs = r.read_docs(read_metadata()["local_path_txt"])

In [6]:
# idf = calculate_idf(all_docs)
# len(idf.keys())

In [7]:
def read_queries(query_source):
    if query_source == "actual":
        table = boto3.resource('dynamodb', region_name='us-west-2').Table('autoLocalNews')
    elif query_source == "quick":
        table = boto3.resource('dynamodb', region_name='us-west-1').Table('quick_queries')
    else:
        raise Exception
    queries = table.scan()["Items"]
    return queries

In [8]:
def read_history():
    try:
        table = boto3.resource('dynamodb', region_name='us-west-1').Table('history')
        history = table.scan()["Items"]
    except:
        history = []
    return history

In [9]:
def read_cached_idf():
    s3 = boto3.resource('s3', region_name='us-west-1')
    bucket = s3.Bucket('autolocal-documents')
    idf = json.load(bucket.Object('idf.json').get()['Body'])
    return idf

In [10]:
def cache_idf(idf):
    s3 = boto3.resource('s3')
    object = s3.Object('autolocal-documents', 'idf.json')
    object.put(Body=json.dumps(idf))

In [11]:
def calculate_idf(all_docs): 
    # for each word, how many unique docs does it show up in?
    from collections import Counter
    
    doc_freq = {}
    for document in all_docs: 
        tokens = all_docs[document]["tokens"]
        for token in tokens:
            if token in doc_freq:
                doc_freq[token] += 1
            else:
                doc_freq[token] = 1
    
    inverse_doc_freq = {}
    for word in doc_freq:
        inverse_doc_freq[word] = 1./doc_freq[word]
    
    return inverse_doc_freq

In [26]:
time_windows = {
    'upcoming_only': datetime.now() + timedelta(days=0.5),
    'this_week': datetime.now() - timedelta(weeks=1),
    'this_year': datetime.now() - timedelta(days=365),
    'past_six_months':datetime.now() - timedelta(days=183),
    'all': None
}

def find_relevant_filenames(queries, metadata): 
    
    # filter metadata to only those files that match the query municipality and time_window
    municipalities_by_time_window = {}
    for query in queries:
        time_window = query['Time Window']
        if time_window in municipalities_by_time_window:
            municipalities_by_time_window[time_window].update(query['Municipalities'])
        else:
            municipalities_by_time_window[time_window] = set(query['Municipalities'])
            
    relevant_filenames = set()
    for time_window in municipalities_by_time_window:
        starting_date = time_windows[time_window]
        potential_documents = metadata
        if starting_date:
            potential_documents = potential_documents[potential_documents["date"] >= starting_date]
        cities = municipalities_by_time_window[time_window]
        potential_documents = potential_documents[[(c in cities) for c in potential_documents["city"]]]
        relevant_filenames.update(potential_documents['local_path_txt'])
    return relevant_filenames

In [13]:
def select_relevant_docs(municipalities, time_window, all_docs, metadata):
    # filter metadata to only those files that match the query municipality and time_window
    starting_date = time_windows[time_window]
    potential_documents = metadata
    if starting_date:
        potential_documents = potential_documents[potential_documents["date"] >= starting_date]
    potential_documents = potential_documents[[(c in municipalities) for c in potential_documents["city"]]]
    # filter all docs to only filenames in subset of metadata
    return [{**all_docs[f], 'filename':f} for f in potential_documents['local_path_txt']]

In [14]:
# TODO: Play with section length
# TODO: smart sectioning that's sensitive to multiple line breaks and other section break signals
# TODO: extract sections that overlap with each other
def segment_docs(relevant_docs):
    doc_sections = []
    approx_section_length = 100 # tokens
    min_section_length = 5
    
    for doc in relevant_docs:
        doc_tokens = doc["tokens"]
        original_text = doc["original_text"].decode('utf-8')
        filename = doc["filename"]
        
        doc_section_lines = []
        doc_section_tokens = []
        starting_page = 0
        starting_line = 0
        pages = original_text.split('\f')
        for p, page in enumerate(pages):
            lines = page.split('\n')
            for lnum, line in enumerate(lines):
                line_tokens = preprocess_string(line, filters=preprocess_filters)
                doc_section_tokens += line_tokens
                doc_section_lines.append(line)
                if len(doc_section_tokens) >= approx_section_length:
                    doc_sections.append({
                        **doc,
                        'starting_page': starting_page,
                        'starting_line': starting_line,
                        'ending_page': p,
                        'ending_line': lnum,
                        'section_text': '\n'.join(doc_section_lines),
                        'section_tokens': doc_section_tokens
                    })
                    doc_section_lines = []
                    doc_section_tokens = []
                    # have we reached the last line of this page?
                    if lnum == (len(lines)-1):
                        # next section starts at top of next page
                        starting_page = p+1
                        starting_line = 0
                    else:
                        # next section starts on next line of this page
                        starting_page = p
                        starting_line = lnum+1
        # end of the document
        if len(doc_section_tokens) >= min_section_length:
            doc_sections.append({
                **doc,
                'starting_page': starting_page,
                'ending_page': p,
                'section_text': '\n'.join(doc_section_lines),
                'section_tokens': doc_section_tokens
            })
            
    return doc_sections

In [15]:
def score_doc_sections(doc_sections, keywords, idf):
    # vectorize etc.
    # only consider keywords that have idf weights
    keywords = [keyword for keyword in keywords if (keyword in idf and keyword in vectors)]
    keyword_vectors = np.array([vectors[keyword] for keyword in keywords])
    keyword_weights = np.array([idf[keyword] for keyword in keywords])
    doc_sections_scores = []
    for s, section in enumerate(doc_sections):
        score = None
        section_tokens = section["section_tokens"]
        # TODO: Zipf to figure out what the cutoff should be for normal communication
        # If the number of unique tokens in the section is too small, it's probably not an interesting section
        if len(set(section_tokens))<20:
            score = 0
        else:
            section_vectors = np.array([vectors[t] for t in section_tokens if (t in idf and t in vectors)])
            print(section_vectors)
            break
            if section_vectors.shape[0]>0:
    #             section_weights = np.array([inverse_doc_props[t] for t in section_tokens if t in inverse_doc_props])
                similarities = cosine_similarity(section_vectors, keyword_vectors)
    #             similarities = similarities * section_weights
    #             similarities = similarities*(similarities>0.2)
                keyword_similarities = np.mean(similarities, axis=0)
    #             keyword_similarities = np.average(similarities, axis=0, weights=section_weights)
                score = np.sum(keyword_similarities*keyword_weights)
        doc_sections_scores.append(score)

    return doc_sections_scores

In [None]:
def select_top_k(doc_sections, doc_scores, k, history): 
    pass

In [16]:
# example_results = [
#   {
#     "user_id": "emily",
#    'special',
#    'meeting',
#    'subject',
#    'approve',
#    'august',
#     "document_sections": [
#       {
#         "section_id": "000",
#         "doc_url": "https://sanjose.legistar.com/View.ashx?M=A&ID=709096&GUID=CF2165D1-28CB-4670-AB54-F35B649DE71D",
#         "doc_name": "Agenda 2019-09-24",
#         "user_id": "emily",
#         "page_number": "11",
#         "keywords": ["High-Rise", "Incentive", "Affordable", "Housing", "exemption", "tax", "reduction"],
#         "text": "4.3 19-821 Downtown High-Rise Incentive Program.\n\nRecommendation: Accept the report on the Downtown High-Rise Feasibility Assessment and direct staff to return to Council with the appropriate ordinance and resolution to enact the following:\n\n(a) Extending the certificate of occupancy deadline for the Affordable Housing Impact Fee exemption to December 31, 2023.\n(b) Amending Title 4.46 and align the construction tax reduction with the certificate of occupancy deadline for the Affordable Housing Impact Fee exemption, and removing the planning and build permit requirements.\nCEQA: Not a Project, File No. PP17-009, Staff Reports, Assessments, Annual Reports, and Informational Memos that involve no approvals of any City action. (Economic Development)"
#       },
#       {
#         "section_id": "001",
#         "doc_url": "https://agendaonline.net/public/Meeting.aspx?AgencyID=123&MeetingID=20136&AgencyTypeID=1&IsArchived=True",
#         "doc_name": "School Board Agenda, 2019-9-23",
#         "user_id": "emily",
#         "page_number": "NA",
#         "keywords": ["employee", "housing", "affordable"],
#         "text": "A.3. Master Plan for San Jose Unified Properties - Step 3 (ACTION)\n\nRECOMMENDATION: That San Jose Unified secure pre-development services to complete a pre-development analysis on the potential for employee housing opportunities at the following four locations: (1) 855 Lenzen Avenue, San Jose Unified District Offices, (2) 1088 Broadway, River Glen K-8 School, (3) 1325 Bouret Drive, Second Start-Pine Hill Non-Public School, and (4) 760 Hillsdale Avenue and 705-745 Capital Expressway, Metropolitan Education District."
#       },
#       {
#         "section_id": "002",
#         "doc_url": "https://agendaonline.net/public/Meeting.aspx?AgencyID=123&MeetingID=20136&AgencyTypeID=1&IsArchived=True",
#         "doc_name": "School Board Agenda, 2019-9-23",
#         "user_id": "emily",
#         "page_number": "NA",
#         "keywords": ["employee", "housing", "affordable"],
#         "text": "A.3. Master Plan for San Jose Unified Properties - Step 3 (ACTION)\n\nRECOMMENDATION: That San Jose Unified secure pre-development services to complete a pre-development analysis on the potential for employee housing opportunities at the following four locations: (1) 855 Lenzen Avenue, San Jose Unified District Offices, (2) 1088 Broadway, River Glen K-8 School, (3) 1325 Bouret Drive, Second Start-Pine Hill Non-Public School, and (4) 760 Hillsdale Avenue and 705-745 Capital Expressway, Metropolitan Education District."
#       }
#     ]
#   }
# ]
# example_top_k_sections = []
# example_query = {
#     "original_text": "I am a document document",
#     "tokens": ["I", "am"],
#     "filename": "filename1",
#     "section_id": "section1",
# }

In [17]:
def send_emails(results): 
    pass

In [18]:
def update_history(history, top_k_sections, query): 
    pass

In [19]:
def write_history(history): 
    pass

In [21]:
def update_results(results, top_k_sections, query): 
    pass

In [22]:
vectors = setup_word_vectors()

loading language model
model loaded


  if __name__ == '__main__':


embeddings loaded 
loading docs ... 


In [24]:
# vectors = setup_word_vectors()
def run_queries(use_cached_idf = False, query_source="actual", k=3): 
    print("reading queries")
    queries = read_queries(query_source)
    print("reading metadata")
    metadata = read_metadata()
    print("setting up reader")
    doc_text_reader = DocTextReader(log_every=100)
    if use_cached_idf:
        # used cached idf and only read relevant documents
        print("loading cached idf")
        idf = read_cached_idf()
        print("finding relevant filenames")
        relevant_filenames = find_relevant_filenames(queries, metadata)
        # (not actually *all*, but all the ones we care about for queries)
        print("reading relevant documents")
        all_docs = doc_text_reader.read_docs(relevant_filenames)
    else:
        # read all documents and calculate inverse document frequency
        all_filenames = metadata["local_path_txt"]
        print("reading all documents")
        all_docs = doc_text_reader.read_docs(all_filenames)
        print("calculating idf")
        idf = calculate_idf(all_docs)
        cache_idf(idf)
    print("reading history")
    history = read_history()
    
    results = []
    
    for q, query in enumerate(queries): 
        print("running query {} of {}".format(q, len(queries)))
        user_id = query["id"]
        keywords = query["Keywords"]
        time_window = query["Time Window"]
        municipalities = query["Municipalities"]
        relevant_docs = select_relevant_docs(municipalities, time_window, all_docs, metadata)
        print("segmenting documents")
        doc_sections = segment_docs(relevant_docs)
        print("scoring documents")
        doc_sections_scores = score_doc_sections(doc_sections, keywords, idf)
        top_k_sections = select_top_k(doc_sections, doc_sections_scores, k, history)
        results = update_results(results, top_k_sections, query)
        history = update_history(history, top_k_sections, query)
        
    print("sending emails")
    send_emails(results)
    write_history(history)
    print("finished")


run_queries(use_cached_idf=True, query_source="quick")

reading queries
reading metadata
setting up reader
loading cached idf
finding relevant filenames
reading relevant documents
0 of 34 documents read
reading history
running query 0 of 2
segmenting documents
scoring documents
[[ 0.05176774  0.07251675 -0.00623518 ... -0.08048102  0.04841436
   0.07084007]
 [-0.01350302 -0.10802414  0.01358486 ...  0.06088634  0.10082254
   0.06874264]
 [-0.00111791  0.0292689  -0.00581955 ... -0.0218233  -0.00421489
  -0.00382978]
 ...
 [-0.06321549 -0.09482325  0.07480501 ... -0.04179247  0.08147775
  -0.05127479]
 [ 0.00783078  0.01052135 -0.04465551 ... -0.02971679 -0.06393126
  -0.07581798]
 [-0.03987353  0.05729975 -0.09865008 ... -0.04784824 -0.00271362
   0.06468374]]
running query 1 of 2
segmenting documents
scoring documents
[[ 0.00573791  0.0083151   0.0242159  ... -0.00680768  0.0700219
  -0.02159008]
 [ 0.08574822  0.05516469  0.01243349 ... -0.06173872  0.07288598
  -0.02944022]
 [-0.06255211 -0.0595556   0.04738228 ... -0.07453814 -0.0295905