In [1]:
# Import statements

import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import json

from datetime import *
import os
from autolocal.parsers.nlp import Tokenizer
from gensim.parsing.preprocessing import *

from autolocal.databases import S3DocumentManager
import boto3
from decimal import *

In [69]:
# set up word vectors
# (this takes a loooong time)
def setup_word_vectors():
    """Download/load the pretrained Google News word2vec embeddings.

    Loads the 'word2vec-google-news-300' model through the gensim downloader
    (imported in this notebook as ``api``) and L2-normalises the vectors in
    place so that a plain dot product can be used as a similarity measure.

    Returns:
        The normalised keyed-vectors object.
    """
    print("loading language model")
    # load language model (this takes a few minutes)
    # NOTE: the notebook imports `gensim.downloader as api`; the bare name
    # `gensim` is never bound, so the loader must be reached through `api`.
    model = api.load('word2vec-google-news-300')
    print("model loaded")

    # The downloader may hand back the keyed vectors directly rather than a
    # full model wrapping them in `.wv`; accept either shape.
    vectors = getattr(model, 'wv', model)
    del model
    vectors.init_sims(replace=True)  # normalize the vectors (!), so we can use the dot product as similarity measure

    print('embeddings loaded ')
    print('loading docs ... ')
    return vectors

In [18]:
def read_metadata():
    """Load every row of the 'autolocal-documents' DynamoDB table as a DataFrame.

    A single DynamoDB ``scan`` call returns at most one page of results, so the
    scan is repeated with ``ExclusiveStartKey`` until no ``LastEvaluatedKey`` is
    returned; otherwise large tables would be silently truncated.

    Returns:
        pandas.DataFrame with one row per item. The "date" column is parsed
        from '%Y-%m-%d' strings into datetime values.
    """
    table = boto3.resource('dynamodb', region_name='us-west-1').Table('autolocal-documents')

    items = []
    scan_kwargs = {}
    while True:
        page = table.scan(**scan_kwargs)
        items.extend(page["Items"])
        last_key = page.get("LastEvaluatedKey")
        if last_key is None:
            break
        # Resume the scan where the previous page stopped.
        scan_kwargs["ExclusiveStartKey"] = last_key

    metadata = pd.DataFrame(items)
    metadata["date"] = [datetime.strptime(d, '%Y-%m-%d') for d in metadata["date"]]
    return metadata

In [31]:
class DocTextReader():
    """Reads document text from the 'autolocal-documents' S3 bucket and tokenizes it.

    Attributes:
        log_every: print a progress line every `log_every` documents attempted
            (a falsy value disables progress output).
        bucket: boto3 S3 Bucket resource the documents are fetched from.
        preprocess_filters: gensim preprocessing filters applied, in order,
            to each document before it is split into tokens.
    """

    # After this many failed reads the batch is abandoned.
    MAX_FAILURES = 10

    def __init__(self, log_every=100):
        self.log_every = log_every
        s3 = boto3.resource('s3', region_name='us-west-1')
        self.bucket = s3.Bucket('autolocal-documents')

        # TODO: is lowercasing necessary?
        # (strip_numeric used to be listed twice; the second pass was a no-op.)
        self.preprocess_filters = [
            lambda x: x.lower(),
            strip_punctuation,
            strip_numeric,
            strip_non_alphanum,
            strip_multiple_whitespaces,
            remove_stopwords,
            strip_short
        ]

    def read_document_string(self, s3_path):
        """Return the raw body of the S3 object stored under `s3_path`."""
        return self.bucket.Object(s3_path).get()['Body'].read()

    def read_docs(self, s3_paths):
        """Fetch and tokenize each document in `s3_paths`.

        Documents that cannot be read or tokenized are skipped and the
        underlying error is printed. Once more than MAX_FAILURES documents
        have failed, reading stops and whatever was collected is returned.

        Args:
            s3_paths: sized iterable of S3 object keys.

        Returns:
            dict mapping each successfully read key to
            {"original_text": <raw body>, "tokens": <list of tokens>}.
        """
        documents = {}
        n_docs_total = len(s3_paths)

        n_failures = 0
        n_docs_read = 0  # counts successful reads only
        for n_attempted, s3_path in enumerate(s3_paths):
            if self.log_every and n_attempted % self.log_every == 0:
                print("{} of {} documents read".format(n_docs_read, n_docs_total))
            try:
                doc_string = self.read_document_string(s3_path)
                doc_tokens = preprocess_string(doc_string, filters=self.preprocess_filters)
            except Exception as e:
                # Deliberately best-effort: one bad document must not abort the
                # batch, but the real error is reported (it is not always a
                # missing key - it may be a decode or permission problem).
                if n_failures >= self.MAX_FAILURES:
                    print("More than 10 keys not found")
                    print(e)
                    break
                print("Could not read {}: {!r}".format(s3_path, e))
                n_failures += 1
                continue
            documents[s3_path] = {
                "original_text": doc_string,
                "tokens": doc_tokens
            }
            n_docs_read += 1

        return documents

In [41]:
def read_queries(query_source):
    """Load all saved queries from DynamoDB.

    Args:
        query_source: "actual" reads the 'autoLocalNews' table (us-west-2);
            "quick" reads the 'quick_queries' table (us-west-1).

    Returns:
        list of query items as returned by the table scan (all pages).

    Raises:
        ValueError: if `query_source` is not one of the known sources.
    """
    sources = {
        "actual": ('us-west-2', 'autoLocalNews'),
        "quick": ('us-west-1', 'quick_queries'),
    }
    if query_source not in sources:
        raise ValueError(
            "unknown query_source {!r}; expected one of {}".format(
                query_source, sorted(sources)))
    region_name, table_name = sources[query_source]
    table = boto3.resource('dynamodb', region_name=region_name).Table(table_name)

    # A single scan returns at most one page; follow LastEvaluatedKey to the end.
    queries = []
    scan_kwargs = {}
    while True:
        page = table.scan(**scan_kwargs)
        queries.extend(page["Items"])
        last_key = page.get("LastEvaluatedKey")
        if last_key is None:
            break
        scan_kwargs["ExclusiveStartKey"] = last_key
    return queries

In [5]:
def read_history():
    """Load all items from the 'history' DynamoDB table (us-west-1).

    Reading history is best-effort: if the table cannot be read, the error is
    printed and an empty history is returned.

    Returns:
        list of history items (all scan pages), or [] on failure.
    """
    try:
        table = boto3.resource('dynamodb', region_name='us-west-1').Table('history')
        history = []
        scan_kwargs = {}
        while True:
            page = table.scan(**scan_kwargs)
            history.extend(page["Items"])
            last_key = page.get("LastEvaluatedKey")
            if last_key is None:
                break
            # Resume the scan where the previous page stopped.
            scan_kwargs["ExclusiveStartKey"] = last_key
    except Exception as e:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts.
        print("could not read history, starting from empty: {!r}".format(e))
        history = []
    return history

In [6]:
def read_cached_idf():
    """Load the cached inverse-document-frequency table from DynamoDB.

    Reads every item of the 'idf' table (us-west-1); each item is expected to
    carry a 'word' and an 'idf' attribute. If the table cannot be read, the
    error is printed and an empty mapping is returned (previously the failure
    path left `None` to be iterated, which raised TypeError).

    Returns:
        dict mapping word -> idf as a float; {} when the cache is unavailable.
    """
    try:
        table = boto3.resource('dynamodb', region_name='us-west-1').Table('idf')
        cached_idf = []
        scan_kwargs = {}
        while True:
            page = table.scan(**scan_kwargs)
            cached_idf.extend(page["Items"])
            last_key = page.get("LastEvaluatedKey")
            if last_key is None:
                break
            # Resume the scan where the previous page stopped.
            scan_kwargs["ExclusiveStartKey"] = last_key
    except Exception as e:
        print("could not read cached idf: {!r}".format(e))
        cached_idf = []
    return {entry['word']: float(entry['idf']) for entry in cached_idf}

In [7]:
def cache_idf(idf):
    """Write the word -> idf mapping to the 'idf' DynamoDB table (us-west-1).

    The table is created (hash key 'word', string) if it does not exist yet.
    Obtaining a Table resource is lazy and never fails by itself, so existence
    is checked explicitly with `load()` before deciding whether to create.

    Args:
        idf: dict mapping word (str) -> idf value (number).
    """
    dynamodb = boto3.resource('dynamodb', region_name='us-west-1')
    table = dynamodb.Table('idf')
    try:
        table.load()
    except dynamodb.meta.client.exceptions.ResourceNotFoundException:
        table_args = {
            'TableName': 'idf',
            'KeySchema': [
                {
                    'AttributeName': 'word',
                    'KeyType': 'HASH'
                }
            ],
            'AttributeDefinitions': [
                {
                    'AttributeName': 'word',
                    'AttributeType': 'S'
                }
            ],
            'ProvisionedThroughput': {
                'ReadCapacityUnits': 5,
                'WriteCapacityUnits': 5
            }
        }

        table = dynamodb.create_table(**table_args)
        # Table creation is asynchronous; writes fail until it is active.
        table.wait_until_exists()

    with table.batch_writer() as batch:
        for word, value in idf.items():
            # Round-trip through JSON so floats are re-parsed as Decimal
            # before being handed to DynamoDB.
            item = json.loads(json.dumps({'word': word, 'idf': value}), parse_float=Decimal)
            batch.put_item(Item=item)

In [8]:
def calculate_idf(all_docs):
    """Compute an inverse document frequency for every token in the corpus.

    For each word, counts how many distinct documents it appears in (repeats
    within one document count once) and returns the reciprocal of that count.

    Args:
        all_docs: dict mapping a document key to a dict with a "tokens" list.

    Returns:
        dict mapping word -> 1 / (number of documents containing the word).
    """
    from collections import Counter

    doc_freq = Counter()
    for document in all_docs.values():
        # set(): a word repeated inside one document is still one document.
        doc_freq.update(set(document["tokens"]))

    return {word: 1. / count for word, count in doc_freq.items()}

In [10]:
# Earliest document date admitted by each named time window (None = no lower
# bound). NOTE: the cutoffs are computed once, when this cell is executed.
time_windows = {
    'this_week': datetime.now() - timedelta(weeks=1),
    'this_year': datetime.now() - timedelta(days=365),
    'past_six_months':datetime.now() - timedelta(days=183),
    'all': None
}

def find_relevant_filenames(queries, metadata):
    """Collect the text-file paths of every document at least one query could match.

    Queries are grouped by their 'Time Window'; for each window the metadata is
    restricted to documents dated on/after the window's cutoff and located in
    any of the municipalities requested for that window.

    Args:
        queries: iterable of dicts with 'Time Window' and 'Municipalities' keys.
        metadata: DataFrame with "date", "city" and "local_path_txt" columns.

    Returns:
        set of "local_path_txt" values.

    Raises:
        KeyError: if a query names a time window missing from `time_windows`.
    """
    # filter metadata to only those files that match the query municipality and time_window
    municipalities_by_time_window = {}
    for query in queries:
        cities_for_window = municipalities_by_time_window.setdefault(query['Time Window'], set())
        cities_for_window.update(query['Municipalities'])

    relevant_filenames = set()
    for time_window, cities in municipalities_by_time_window.items():
        if time_window not in time_windows:
            raise KeyError("unknown time window {!r}; expected one of {}".format(
                time_window, sorted(time_windows)))
        starting_date = time_windows[time_window]
        potential_documents = metadata
        if starting_date is not None:
            potential_documents = potential_documents[potential_documents["date"] >= starting_date]
        # Use a boolean Series mask: an empty Python list used as a mask is
        # interpreted as "select no columns", which made the column lookup
        # below raise KeyError whenever the date filter left zero rows.
        potential_documents = potential_documents[potential_documents["city"].isin(cities)]
        relevant_filenames.update(potential_documents['local_path_txt'])
    return relevant_filenames

In [70]:
# def write_quick_queries():
#     example_queries = [
#       {
#         "id": "emily",
#         "Keywords": [
#           "affordable",
#           "housing",
#           "ADU",
#           "vote",
#           "residential",
#           "homeless"
#         ],
#         "Municipalities": [
#           "San Jose",
#           "Cupertino",
#           "Sunnyvale",
#           "Palo Alto",
#           "Mountain View"
#         ],
#         "Time Window": "this_week"
#       },
#       {
#         "id": "some_other_journalist",
#         "Keywords": [
#           "affordable",
#           "housing",
#           "ADU",
#           "vote",
#           "residential",
#           "homeless"
#         ],
#         "Time Window": "past_six_months",
#         "Municipalities": [
#           "Sunnyvale",
#           "San Jose"
#         ]
#       }
#     ]
#     dynamodb = boto3.resource('dynamodb')
#     try:
#         print('reading')
#         table = dynamodb.Table('quick_queries')
#         table.scan()
#     except:
#         print('making')
#         table_args = {
#             'TableName': 'quick_queries',
#             'KeySchema': [
#                 {
#                     'AttributeName': 'id',
#                     'KeyType': 'HASH'
#                 }
#             ],
#             'AttributeDefinitions': [
#                 {
#                     'AttributeName': 'id',
#                     'AttributeType': 'S'
#                 }        
#             ],
#             'ProvisionedThroughput': {
#                 'ReadCapacityUnits': 5,
#                 'WriteCapacityUnits': 5
#             }
#         }

#         table = dynamodb.create_table(**table_args)
#     print('writing')
#     with table.batch_writer() as batch:
#         for item in example_queries:
#             item_dump = json.dumps(item)
#             item = json.loads(item_dump, parse_float=Decimal)
#             batch.put_item(Item=item)
# write_quick_queries()

In [6]:
# example_results = [
#   {
#     "user_id": "emily",
#    'special',
#    'meeting',
#    'subject',
#    'approve',
#    'august',
#     "document_sections": [
#       {
#         "section_id": "000",
#         "doc_url": "https://sanjose.legistar.com/View.ashx?M=A&ID=709096&GUID=CF2165D1-28CB-4670-AB54-F35B649DE71D",
#         "doc_name": "Agenda 2019-09-24",
#         "user_id": "emily",
#         "page_number": "11",
#         "keywords": ["High-Rise", "Incentive", "Affordable", "Housing", "exemption", "tax", "reduction"],
#         "text": "4.3 19-821 Downtown High-Rise Incentive Program.\n\nRecommendation: Accept the report on the Downtown High-Rise Feasibility Assessment and direct staff to return to Council with the appropriate ordinance and resolution to enact the following:\n\n(a) Extending the certificate of occupancy deadline for the Affordable Housing Impact Fee exemption to December 31, 2023.\n(b) Amending Title 4.46 and align the construction tax reduction with the certificate of occupancy deadline for the Affordable Housing Impact Fee exemption, and removing the planning and build permit requirements.\nCEQA: Not a Project, File No. PP17-009, Staff Reports, Assessments, Annual Reports, and Informational Memos that involve no approvals of any City action. (Economic Development)"
#       },
#       {
#         "section_id": "001",
#         "doc_url": "https://agendaonline.net/public/Meeting.aspx?AgencyID=123&MeetingID=20136&AgencyTypeID=1&IsArchived=True",
#         "doc_name": "School Board Agenda, 2019-9-23",
#         "user_id": "emily",
#         "page_number": "NA",
#         "keywords": ["employee", "housing", "affordable"],
#         "text": "A.3. Master Plan for San Jose Unified Properties - Step 3 (ACTION)\n\nRECOMMENDATION: That San Jose Unified secure pre-development services to complete a pre-development analysis on the potential for employee housing opportunities at the following four locations: (1) 855 Lenzen Avenue, San Jose Unified District Offices, (2) 1088 Broadway, River Glen K-8 School, (3) 1325 Bouret Drive, Second Start-Pine Hill Non-Public School, and (4) 760 Hillsdale Avenue and 705-745 Capital Expressway, Metropolitan Education District."
#       },
#       {
#         "section_id": "002",
#         "doc_url": "https://agendaonline.net/public/Meeting.aspx?AgencyID=123&MeetingID=20136&AgencyTypeID=1&IsArchived=True",
#         "doc_name": "School Board Agenda, 2019-9-23",
#         "user_id": "emily",
#         "page_number": "NA",
#         "keywords": ["employee", "housing", "affordable"],
#         "text": "A.3. Master Plan for San Jose Unified Properties - Step 3 (ACTION)\n\nRECOMMENDATION: That San Jose Unified secure pre-development services to complete a pre-development analysis on the potential for employee housing opportunities at the following four locations: (1) 855 Lenzen Avenue, San Jose Unified District Offices, (2) 1088 Broadway, River Glen K-8 School, (3) 1325 Bouret Drive, Second Start-Pine Hill Non-Public School, and (4) 760 Hillsdale Avenue and 705-745 Capital Expressway, Metropolitan Education District."
#       }
#     ]
#   }
# ]
# example_top_k_sections = []
# example_query = {
#     "original_text": "I am a document document",
#     "tokens": ["I", "am"],
#     "filename": "filename1",
#     "section_id": "section1",
# }

In [62]:
def send_emails(results):
    """Placeholder: email delivery is not implemented yet.

    Accepts the accumulated `results` and does nothing with them.
    """
    return None

In [54]:
def update_history(history, top_k_sections, query):
    """Placeholder: recording of sent sections is not implemented yet.

    run_queries assigns the return value back to `history`, so the stub hands
    the history back unchanged instead of returning None (which would discard
    whatever had been loaded).

    Returns:
        The `history` object that was passed in, unmodified.
    """
    return history

In [55]:
def write_history(history):
    """Placeholder: persisting `history` is not implemented yet; does nothing."""
    return None

In [8]:
def select_relevant_docs(municipalities, time_window, all_docs, metadata):
    """Placeholder: per-query document selection is not implemented yet.

    Ignores all arguments and returns None.
    """
    return None

In [56]:
def segment_docs(relevant_docs, keywords):
    """Placeholder segmenter: returns three hard-coded example sections.

    Both arguments are currently ignored. Every section dict carries the same
    keys: "original_text", "tokens", "filename" and "section_id" (the third
    example previously used "section" by mistake).
    """
    return [
        {
            "original_text": "I am a document document",
            "tokens": ["I", "am"],
            "filename": "filename1",
            "section_id": "section1",
        },
        {
            "original_text": "I am a document document",
            "tokens": ["a", "document"],
            "filename": "filename1",
            "section_id": "section2",
        },
        {
            "original_text": "I am another doc",
            "tokens": ["I", "am", "another", "doc"],
            "filename": "filename2",
            "section_id": "section3",
        }
    ]

In [57]:
def score_doc_section(doc_section, keywords, idf):
    """Placeholder scorer: not implemented yet, always returns None.

    The commented-out draft below is kept for reference only; it is never
    executed.
    """
#     # vectorize etc.
#     keyword_vectors = np.array([vectors[t] for t in keywords if t in inverse_doc_props])
#     keyword_weights = np.array([inverse_doc_props[t] for t in keywords if t in inverse_doc_props])
#     document_section_scores = []
#     for s, section in enumerate(document_sections):
#         score = None
#         section_tokens = section[0]
#         # TODO: Zipf to figure out what the cutoff should be for normal communication
#         if len(set(section_tokens))<20:
#             score = 0
#         else:
#             section_vectors = np.array([vectors[t] for t in section_tokens if t in inverse_doc_props])
#             if section_vectors.shape[0]>0:
#     #             section_weights = np.array([inverse_doc_props[t] for t in section_tokens if t in inverse_doc_props])
#                 similarities = cosine_similarity(section_vectors, keyword_vectors)
#     #             similarities = similarities * section_weights
#     #             similarities = similarities*(similarities>0.2)
#                 keyword_similarities = np.mean(similarities, axis=0)
#     #             keyword_similarities = np.average(similarities, axis=0, weights=section_weights)
#                 score = np.sum(keyword_similarities*keyword_weights)
#         document_section_scores.append(score)
    return None

In [58]:
def select_top_k(doc_sections, doc_scores, k, history):
    """Placeholder: top-k selection is not implemented yet; returns None."""
    return None

In [59]:
def update_results(results, top_k_sections, query):
    """Placeholder: merging a query's sections into `results` is not implemented yet.

    run_queries assigns the return value back to `results`, so the stub hands
    the accumulator back unchanged instead of returning None (which would
    replace the results list with None after the first query).

    Returns:
        The `results` object that was passed in, unmodified.
    """
    return results

In [68]:
# vectors = setup_word_vectors()
def run_queries(use_cached_idf = False, query_source="actual", k=3):
    """Run every saved query end to end and send out the results.

    Steps: read queries and document metadata, load documents (plus idf), then
    for each query select/segment/score documents, pick the top `k` sections,
    and finally send emails and write the history.

    Args:
        use_cached_idf: if True, load idf from the cache and read only the
            documents relevant to the queries; otherwise read every document,
            recompute idf and cache it.
        query_source: passed to read_queries ("actual" or "quick").
        k: number of sections to select per query.
    """
    print("reading queries")
    queries = read_queries(query_source)
    print("reading metadata")
    metadata = read_metadata()
    print("setting up reader")
    doc_text_reader = DocTextReader(log_every=100)
    if use_cached_idf:
        # used cached idf and only read relevant documents
        print("loading cached idf")
        idf = read_cached_idf()
        print("finding relevant filenames")
        relevant_filenames = find_relevant_filenames(queries, metadata)
        # (not actually *all*, but all the ones we care about for queries)
        print("reading relevant documents")
        all_docs = doc_text_reader.read_docs(relevant_filenames)
    else:
        # read all documents and calculate inverse document frequency
        all_filenames = metadata["local_path_txt"]
        print("reading all documents")
        all_docs = doc_text_reader.read_docs(all_filenames)
        print("calculating idf")
        idf = calculate_idf(all_docs)
        cache_idf(idf)
    print("reading history")
    history = read_history()

    results = []

    # 1-based counter so the progress line reads "1 of N" ... "N of N".
    for q, query in enumerate(queries, start=1):
        print("running query {} of {}".format(q, len(queries)))
        user_id = query["id"]
        keywords = query["Keywords"]
        time_window = query["Time Window"]
        municipalities = query["Municipalities"]
        relevant_docs = select_relevant_docs(municipalities, time_window, all_docs, metadata)
        doc_sections = segment_docs(relevant_docs, keywords)
        print("scoring documents")
        doc_sections_scores = [score_doc_section(doc_section, keywords, idf) for doc_section in doc_sections]
        top_k_sections = select_top_k(doc_sections, doc_sections_scores, k, history)
        results = update_results(results, top_k_sections, query)
        history = update_history(history, top_k_sections, query)

    print("sending emails")
    send_emails(results)
    write_history(history)
    print("finished")


# Run the pipeline against the 'quick' query source, loading idf from the
# cache instead of recomputing it from every document.
run_queries(use_cached_idf=True, query_source="quick")

reading queries
reading metadata
setting up reader
loading cached idf
finding relevant filenames
reading relevant documents
0 of 34 documents read
reading history
running query 0 of 2
scoring documents
running query 1 of 2
scoring documents
sending emails
finished
