In [11]:
# Import statements

import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import json

from datetime import datetime
import os
from autolocal.nlp import Tokenizer
from gensim.parsing.preprocessing import *

In [4]:
# set up word vectors
# (this takes a loooong time)

print("loading language model")
# load language model (this takes a few minutes)
model = api.load('word2vec-google-news-300')
print("model loaded")

vectors = model.wv
del model
vectors.init_sims(True) # normalize the vectors (!), so we can use the dot product as similarity measure

print('embeddings loaded ')
print('loading docs ... ')

loading language model
model loaded



Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).



embeddings loaded 
loading docs ... 


In [7]:
def get_txt_name(f):
    path_parts = os.path.split(f)
    fname = path_parts[1]
    local_dir = os.path.basename(path_parts[0])
    return os.path.join(local_dir, fname)[:-4]+".txt"

In [8]:
def read_metadata():
    # return data frame with metadata
    metadata = pd.read_csv("../data/index_sfbay_small/metadata.csv")
    metadata["txt_file"] = [get_txt_name(f) for f in metadata["local_path_pdf"]]
    metadata = metadata[["txt_file", "city", "committee", "date"]]
    metadata["date"] = [datetime.strptime(d, '%Y-%m-%d') for d in metadata["date"]]
    return metadata

Unnamed: 0,txt_file,city,committee,date
0,cupertino/Cupertino_2019-10-29_Legislative-Rev...,Cupertino,Legislative Review Committee,2019-10-29
1,cupertino/Cupertino_2019-10-28_Planning-Commis...,Cupertino,Planning Commission,2019-10-28
2,cupertino/Cupertino_2019-10-24_Administrative-...,Cupertino,Administrative Hearing,2019-10-24
3,cupertino/Cupertino_2019-10-24_Sustainability-...,Cupertino,Sustainability Commission,2019-10-24
4,cupertino/Cupertino_2019-10-23_Teen-Commission...,Cupertino,Teen Commission,2019-10-23
...,...,...,...,...
17204,burlingame/Burlingame_2019-09-05_Beautificatio...,Burlingame,Beautification Commission,2019-09-05
17205,burlingame/Burlingame_2019-09-03_City-Council_...,Burlingame,City Council,2019-09-03
17206,burlingame/Burlingame_2019-08-26_Planning-Comm...,Burlingame,Planning Commission,2019-08-26
17207,burlingame/Burlingame_2019-08-20_Library-Board...,Burlingame,Library Board of Trustees,2019-08-20


In [12]:
def read_queries():
    # read from input file
    # parse as json
    # return list of queries
    with open('../data/user_data/sample_input_data.json') as json_file:
        queries = json.load(json_file)
    return queries

read_queries()

[{'user_id': 'emily',
  'query_id': 1,
  'keywords': ['affordable',
   'housing',
   'ADU',
   'vote',
   'residential',
   'homeless'],
  'municipalities': ['San Jose',
   'Cupertino',
   'Sunnyvale',
   'Palo Alto',
   'Mountain View'],
  'time_window': 'upcoming_only'},
 {'user_id': 'some_other_journalist',
  'query_id': 2,
  'keywords': ['affordable',
   'housing',
   'ADU',
   'vote',
   'residential',
   'homeless'],
  'time_window': 'upcoming_only',
  'municipalities': ['Biggs', 'Gridley']}]

In [14]:
def read_history(): 
    with open('../data/user_data/sample_history.json') as json_file:
        history = json.load(json_file)
    return history
read_history()

{'user_id': [{'filename': 'filename1', 'segment': 'segment1'}]}

In [16]:
tokenizer = Tokenizer()

# TODO: is lowercasing necessary?
preprocess_filters = [
    lambda x: x.lower(),
    strip_punctuation,
    strip_numeric,
    strip_non_alphanum,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short
]

In [18]:
def read_docs(metadata):
    # read all documents that we know about
    # tokenize each document
    # return list of documents
    

    documents = []

    document_files = metadata["txt_file"]
    directory = '../data/docs'
    for filename in document_files:
        try:
            f = open(os.path.join(directory, filename))
            
            doc_string = f.read()
            doc_tokens = preprocess_string(doc_string, filters=preprocess_filters)
            
            documents.append({"original_text": doc_string,
                              "tokens": doc_tokens,
                              "filename": filename})
        except FileNotFoundError:
            pass
    
    return documents

In [4]:
def read_cached_idf(): 
    # read cached idf file and conver to json
    
read_cached_idf()

In [6]:
example_results = [
  {
    "user_id": "emily",
   'special',
   'meeting',
   'subject',
   'approve',
   'august',
    "document_sections": [
      {
        "section_id": "000",
        "doc_url": "https://sanjose.legistar.com/View.ashx?M=A&ID=709096&GUID=CF2165D1-28CB-4670-AB54-F35B649DE71D",
        "doc_name": "Agenda 2019-09-24",
        "user_id": "emily",
        "page_number": "11",
        "keywords": ["High-Rise", "Incentive", "Affordable", "Housing", "exemption", "tax", "reduction"],
        "text": "4.3 19-821 Downtown High-Rise Incentive Program.\n\nRecommendation: Accept the report on the Downtown High-Rise Feasibility Assessment and direct staff to return to Council with the appropriate ordinance and resolution to enact the following:\n\n(a) Extending the certificate of occupancy deadline for the Affordable Housing Impact Fee exemption to December 31, 2023.\n(b) Amending Title 4.46 and align the construction tax reduction with the certificate of occupancy deadline for the Affordable Housing Impact Fee exemption, and removing the planning and build permit requirements.\nCEQA: Not a Project, File No. PP17-009, Staff Reports, Assessments, Annual Reports, and Informational Memos that involve no approvals of any City action. (Economic Development)"
      },
      {
        "section_id": "001",
        "doc_url": "https://agendaonline.net/public/Meeting.aspx?AgencyID=123&MeetingID=20136&AgencyTypeID=1&IsArchived=True",
        "doc_name": "School Board Agenda, 2019-9-23",
        "user_id": "emily",
        "page_number": "NA",
        "keywords": ["employee", "housing", "affordable"],
        "text": "A.3. Master Plan for San Jose Unified Properties - Step 3 (ACTION)\n\nRECOMMENDATION: That San Jose Unified secure pre-development services to complete a pre-development analysis on the potential for employee housing opportunities at the following four locations: (1) 855 Lenzen Avenue, San Jose Unified District Offices, (2) 1088 Broadway, River Glen K-8 School, (3) 1325 Bouret Drive, Second Start-Pine Hill Non-Public School, and (4) 760 Hillsdale Avenue and 705-745 Capital Expressway, Metropolitan Education District."
      },
      {
        "section_id": "002",
        "doc_url": "https://agendaonline.net/public/Meeting.aspx?AgencyID=123&MeetingID=20136&AgencyTypeID=1&IsArchived=True",
        "doc_name": "School Board Agenda, 2019-9-23",
        "user_id": "emily",
        "page_number": "NA",
        "keywords": ["employee", "housing", "affordable"],
        "text": "A.3. Master Plan for San Jose Unified Properties - Step 3 (ACTION)\n\nRECOMMENDATION: That San Jose Unified secure pre-development services to complete a pre-development analysis on the potential for employee housing opportunities at the following four locations: (1) 855 Lenzen Avenue, San Jose Unified District Offices, (2) 1088 Broadway, River Glen K-8 School, (3) 1325 Bouret Drive, Second Start-Pine Hill Non-Public School, and (4) 760 Hillsdale Avenue and 705-745 Capital Expressway, Metropolitan Education District."
      }
    ]
  }
]
example_top_k_sections = []
example_query = {
    "original_text": "I am a document document",
    "tokens": ["I", "am"],
    "filename": "filename1",
    "section_id": "section1",
}

In [7]:
def write_output(results): 
    pass
write_output(example_results)

In [6]:
def update_history(history, top_k_sections, query): 
    pass
update_history(read_history(), example_top_k_sections, example_query)

In [19]:
def write_history(history): 
    pass
write_history(read_history())

In [8]:
def calculate_idf(all_docs): 
    # for each word, how many unique docs does it show up in?
    from collections import Counter
    word_counts = 
    
    idf = {}
    for document in all_docs: 
        tokens = all_docs[document]["tokens"]
        for 
        "I": 1./2,
        "am": 1./2,
        "a": 1./1,
        "another": 1./1,
        "document": 1./1,
        "doc": 1./1
    }

In [9]:
def cache_idf(idf):
    # write the idf dict to a file
    pass

In [10]:
def segment_docs(relevant_docs, keywords): 
    return [
        {
            "original_text": "I am a document document",
            "tokens": ["I", "am"],
            "filename": "filename1",
            "section_id": "section1",
        },
        {
            "original_text": "I am a document document",
            "tokens": ["a", "document"],
            "filename": "filename1",
            "section_id": "section2",
        },
        {
            "original_text": "I am another doc",
            "tokens": ["I", "am", "another", "doc"],
            "filename": "filename2",
            "section": "section3"
        }
    ]

In [11]:
def select_relevant_docs(municipalities, time_window, all_docs, metadata): 
    # filter metadata to only those files that match the query municipality and time_window
    # filter all docs to only filenames in subset of metadata
    return [
        {
            "original_text": "I am a document document",
            "tokens": ["I", "am", "a", "document"],
            "filename": "filename1"
        },
        {
            "original_text": "I am another doc",
            "tokens": ["I", "am", "another", "doc"],
            "filename": "filename2"
        }
    ]

In [12]:
def score_doc_section(doc_section, keywords, idf):
    # vectorize etc.
    keyword_vectors = np.array([vectors[t] for t in keywords if t in inverse_doc_props])
    keyword_weights = np.array([inverse_doc_props[t] for t in keywords if t in inverse_doc_props])
    document_section_scores = []
    for s, section in enumerate(document_sections):
        score = None
        section_tokens = section[0]
        # TODO: Zipf to figure out what the cutoff should be for normal communication
        if len(set(section_tokens))<20:
            score = 0
        else:
            section_vectors = np.array([vectors[t] for t in section_tokens if t in inverse_doc_props])
            if section_vectors.shape[0]>0:
    #             section_weights = np.array([inverse_doc_props[t] for t in section_tokens if t in inverse_doc_props])
                similarities = cosine_similarity(section_vectors, keyword_vectors)
    #             similarities = similarities * section_weights
    #             similarities = similarities*(similarities>0.2)
                keyword_similarities = np.mean(similarities, axis=0)
    #             keyword_similarities = np.average(similarities, axis=0, weights=section_weights)
                score = np.sum(keyword_similarities*keyword_weights)
        document_section_scores.append(score)

In [13]:
def select_top_k(doc_sections, doc_scores, k, history): 
    pass

In [14]:
def update_results(results, top_k_sections, query): 
    pass

In [15]:
def run_queries(use_cached_idf = False): 
    queries = read_queries()
    metadata = read_metadata()
    all_docs = read_docs(metadata)
    if use_cached_idf:
        idf = read_cached_idf()
    else:
        idf = calculate_idf(all_docs)
        cache_idf(idf)
    history = read_history()
    
    results = []
    
    for query in queries: 
        user_id = query["user_id"]
        query_id = query["query_id"]
        keywords = query["keywords"]
        time_window = query["time_window"]
        municipalities = query["municipalities"]
        relevant_docs = select_relevant_docs(municipalities, time_window, all_docs, metadata)
        doc_sections = segment_docs(relevant_docs, keywords)
        doc_sections_scores = score_doc_section(doc_section, keywords, idf)
        top_k_sections = select_top_k(doc_sections, doc_scores, k, history)
        results = update_results(results, top_k_sections, query)
        history = update_history(history, top_k_sections, query)
        
    write_output(results)
    write_history(history)
            

    