In [1]:
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Fetch the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API (slow step: expect this to take a few minutes).
model = api.load(name='word2vec-google-news-300')

In [58]:
# Topic -> seed words. Each agenda entry is later scored against every
# topic's word list, so the topic names double as result column names.
keywords = dict(
    development=['residential', 'housing', 'incentive', 'subdivision', 'improvement'],
    zoning=['zoning', 'rezoning', 'parcel'],
    staff=[
        'vacancy', 'vacancies',
        'appointment', 'candidate', 'commissioner',
        'resign', 'resignation',
    ],
)

In [54]:
# Read the agenda table from disk into a DataFrame.
csv_path = '../data/misc/gridley_agendas.csv'
agendas = pd.read_csv(filepath_or_buffer=csv_path)

In [55]:
# Score one agenda column against every keyword topic.
# Produces two parallel lists consumed by the next cell:
#   similarities -- one dict per scored row: {topic: similarity, 'raw_text': str}
#   idx          -- the `agendas` index label of each scored row
var_name = 'CONSENT AGENDA'

from autolocal.nlp import Tokenizer
# NOTE(review): the star import is kept so that every name it exposes stays
# available to other cells; this cell only needs preprocess_string and the
# filter functions listed in `preprocess_filters` below.
from gensim.parsing.preprocessing import *

# NOTE(review): `tokenizer` is not used in this cell; kept in case another
# cell relies on it.
tokenizer = Tokenizer()

# Number of leading in-vocabulary tokens dropped from every entry.
# NOTE(review): presumably this skips a boilerplate header shared by all
# agendas -- confirm the value against the raw data.
N_LEADING_TOKENS_SKIPPED = 27

# Filters are applied in order by preprocess_string. The list is built once,
# outside the loop, and the duplicated strip_numeric pass was dropped: no
# later filter can reintroduce digits, so the second pass was a no-op.
preprocess_filters = [
    lambda x: x.lower(),
    strip_punctuation,
    strip_numeric,
    strip_non_alphanum,
    strip_multiple_whitespaces,
    remove_stopwords,
    strip_short]

similarities = []
idx = []
for i, row in agendas.iterrows():
    s = row[var_name]
    # Skip empty strings and missing cells (NaN / None).
    if not s or pd.isna(s):
        continue

    tokens = preprocess_string(s, preprocess_filters)
    # `t in model` is the membership test supported by both gensim 3.x and
    # 4.x; the `.vocab` attribute used previously was removed in gensim 4.
    tokens = [t for t in tokens if t in model]
    tokens = tokens[N_LEADING_TOKENS_SKIPPED:]

    # n_similarity rejects an empty token list, which used to abort the
    # whole loop; a row with nothing left to score is skipped instead.
    if not tokens:
        continue

    sim = {k: model.n_similarity(tokens, v) for k, v in keywords.items()}
    sim['raw_text'] = ' '.join(tokens)
    similarities.append(sim)
    idx.append(i)

In [56]:
# Assemble the per-row scores into a frame: one column per topic (in the
# order the topics appear in `keywords`) followed by the scored text.
sims_df = pd.DataFrame(
    data=similarities,
    index=idx,
    columns=[*keywords, 'raw_text'],
)
print(sims_df.head())

   development    zoning     staff  \
0     0.431602  0.483039  0.367580   
1     0.287971  0.298341  0.272866   
2     0.429795  0.459589  0.447017   
3     0.448220  0.506095  0.350080   
4     0.369767  0.430613  0.473593   

                                            raw_text  
0  city council minutes dated september approval ...  
1               city council minutes dated september  
2  city council minutes dated august consideratio...  
3  city council minutes dated august approval res...  
4  city council minutes dated july appointment ne...  


In [60]:
# For every topic, print its three highest-scoring agenda entries:
# the similarity value, then the text it was computed from, then a blank line.
for c in keywords:
    print(c.upper())
    ranked_scores = sims_df[c].sort_values(ascending=False)
    idx_top_matches = ranked_scores.index[:3]
    for i in idx_top_matches:
        print(sims_df.loc[i, c])
        print(sims_df.loc[i, 'raw_text'])
        print()

DEVELOPMENT
0.5937261581420898
city council minutes dated march revision time homebuyer program guidelines single family housing rehabilitation program guidelines project list california transportation commission

0.5738751888275146
city council minutes dated december intergovernmental agency agreement city regional housing authority housing consultant services

0.5506479740142822
city services update city council minutes dated april adopt resolution resolution authorizing city administrator execute deferred improvement agreement defer construction sidewalk improvements located kentucky street approval new agreement north valley shooters new rates range users boat ramp users project list california transportation commission

ZONING
0.54632568359375
city services update city council minutes dated july resolutions authorizing levy assessment district expenses butte county tax roll resolution resolution city council city levy assessment district butte county tax roll resolution resolution

In [40]:
# Index labels of the three rows with the highest 'development' score.
sims_df['development'].sort_values(ascending=False).index[:3]

Int64Index([11, 18, 32], dtype='int64')