In [61]:
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from itertools import combinations
import pickle
import traceback
import os, sys

In [62]:
tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [63]:
def load_trained_model(filepath=None):
    model_unpickle = open(filepath, 'rb')
    model = pickle.load(model_unpickle)
    print(f"Successfully loaded model from file: {filepath}")
    return model

In [64]:
def transcript_text_preprocessing(event_text=None):
    tokens = tokenizer.tokenize(event_text.lower())
    tokens = [token for token in tokens if (token not in en_stop and not token.isdigit() and len(token) > 1)]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

In [65]:
def find_event_topics(event_text=None, model_filepath=None):
    try:
        processed_text = transcript_text_preprocessing(event_text)

        topic_model = load_trained_model(model_filepath)
        vectorizer_doc = TfidfVectorizer()
        vectorizer_doc.fit(processed_text)
        tfidf_doc = vectorizer_doc.transform(processed_text)
        tfidf_feature_names_doc = vectorizer_doc.get_feature_names()

        # docs x topics (basis vectors); not a probability distribution
        W = topic_model.transform(tfidf_doc) 

        # topics x term weights (coefficients)
        H = topic_model.components_

        # Display top words per topic
        idx_to_word = np.array(vectorizer_doc.get_feature_names())
        print('Topics found via TFIDF + NMF:')
        for i, topic in enumerate(H):
            print("Topic {}: {}".format(i, (", ".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]]))))

        all_topics = []
        for i, topic in enumerate(H):
            topic_phrases = []
            topic_phrases.append([str(x) for x in idx_to_word[topic.argsort()[-10:]]])
            all_topics.append(topic_phrases)

        return topic_phrases
    
    except Exception as e:
        print('-'*60)
        traceback.print_exc(file=sys.stdout)
        print('-'*60)
        return None

In [66]:
body_str = """
everybody thank you being here this is the finance and neighborhood committee for April 10th 2019 and it is somewhat after 2 p.m. I want to welcome you we only have one item on today's agenda and that's Council will 1 1 9490 regarding a supplemental appropriation for Waterfront local improvement district so before we get into that I'm going to ask for public comment I think we've got one person who is now becoming one of our frequent visitors David Haynes thank you and you have 2 minutes the director of The Waterfront Improvement and the council could pay attention to the details they're supposed to alleviate the oppressive accommodation to the modern wheel and I'm specifically talking about the metal clanging and banging man cover covers that are like latches latch doors they have them specifically right in the middle of a Highway Lane of traffic that's in the Waterfront street now and whatever set of tires are if it's at concrete truck there's three separate like oppressive PTSD and Noise banging and clanging and they have to do something about improving the Waterfront District bike not allowing any more of these latched manhole covers in the middle of the road what's the address more or less what's the history Crossing time you develop it don't allow any more of these metal clanging there's other places in this city that have these problems and people walk on the side of the road just too much of them to deal with You're Expecting of Waterfront is peace and quiet and like twenty-first-century let's keep like that on all the details you don't have any oppressive reminder for the 20th failure to like alleviating thank you thank you for bringing up the point appreciate that okay anybody else want to speak okay seeing them and let's move on to our first item Allison if you'd be so kind as to read that and thank you Brian and absolute at the team at Marshall got Dory item number one Council Bell 11949 0 and Ordnance relating to the central Waterfront Improvement program authorizing the director of The Office of the Waterfront and said that projects to execute an agreement with the Washington State Ferries for the construction zone abilities just port electrification and fairy festivals at Coleman ferry terminal thank you Marshal do you want to start with introductions Caleb Wagner City budget office staff councilmember legislation revises the 2019 adopted budget and the 2019 to 2020 for CIP for the Waterfront program the revisions include changes to the cash go for the Waterfront program that are related to the LI deformation ordinance that Council passed in January of 2019 it also make some other changes to the budget that came up after the budget was submitted to counsel last year updates you allow us to construct electrical improvements to support hybrid fairies it also add some external funding the next slide is actually one that we showed to counsel during the LI deformation this is the the funding changes that were made as part of that formation to cover the revised Lida amount we made a $5000000 cost-reduction in removing three Promenade kiosks add additional philanthropy we added some Waterfront MPD funds converting operating to capital for the Waterfront Park and we added some additional CPT bonds and some read and this budget actually revision changes the CIP to reflect those those decisions that were made thank you for being thank you for adding the slide and I just want to remind folks that the 8 million dollars that comes from our Metropolitan Park District was its not new money taken from any other Park had been a line item for operating cost for the park in the Waterfront but since things were delayed for a number of reasons including birth and a few others is that that reading money was recaptured in use for Capital so again no other Park in the city is being as losing any funds for them yes that's correct thank you the next one so the additional budget changes Beyond clid budget changes is our we are adding $2000000 to the Alaskan Way and pedestrian connections pie in this pie chart writing $2000000 from wash. Reimburse us for design cost slaps new money from wash. We're also adding 10 million dollars of from the Washington State Convention Center for Public benefits for their street vacation that Council approved last year they've already given us the first check this legislation allows us to authorize us to accept the funds into a be able to spend the money once again once you underscore that this funny is going to the pike-pine corridor east of Pike Place Market this is not Waterfront money but it does help us with the connection so is that it increases our total project cost to 724 million for the entire Waterfront program we also will so allow us to accept funds from Washington State ferry to build the electrical service to Colman dock Marshall mentioned that this this is a pretty remarkable design change we decided to reuse an existing sewer line to allow us to run the electrical cable without impacting the the ballast Island filled God has some tribal on Native American significance of that and also I just want to acknowledge to all the people who are living and working in turnout in and around the Waterfront that is not going to be dug up again thank you and thank you. activity and also for protecting the native Lambs I'll just mention wanted one additional thought on this and it is a very exciting first step that's taken place here with you had a long-standing partnership with the state and with Washington State Ferries where they are moving to essentially hybrid vehicles over time which will reduce their carbon footprint in the region the diesel fairies are significant source of carbon emissions and this was a late addition to the program that I was not originally scoped and our space for conduits for all former utilities on the waterfront is very limited and we were just we're very pleased to be able to make this happen and are able to to do so is Dory said by using a essentially what had been a leftover sewer main so not an actress or maybe the left repurposing that together provide the conduit without impacting any of the other resources in the area and really that was the last slide and you have any questions you have legislation be glad to answer them do you have anything you want to add Eric that I had a pre-briefing on this so I appreciate that you've already answered the questions that I had and I also want to acknowledge the tremendous amount of work that all of you have done and then over the last fourteen years have also done but thank you for that thank you for bringing us to this point cancel present here off do you have any other questions or thoughts I know that that this that you are well familiar with all of this to but if you have no further discussion I-90 is presented to nobody abstaining so this will go forward next Monday for further approval by the full Council very good thank you thank you I believe that this is the only item that we had today's agenda so I appreciate your coming thank you that in the meeting is adjourned
"""

In [67]:
find_event_topics(event_text=body_str, model_filepath='nmf_topic_model.pkl')

Successfully loaded model from file: nmf_topic_model.pkl
------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-65-e3b4692b939f>", line 12, in find_event_topics
    W = topic_model.transform(tfidf_doc)
  File "/Users/kendall/cdptools/env/lib/python3.7/site-packages/sklearn/decomposition/nmf.py", line 1321, in transform
    shuffle=self.shuffle)
  File "/Users/kendall/cdptools/env/lib/python3.7/site-packages/sklearn/decomposition/nmf.py", line 1036, in non_negative_factorization
    _check_init(H, (n_components, n_features), "NMF (input H)")
  File "/Users/kendall/cdptools/env/lib/python3.7/site-packages/sklearn/decomposition/nmf.py", line 60, in _check_init
    'but got %s ' % (whom, shape, np.shape(A)))
ValueError: Array with wrong shape passed to NMF (input H). Expected (15, 378), but got (15, 1000) 
------------------------------------------------------------
