In [3]:
import webhoseio, os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk, re
from nltk.stem.wordnet import WordNetLemmatizer

stopwords = set(nltk.corpus.stopwords.words('english'))

webhoseio.config(token=os.environ['WEBHOSE_TOKEN'])
query_params = {
    "q": "organization:Tesla",
    "ts": "1523748602856",
    "sort": "crawled"
}

KeyError: 'WEBHOSE_TOKEN'

In [2]:
# be careful how many times you make this call
# get the first batch
output = webhoseio.query("filterWebContent", query_params)

In [3]:
for feed in output['posts']:
    print(str(feed['title']) + '|||' + str(feed['published']))

Google Alert - vacation|||2019-09-23T02:11:00.000+03:00
Tesla Vows Sustainable Profitability As Quarterly Loss Blows Past Expectations|||2019-09-23T10:25:00.000+03:00
Scant Liquidity, Continued Spending And Inflated Deposits Highlight Tesla's 10-Q Disclosures|||2019-09-23T10:25:00.000+03:00
Tesla CEO Elon Musk pours cold water on fans waiting for Model S, Model X refresh (ejcy)|||2019-09-23T03:00:00.000+03:00
Elon Musk’s potential $56-billion payday must be defended by Tesla board in court|||2019-09-23T13:05:00.000+03:00
Elon Musk’s potential $56-billion payday must be defended by Tesla board in court|||2019-09-23T13:05:00.000+03:00
Convertible BMW E30 Gets New Life With Volt Battery, Tesla Motor Swap|||2019-09-23T15:09:00.000+03:00
Is the Tesla 3 a realistic option for a regular commute from Cardiff to Heathrow?|||2019-09-23T13:07:00.000+03:00
Tesla Driver Arrested After Reportedly Driving to Oakland the Wrong Way on the Bay Bridge|||2019-09-23T03:00:00.000+03:00
Reliability of high-r

In [4]:
feed_titles = []

for feed in output['posts']:
    feed_titles.append(str(feed['title']))

print("Total number of titles: " + str(len(feed_titles)))

Total number of titles: 100


In [6]:
def tokenize_titles(title):
    tokens = nltk.word_tokenize(title)
    lmtzr = WordNetLemmatizer()
    filtered_tokens = []
    
    for token in tokens:
        token = token.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        token = re.sub(r'[^a-zA-Z0-9 ]', '', token)
        if token not in stopwords:
            filtered_tokens.append(token.lower())
    
    lemmas = [lmtzr.lemmatize(t,'v') for t in filtered_tokens]

    return lemmas

In [10]:
def clstr_lda(num_topics, stories):
    # top words to be identified
    n_top_words = 10

    tf_vectorizer = CountVectorizer(max_df=100, min_df=2, max_features=500,
                                    tokenizer=tokenize_titles, ngram_range=(3,4))

    tf = tf_vectorizer.fit_transform(stories)

    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=200,
                                    learning_method='online', learning_offset=10.,
                                    random_state = 1)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # print top topic words
    topics = dict()
    for topic_idx, topic in enumerate(lda.components_):
        topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d:" % topic_idx)
        print(" | ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
    return topics

In [11]:
topics = clstr_lda(10, feed_titles)

Topic #0:
netflix  youtube  |  youtube  | netflix  youtube |  tsla  | quarter  tsla  | leak email tesla | delivery record quarter | musk say leak | break delivery record | elon musk say
Topic #1:
vehicles spontaneously combust |  fully engulf flame | flame   truck |   truck |  truck haul tesla | tesla vehicles spontaneously |  fully engulf | truck haul tesla | truck haul tesla vehicles | engulf flame 
Topic #2:
lead light magnetic levitation | tire center lamp | accessories model 3 | levitation waterproof modification  | light modify accessories model | white light  442 | modification  tire | lamp light modify accessories | white light  | waterproof modification  tire
Topic #3:
leak deliveries email | tesla share jump leak | jump leak deliveries | deliveries email  entertainment | entertainment software update | share jump leak | email  entertainment | leak deliveries email  | tesla share jump |  entertainment software
Topic #4:
100000 cars quarter | tesla jump report musk | jump repor

In [None]:
#output = webhoseio.get_next()