### Dependancies

In [26]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
stemmer = SnowballStemmer("english")
from nltk.stem.porter import *
import numpy as np
np.random.seed(1110)
import nltk
nltk.download('wordnet')
from gensim import corpora, models

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hrith\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Dataset
**RedditNews.csv** contains Daily News headlines from the Stock Market.
This data set can be found [here](https://www.kaggle.com/aaron7sun/stocknews/data#)

In [16]:
data = pd.read_csv('Datasets/stocknews/RedditNews.csv', error_bad_lines=False);
data_text = data[['News']]
data_text['index'] = data_text.index
documents = data_text

### Data Pre-processing
We will perform the following steps:
- **Tokenization**: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- Words that have fewer than 3 characters are removed.
- All **stopwords** are removed.
- Words are **lemmatized** — words in third person are changed to first person and verbs in past and future tenses are changed into present.
- Words are **stemmed** — words are reduced to their root form.

In [17]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [18]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
["Delhi's", 'air', 'quality', 'improves', 'after', '9', 'days', 'of', 'odd-even', 'formula,', 'pollution', 'levels', 'drop', 'sharply']


 tokenized and lemmatized document: 
['delhi', 'qualiti', 'improv', 'day', 'formula', 'pollut', 'level', 'drop', 'sharpli']


In [20]:
processed_docs = documents['News'].map(preprocess)
processed_docs[:10]

0    [year, woman, mexico, citi, final, receiv, bir...
1            [chief, back, athen, perman, olymp, host]
2          [presid, franc, say, brexit, donald, trump]
3    [british, polic, hour, notic, threaten, hunger...
4    [nobel, laureat, urg, greenpeac, stop, oppos, ...
5    [brazil, huge, spike, number, polic, kill, ahe...
6    [austria, highest, court, annul, presidenti, e...
7    [facebook, win, privaci, case, track, belgian,...
8    [switzerland, deni, muslim, girl, citizenship,...
9    [china, kill, million, innoc, medit, organ, re...
Name: News, dtype: object

### Bag of Words on the Dataset
*gensim.corpora.Dictionary(processed_docs)* creates a dictionary that stores the frequency of each word

*dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=1000)* filters out words that have less than 15 occurences and keeps the top 1000 words

In [35]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=1000)

0 alvarez
1 bear
2 birth
3 certif
4 citi
5 die
6 final
7 hour
8 later
9 lira
10 mexico


In [36]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(115, 1), (332, 1), (517, 1), (688, 1)]

### Training LDA with Bag of Words

In [37]:
bow_doc_4310 = bow_corpus[4310]
# for i in range(len(bow_doc_4310)):
#     print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], dictionary[bow_doc_4310[i][0]], bow_doc_4310[i][1]))

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.072*"israel" + 0.050*"iran" + 0.035*"gaza" + 0.030*"protest" + 0.030*"isra" + 0.027*"nuclear" + 0.021*"palestinian" + 0.018*"power" + 0.015*"say" + 0.012*"minist"
Topic: 1 
Words: 0.027*"drug" + 0.019*"muslim" + 0.019*"mexico" + 0.014*"franc" + 0.013*"school" + 0.013*"court" + 0.013*"children" + 0.012*"say" + 0.011*"legal" + 0.011*"islam"
Topic: 2 
Words: 0.040*"year" + 0.031*"death" + 0.020*"prison" + 0.019*"kill" + 0.019*"iraq" + 0.017*"elect" + 0.016*"woman" + 0.015*"jail" + 0.015*"women" + 0.015*"murder"
Topic: 3 
Words: 0.036*"russian" + 0.035*"russia" + 0.025*"troop" + 0.019*"afghanistan" + 0.018*"syria" + 0.017*"presid" + 0.017*"militari" + 0.017*"georgia" + 0.016*"ship" + 0.014*"say"
Topic: 4 
Words: 0.020*"germani" + 0.015*"dead" + 0.015*"german" + 0.013*"abus" + 0.011*"rise" + 0.011*"earthquak" + 0.011*"turn" + 0.011*"look" + 0.010*"near" + 0.010*"photo"
Topic: 5 
Words: 0.052*"attack" + 0.047*"kill" + 0.031*"pakistan" + 0.031*"isra" + 0.026*"arrest" + 0.02

### Training of LDA with TF-IDF

In [38]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.010*"china" + 0.009*"news" + 0.009*"ahmadinejad" + 0.009*"dead" + 0.009*"ship" + 0.008*"protest" + 0.008*"think" + 0.008*"train" + 0.008*"internet" + 0.007*"egypt"
Topic: 1 Word: 0.013*"gaza" + 0.012*"china" + 0.010*"water" + 0.009*"world" + 0.009*"school" + 0.008*"billion" + 0.008*"israel" + 0.008*"die" + 0.008*"million" + 0.008*"popul"
Topic: 2 Word: 0.020*"israel" + 0.020*"palestinian" + 0.019*"olymp" + 0.019*"isra" + 0.010*"taliban" + 0.009*"arab" + 0.008*"jewish" + 0.007*"win" + 0.007*"game" + 0.007*"gaza"
Topic: 3 Word: 0.018*"georgia" + 0.016*"troop" + 0.015*"bush" + 0.015*"russia" + 0.013*"zimbabw" + 0.010*"iraq" + 0.009*"iraqi" + 0.009*"trade" + 0.008*"greec" + 0.008*"crisi"
Topic: 4 Word: 0.011*"saudi" + 0.011*"world" + 0.010*"abus" + 0.010*"women" + 0.010*"minist" + 0.009*"child" + 0.008*"face" + 0.008*"prime" + 0.008*"year" + 0.007*"church"
Topic: 5 Word: 0.015*"human" + 0.015*"right" + 0.010*"wikileak" + 0.009*"crimin" + 0.008*"european" + 0.008*"assang" +

In [39]:
processed_docs[4310]

['delhi',
 'qualiti',
 'improv',
 'day',
 'formula',
 'pollut',
 'level',
 'drop',
 'sharpli']

In [40]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8199676871299744	 
Topic: 0.045*"world" + 0.022*"year" + 0.018*"time" + 0.016*"india" + 0.013*"minist" + 0.013*"million" + 0.012*"food" + 0.011*"bank" + 0.011*"peopl" + 0.011*"get"

Score: 0.020009731873869896	 
Topic: 0.068*"china" + 0.022*"world" + 0.019*"parti" + 0.012*"govern" + 0.012*"worker" + 0.012*"iceland" + 0.011*"polit" + 0.010*"say" + 0.009*"chines" + 0.008*"burn"

Score: 0.020006822422146797	 
Topic: 0.020*"germani" + 0.015*"dead" + 0.015*"german" + 0.013*"abus" + 0.011*"rise" + 0.011*"earthquak" + 0.011*"turn" + 0.011*"look" + 0.010*"near" + 0.010*"photo"

Score: 0.020003536716103554	 
Topic: 0.027*"drug" + 0.019*"muslim" + 0.019*"mexico" + 0.014*"franc" + 0.013*"school" + 0.013*"court" + 0.013*"children" + 0.012*"say" + 0.011*"legal" + 0.011*"islam"

Score: 0.02000279910862446	 
Topic: 0.040*"year" + 0.031*"death" + 0.020*"prison" + 0.019*"kill" + 0.019*"iraq" + 0.017*"elect" + 0.016*"woman" + 0.015*"jail" + 0.015*"women" + 0.015*"murder"

Score: 0.020002093166

In [41]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8199640512466431	 
Topic: 0.013*"gaza" + 0.012*"china" + 0.010*"water" + 0.009*"world" + 0.009*"school" + 0.008*"billion" + 0.008*"israel" + 0.008*"die" + 0.008*"million" + 0.008*"popul"

Score: 0.02000817097723484	 
Topic: 0.026*"kill" + 0.016*"attack" + 0.014*"death" + 0.014*"korea" + 0.013*"pakistan" + 0.012*"north" + 0.012*"bomb" + 0.011*"year" + 0.009*"prison" + 0.009*"shoot"

Score: 0.02000444382429123	 
Topic: 0.010*"china" + 0.009*"news" + 0.009*"ahmadinejad" + 0.009*"dead" + 0.009*"ship" + 0.008*"protest" + 0.008*"think" + 0.008*"train" + 0.008*"internet" + 0.007*"egypt"

Score: 0.020004086196422577	 
Topic: 0.018*"drug" + 0.013*"mexico" + 0.011*"price" + 0.011*"india" + 0.008*"earthquak" + 0.008*"pope" + 0.007*"food" + 0.007*"miss" + 0.007*"world" + 0.007*"china"

Score: 0.020003946498036385	 
Topic: 0.011*"saudi" + 0.011*"world" + 0.010*"abus" + 0.010*"women" + 0.010*"minist" + 0.009*"child" + 0.008*"face" + 0.008*"prime" + 0.008*"year" + 0.007*"church"

Score: 0.0

### Test with unknown example

In [42]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.41715794801712036	 Topic: 0.072*"israel" + 0.050*"iran" + 0.035*"gaza" + 0.030*"protest" + 0.030*"isra"
Score: 0.38275665044784546	 Topic: 0.032*"polic" + 0.019*"secret" + 0.016*"forc" + 0.015*"govern" + 0.014*"chines"
Score: 0.02502170018851757	 Topic: 0.045*"world" + 0.022*"year" + 0.018*"time" + 0.016*"india" + 0.013*"minist"
Score: 0.025017810985445976	 Topic: 0.044*"north" + 0.043*"korea" + 0.040*"south" + 0.026*"unit" + 0.025*"nation"
Score: 0.025016380473971367	 Topic: 0.020*"germani" + 0.015*"dead" + 0.015*"german" + 0.013*"abus" + 0.011*"rise"
Score: 0.025013556703925133	 Topic: 0.068*"china" + 0.022*"world" + 0.019*"parti" + 0.012*"govern" + 0.012*"worker"
Score: 0.025006653741002083	 Topic: 0.036*"russian" + 0.035*"russia" + 0.025*"troop" + 0.019*"afghanistan" + 0.018*"syria"
Score: 0.025005050003528595	 Topic: 0.027*"drug" + 0.019*"muslim" + 0.019*"mexico" + 0.014*"franc" + 0.013*"school"
Score: 0.025002136826515198	 Topic: 0.040*"year" + 0.031*"death" + 0.020*"pri