In [110]:
import gensim
import json
import operator
import collections

def sort_dict(dictionary, by_value=False, reverse=False):
    index = 1 if by_value else 0
    sorted_list = sorted(dictionary.items(), key=lambda kv: kv[index], reverse=reverse)
    sorted_dict = collections.OrderedDict(sorted_list)
    return sorted_dict

import string
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from urlextract import URLExtract
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
extractor = URLExtract()

def clean_text(text, lemmatize=False, stem=False):
    # lower case
    text = text.lower()

    # removes URLs
    urls = set(extractor.find_urls(text))
    text = ' '.join([t for t in text.split(' ') if t not in urls])

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # remove punctuations
    for p in "=$“”%.,!?:;\"'_-~|&[]#*()’<>/\\":
        text = text.replace(p,' ')

    # remove '\n', '\t'
    text = text.replace('\n', '')
    text = text.replace('\t', '')
    text = ' '.join(text.split())
    text_tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))

    tokens = [i for i in text_tokens if i not in stop_words]

    # lemmatize tokens
    if lemmatize:
        tokens = [lemmatizer.lemmatize(i) for i in tokens]

    # stemming tokens
    if stem:
        tokens = [stemmer.stem(i) for i in tokens]

    tokens = [i for i in tokens if i not in stop_words]
    return text, tokens

def pland_clean(tokens):
    custom = ["www", "http", "le", "https", "com", "said", "would", "people", "u", "like", "r"]
    custom += ["thing", "think", "one", "know", "say", "well", "deleted", "really", "reddit"]
    custom += ["comment", "please", "yes", "going", "get", "yeah", "read", "link", "also", "could"]
    custom += ["getting", "got", "ok", "lol", "exactly", "oh", "gon", "na", "want", "make", "take", "removed"]
    custom += ["someone", "anything", "someone", "im", "many", "even", "much", "anyone", "way", "go"]
    custom += ["saying","something","anywhere", "actually", "guy", "kid", "point", "see", "country"]
    custom += ["talking", "nothing", "year", "let", "every", "any", "mean", "keep", "never", "meeting"]
    custom += ["maybe", "news", "lot", "en"]
    custom = set(custom)
    tokens = [i for i in tokens if i not in custom]
    return tokens

def first(sorted_dict):
    return next(iter(sorted_dict))

In [111]:
with open('./data/final_indico.json') as pland_file:
    pland = json.load(pland_file)

post_replies = list()
comment_queue = list()
# comment_queue.append('t3_87vnah')
for post in pland['posts']:
    comment_queue.append(post['fullname'])
while comment_queue:
    comment_fullname = comment_queue.pop(0)
    post_replies.append(comment_fullname)
    
    for reply_fn in pland['comments'][comment_fullname]['replies']:
        comment_queue.append(reply_fn)

In [112]:
comp_tokens = list()
for fn in post_replies:
    comp = pland['comments'][fn]['body']
    cleaned, tokens = clean_text(comp, lemmatize=True, stem=False)
    tokens = pland_clean(tokens)
    comp_tokens.append(tokens)

In [113]:
# generate dictionary and corpus
from gensim import corpora
# dictionary = corpora.Dictionary(comp_tokens)
# corpus = [dictionary.doc2bow(token) for token in comp_tokens]

# # remove tokens that only appeared once and more than 
# updated_comp_tokens = list()
# lower = 1
# upper = 2800
# min_token_len = 10
# outliers = set([dictionary[k] for k in dictionary.dfs if dictionary.dfs[k] <= lower or dictionary.dfs[k] >= upper])
# for tokens in comp_tokens:
#     if len(tokens) < min_token_len:
#         continue
#     tokens = [token for token in tokens if token not in outliers]
#     updated_comp_tokens.append(tokens)

# dictionary = corpora.Dictionary(updated_comp_tokens)
# corpus = [dictionary.doc2bow(token) for token in updated_comp_tokens]

import pickle
# pickle.dump(corpus, open('corpus.pkl', 'wb'))
# dictionary.save('dictionary.gensim')

In [114]:
lda = gensim.models.ldamodel.LdaModel.load('model8_0.gensim')

In [115]:
topics = lda.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.018*"amendment" + 0.009*"state" + 0.009*"second" + 0.008*"militia" + 0.007*"government" + 0.007*"constitution"')
(1, '0.015*"nra" + 0.007*"control" + 0.007*"democrat" + 0.006*"owner" + 0.006*"support" + 0.006*"republican"')
(2, '0.010*"police" + 0.006*"fbi" + 0.005*"good" + 0.005*"job" + 0.005*"person" + 0.005*"stop"')
(3, '0.014*"mental" + 0.011*"problem" + 0.011*"health" + 0.008*"issue" + 0.008*"mass" + 0.007*"still"')
(4, '0.013*"government" + 0.007*"free" + 0.005*"life" + 0.005*"problem" + 0.005*"freedom" + 0.004*"society"')
(5, '0.013*"firearm" + 0.009*"check" + 0.008*"crime" + 0.007*"background" + 0.007*"source" + 0.007*"violence"')
(6, '0.027*"rifle" + 0.023*"weapon" + 0.019*"ar" + 0.016*"assault" + 0.013*"semi" + 0.012*"automatic"')
(7, '0.018*"trump" + 0.008*"shit" + 0.007*"woman" + 0.005*"fucking" + 0.005*"post" + 0.005*"bad"')


In [136]:
len(dictionary)

18859

In [139]:
# Generate the graph
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
# import pyLDAvis.gensim
# lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
# pyLDAvis.display(lda_display)

In [116]:
def get_topic_no(fullname):
    new_doc = pland['comments'][fullname]['body']
    cleaned, tokens = clean_text(new_doc, lemmatize=True, stem=False)
    tokens = pland_clean(tokens)
    bow = dictionary.doc2bow(tokens)
    bow_dict = dict()
    for key, value in lda.get_document_topics(bow):
        bow_dict[key] = value
    return first(sort_dict(bow_dict, by_value=True, reverse=True))

In [142]:
topic_dict = dict()
NUM_TOPICS = 8
for i in range(0, NUM_TOPICS):
    topic_dict[i] = set()
    
for fullname in post_replies:
    try:
        topic_no = get_topic_no(fullname)
    except:
        topic_no = -1
    topic_dict[topic_no].add(fullname)

In [140]:
new_doc = pland['comments']['t1_dubovyu']['body']
cleaned, tokens = clean_text(new_doc, lemmatize=True, stem=False)
tokens = pland_clean(tokens)
bow = dictionary.doc2bow(tokens)
bow_dict = dict()
for key, value in lda.get_document_topics(bow):
    bow_dict[key] = value

In [157]:
fullname_topic = dict()
for i in range(0, NUM_TOPICS):
    for fullname in topic_dict[i]:
        fullname_topic[fullname] = i

In [188]:
matrix_count = [[0] * NUM_TOPICS for i in range(NUM_TOPICS)]

for i in range(NUM_TOPICS):
    for fullname in topic_dict[i]:
        children = pland['comments'][fullname]['replies']
        if len(children) == 0:
            continue
        for child in children:
            child_no = fullname_topic[child]
            matrix_count[i][child_no] += 1

In [189]:
import pandas
topic_df = pandas.DataFrame(matrix_count, columns=["0","1","2","3","4","5","6","7"])
topic_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,2862,1548,1262,941,716,1325,673,670
1,1463,2755,1107,854,608,867,407,727
2,904,1002,2023,862,560,682,467,519
3,816,896,889,1513,476,642,308,403
4,540,500,496,379,645,340,193,260
5,1137,767,769,620,398,2150,453,316
6,567,370,457,294,215,512,1111,188
7,624,862,813,610,545,329,221,1053


In [195]:
matrix_perc = [i[:] for i in matrix_count]
for i in range(NUM_TOPICS):
    matrix_perc[i] = [k/sum(matrix_count[i]) for k in matrix_perc[i]]
topic_perc_df = pandas.DataFrame(matrix_perc, columns=["0","1","2","3","4","5","6","7"])

In [215]:
import plotly.offline as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)

trace = go.Heatmap(z=matrix_perc,
                   x=["0","1","2","3","4","5","6","7"],
                   y=["0","1","2","3","4","5","6","7"])
data=[trace]
py.iplot(data, filename='labelled-heatmap')

In [217]:
# Generate the graph
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





#### 0: 2nd Amendment
amendment, state, second, militia, government, constitution

#### 1: NRA
nra, control, democrat, owner, support, republican

#### 2: Police
police, fbi, goiod, job, person, stop

#### 3: Mental Profile
mental, problem, health, issue, mass, still

#### 4: Freedom
government, free, life, problem, freedom, society

#### 5: Suggestion
firearm, check, crime, background, source, violence

#### 6: Weapon
rifle, weapon, ar, assault, semi, automatic

#### 7: Rant(Negative)
trump, shit, woman, fucking, post, bad

Bright's new stuff

In [225]:
def get_topics_nos(fullname):
    new_doc = pland['comments'][fullname]['body']
    cleaned, tokens = clean_text(new_doc, lemmatize=True, stem=False)
    tokens = pland_clean(tokens)
    bow = dictionary.doc2bow(tokens)
    bow_dict = dict()
    for key, value in lda.get_document_topics(bow):
        bow_dict[key] = value
    return sort_dict(bow_dict, by_value=True, reverse=True)

In [283]:
topic_dict[1]

{'t1_dwgiwhy',
 't1_dwg4105',
 't1_dubskl9',
 't1_e9lu6pl',
 't1_dwhpnv2',
 't1_dwg4lrt',
 't1_e9l8onl',
 't1_dutxrjk',
 't1_dwgigky',
 't1_duaew6r',
 't1_dwgbrio',
 't1_dutsxqo',
 't1_e9lbijr',
 't1_dwhwo6l',
 't1_dwh5z2u',
 't1_e6tvxir',
 't1_e6tzfur',
 't1_du9rzkz',
 't1_dwghcxv',
 't1_dwg7yqo',
 't1_dwi6dqp',
 't1_dwgiqy9',
 't1_dwgjv0g',
 't1_e9lw4pz',
 't1_dwgi0dg',
 't1_dwge6sy',
 't1_dubl1pc',
 't1_e9kpidx',
 't1_du9q10w',
 't1_duwaj0d',
 't1_dutp0ar',
 't1_du9mo5h',
 't1_dutsb6y',
 't1_dwii74s',
 't1_e6u2dp5',
 't1_e6u0lpw',
 't1_dwg8661',
 't1_e9kq3yy',
 't1_du9yo0w',
 't1_duc0oji',
 't1_due7ni0',
 't1_dwhefkx',
 't1_e9lhzeg',
 't1_du9mijn',
 't1_e9kupjt',
 't1_dwgweup',
 't1_dwgb5k7',
 't1_e9l220m',
 't1_dwgg7jx',
 't1_dubr1nd',
 't1_dwghet3',
 't1_dutyskc',
 't1_duabg1a',
 't1_dwkpwf1',
 't1_du9zz93',
 't1_dwguluf',
 't1_dwgdy68',
 't1_dwggvas',
 't1_du9ztge',
 't1_dubsuf1',
 't1_dwgupsm',
 't1_e6tx4r4',
 't1_dwg61ep',
 't1_dubgt8k',
 't1_dvuuovl',
 't1_duak117',
 't1_dwi74

In [289]:
get_topics_nos('t1_dutp0ar')

OrderedDict([(1, 0.8538529),
             (2, 0.020922061),
             (5, 0.020894317),
             (6, 0.02088418),
             (3, 0.020865364),
             (0, 0.020864611),
             (4, 0.020863859),
             (7, 0.02085266)])

In [299]:
pland['comments']['t1_dz3qzdp']['body']

'deleted'

In [290]:
topic_0_example_id = 't1_dzblfco'

In [316]:
topic_0_sentiment = [round(pland['comments'][fname]['sentiment'], 2) for fname in topic_dict[0] if 'sentiment' in pland['comments'][fname]]

In [323]:
topic_0_sentiment_freq = [topic_0_sentiment.count(i/100.0) for i in range(0, 101)]

In [372]:
sentiment_heatmap = list()
for i in range(0, 8):
    topic_sentiment = [round(pland['comments'][fname]['sentiment'], 1) for fname in topic_dict[i] if 'sentiment' in pland['comments'][fname]]
    topic_sentiment_freq = [topic_sentiment.count(i/10.0) for i in range(0, 11)]
#     sentiment_heatmap.append(topic_sentiment_freq)
    curr_sum = sum(topic_sentiment_freq)
    sentiment_heatmap.append([i/curr_sum for i in topic_sentiment_freq])

In [373]:
sentiment_trace = go.Heatmap(z=sentiment_heatmap, name="heatmap", colorscale='Blues')
sentiment_data=[sentiment_trace]

sent_layout = go.Layout(
    title='Topic vs. Sentiment',
)

sent_fig = go.Figure(data=sentiment_data, layout=sent_layout)

py.iplot(sent_fig, filename='labelled-heatmap')