In [151]:
import json
import gensim

In [4]:
import string
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from urlextract import URLExtract
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
extractor = URLExtract()

def clean_text(text, lemmatize=False, stem=False):
    # lower case
    text = text.lower()
    
    # removes URLs
    urls = set(extractor.find_urls(text))
    text = ' '.join([t for t in text.split(' ') if t not in urls])
    
    # remove numbers
    text = re.sub(r'\d+', '', text)
    
    # remove punctuations
    for p in "=$“”%.,!?:;\"'_-~|&[]#*()’<>/\\":
        text = text.replace(p,' ')
    
    # remove '\n', '\t'
    text = text.replace('\n', '')
    text = text.replace('\t', '')
    text = ' '.join(text.split())
    text_tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    
    tokens = [i for i in text_tokens if i not in stop_words]
    
    # lemmatize tokens
    if lemmatize:
        tokens = [lemmatizer.lemmatize(i) for i in tokens]
        
    # stemming tokens
    if stem:
        tokens = [stemmer.stem(i) for i in tokens]
    
    tokens = [i for i in tokens if i not in stop_words]
    return text, tokens

def pland_clean(tokens):
    custom = ["www", "http", "le", "https", "com", "said", "would", "people", "u", "like", "r"]
    custom += ["thing", "think", "one", "know", "say", "well", "deleted", "really", "reddit"]
    custom += ["comment", "please", "yes", "going", "get", "yeah", "read", "link", "also", "could"]
    custom += ["getting", "got", "ok", "lol", "exactly", "oh", "gon", "na", "want", "make", "take", "removed"]
    custom += ["someone", "anything", "someone", "im", "many", "even", "much", "anyone", "way", "go"]
    custom += ["saying","something","anywhere", "actually", "guy", "kid", "point", "see", "country"]
    custom += ["talking", "nothing", "year", "let", "every", "any", "mean", "keep", "never", "meeting"]
    custom += ["maybe", "news", "lot", "en"]
    custom = set(custom)
    tokens = [i for i in tokens if i not in custom]
    return tokens

In [5]:
# loads json
with open('./data/parkland_ext.json') as pland_file:
    pland = json.load(pland_file)

In [309]:
# BFS out on a particular post
post_replies = list()
comment_queue = list()
# comment_queue.append('t3_87vnah')
for post in pland['posts']:
    comment_queue.append(post['fullname'])
while comment_queue:
    comment_fullname = comment_queue.pop(0)
    post_replies.append(comment_fullname)
    
    for reply_fn in pland['comments'][comment_fullname]['replies']:
        comment_queue.append(reply_fn)

In [311]:
comp_tokens = list()
for fn in post_replies:
    comp = pland['comments'][fn]['body']
    cleaned, tokens = clean_text(comp, lemmatize=True, stem=False)
    
    # just nouns
#     tokens = word_tokenize(cleaned)
#     tokens = [k[0] for k in nltk.pos_tag(tokens) if k[1] == 'NN' or k[1] == 'NNS']
#     stop_words = set(stopwords.words('english'))
#     tokens = [i for i in tokens if i not in stop_words]
#     tokens = [lemmatizer.lemmatize(i) for i in tokens]
    
    tokens = pland_clean(tokens)
    comp_tokens.append(tokens)

In [312]:
from gensim import corpora
dictionary = corpora.Dictionary(comp_tokens)
corpus = [dictionary.doc2bow(token) for token in comp_tokens]

# remove tokens that only appeared once and more than 
updated_comp_tokens = list()
lower = 1
upper = 2800
min_token_len = 10
outliers = set([dictionary[k] for k in dictionary.dfs if dictionary.dfs[k] <= lower or dictionary.dfs[k] >= upper])
for tokens in comp_tokens:
    if len(tokens) < min_token_len:
        continue
    tokens = [token for token in tokens if token not in outliers]
    updated_comp_tokens.append(tokens)

dictionary = corpora.Dictionary(updated_comp_tokens)
corpus = [dictionary.doc2bow(token) for token in updated_comp_tokens]

import pickle
# pickle.dump(corpus, open('corpus.pkl', 'wb'))
# dictionary.save('dictionary.gensim')

In [313]:
len(updated_comp_tokens)

25916

In [315]:
# Latent Dirchlet Allocation
NUM_TOPICS = 8
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=1)

In [320]:
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.019*"amendment" + 0.011*"militia" + 0.008*"constitution" + 0.008*"second" + 0.008*"regulated" + 0.006*"first"')
(1, '0.014*"police" + 0.010*"shooter" + 0.007*"cop" + 0.006*"job" + 0.005*"armed" + 0.005*"shot"')
(2, '0.031*"nra" + 0.018*"source" + 0.010*"wiki" + 0.008*"org" + 0.008*"member" + 0.007*"rule"')
(3, '0.013*"firearm" + 0.010*"check" + 0.010*"state" + 0.008*"background" + 0.007*"weapon" + 0.007*"violence"')
(4, '0.010*"control" + 0.010*"owner" + 0.009*"firearm" + 0.008*"crime" + 0.008*"issue" + 0.008*"buy"')
(5, '0.013*"trump" + 0.009*"party" + 0.008*"republican" + 0.008*"vote" + 0.007*"shit" + 0.006*"democrat"')
(6, '0.010*"mental" + 0.010*"crime" + 0.009*"rate" + 0.008*"court" + 0.008*"health" + 0.008*"violence"')
(7, '0.013*"weapon" + 0.011*"government" + 0.011*"rifle" + 0.008*"assault" + 0.008*"ban" + 0.007*"ar"')


In [305]:
# ldamodel.save('model8_0.gensim')

In [287]:
new_doc = "  doesnt make them a cult fairly certain i said they werent a cult  fairly certain  mental impairments included people who needed help balancing a check book or paying their bills online  that isnt enough justification to deprive people of their constitutional rights  they are recieving government assistance through social security because they have hard enough mental impairments that they cant do something like balance a checkbook or pay their bills online  these are menial tasks that you would expect any adult to be able to do so if you need government assistance to do this again maybe you shouldnt own a gun  the cdc was politicized in the s which lead to a ban on research with the goal of pushing gun control they werent politicized its just the research they were doing lead to a presumption that guns should be more regulated especially when it comes to people with mental illnesses operating them  you cant spin stopping people with mental illnesses from owning and operating guns as a bad thing  look at newtown you want that to happen again lanza had severe mental issues and while he himself didnt own guns his mother did  he had ready access to them   if the nra is standing up for rights and you stand against the nra then what does that make youdo you know what the no true scotsman argument is its when you appeal to purity or in this case constitutional authority as a way to dismiss valid criticism  the nra stands against reforming the a to stop mass shootings using the argument that the big gov wants to come and take away your guns  no ones saying that im not saying that that hasnt been said in this conversation  what people are saying however is that the original authors of the a could not have foreseen a country so armed and with weapons so powerful  the constitution is a living document and at times this requires rethinking certain aspects of it so that we as a country can better move forward  people want tighter regulations on owning guns so that people like adam lanza could not have ready access to them so that cruz could not have gone on a rampage that we saw coming miles away  we require a system that can foresee threats and stop them before they become another tragedy i e  the cdc researching gun violence and mental illness"
cleaned, tokens = clean_text(new_doc, lemmatize=True, stem=False)
tokens = pland_clean(tokens)
bow = dictionary.doc2bow(tokens)
bow_dict = dict()
for key, value in ldamodel.get_document_topics(bow):
    bow_dict[key] = value
sort_dict(bow_dict, by_value=True, reverse=True)

OrderedDict([(2, 0.56327313),
             (0, 0.13315608),
             (4, 0.112918414),
             (1, 0.08654252),
             (3, 0.070896775),
             (5, 0.0332131)])

In [321]:
ldamodel.num_terms 

17789

In [322]:
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')