# packages & set options

In [32]:
import numpy as np
import pandas as pd

import psycopg2
import pandas.io.sql as sqlio

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS

# Load English tokenizer
nlp = spacy.load("en_core_web_sm")

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

pd.set_option('max_rows', 200)
pd.set_option('max_colwidth', 150)

# db connection, read data for each Whole Histories question

In [33]:
conn = psycopg2.connect(dbname='quasar_prod_warehouse', user='kkennovin', password='eBC$O55$Lcko', host='quasar-prod.c9ajz690mens.us-east-1.rds.amazonaws.com', port='5432')

In [34]:

sql_q1 = """select 
	northstar_id
,	text 
from 
	posts 
where 
	campaign_id = '9115' and 
	action_id = 1126
    """

sql_q2 = """select 
	northstar_id
,	text 
from 
	posts 
where 
	campaign_id = '9115' and 
	action_id = 1127
    """

sql_q3 = """select 
	northstar_id
,	text 
from 
	posts 
where 
	campaign_id = '9115' and 
	action_id = 1128
    """

In [35]:
data_q1 = sqlio.read_sql_query(sql_q1, conn)
data_q2 = sqlio.read_sql_query(sql_q2, conn)
data_q3 = sqlio.read_sql_query(sql_q3, conn)

In [36]:
conn = None

# data cleaning: make lowercase & remove punctuation

In [37]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove punctuation'''
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [38]:
# Let's take a look at the updated text
data_clean_q1 = pd.DataFrame(data_q1.text.apply(round1))
#data_clean_q1.sample(50)
data_clean_q2 = pd.DataFrame(data_q2.text.apply(round1))
#data_clean_q2.sample(50)
data_clean_q3 = pd.DataFrame(data_q3.text.apply(round1))
#data_clean_q3.sample(50)

# looks good! I don't see any other obvious cleaning needs

# count vectorizer & document term matrix

In [22]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words for each question
from sklearn.feature_extraction.text import CountVectorizer

cv_q1 = CountVectorizer(stop_words='english')
data_cv_q1 = cv_q1.fit_transform(data_clean_q1.text)
dtm_q1 = pd.DataFrame(data_cv_q1.toarray(), columns=cv_q1.get_feature_names())
dtm_q1.index = data_clean_q1.index

cv_q2 = CountVectorizer(stop_words='english')
data_cv_q2 = cv_q2.fit_transform(data_clean_q2.text)
dtm_q2 = pd.DataFrame(data_cv_q2.toarray(), columns=cv_q2.get_feature_names())
dtm_q2.index = data_clean_q2.index

cv_q3 = CountVectorizer(stop_words='english')
data_cv_q3 = cv_q3.fit_transform(data_clean_q3.text)
dtm_q3 = pd.DataFrame(data_cv_q3.toarray(), columns=cv_q3.get_feature_names())
dtm_q3.index = data_clean_q3.index


# add lemmatizer & remove stopwords to nlp pipeline

In [39]:
def lemmatizer(response):
    # This takes in a doc of tokens from the nlp NER and lemmatizes them. 
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    response = [token.lemma_ for token in response if token.lemma_ != '-PRON-']
    response = u' '.join(response)
    return nlp.make_doc(response)
    

In [40]:
def remove_stopwords(response):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    response = [token.text for token in response if token.is_stop != True and token.is_punct != True]
    return response


In [41]:
# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)


In [42]:
# Filter clean data to just text column
data_clean_text_q1 = data_clean_q1['text']
data_clean_text_q2 = data_clean_q2['text']
data_clean_text_q3 = data_clean_q3['text']

# model Q1

In [17]:
word_list_q1 = []
# Iterates through each response in the corpus.
for response in tqdm(data_clean_text_q1):
    # Passes that response through the pipeline and adds result to a new list.
    pr = nlp(response)
    word_list_q1.append(pr)

In [19]:
# mapping of word IDs to words.
words_q1 = corpora.Dictionary(word_list_q1)

# Turns each document into a bag of words.
corpus_q1 = [words_q1.doc2bow(response) for response in word_list_q1]

In [20]:
# run latent dirichlet allocation model
lda_model_q1 = gensim.models.ldamodel.LdaModel(corpus=corpus_q1,
                                           id2word=words_q1,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [21]:
# view topic results
pprint(lda_model_q1.print_topics(num_topics=-1, num_words=10))

[(0,
  '0.050*"learn" + 0.048*"black" + 0.036*"social" + 0.035*"medium" + '
  '0.033*"event" + 0.024*"massacre" + 0.024*"tulsa" + 0.022*"park" + '
  '0.021*"rosa" + 0.015*"wish"'),
 (1,
  '0.107*"right" + 0.105*"civil" + 0.101*"movement" + 0.046*"x" + '
  '0.042*"learn" + 0.034*"malcolm" + 0.026*"war" + 0.023*"figure" + '
  '0.016*"fight" + 0.013*"malcom"'),
 (2,
  '0.035*"slave" + 0.034*"columbus" + 0.032*"native" + 0.029*"america" + '
  '0.022*"christopher" + 0.019*"free" + 0.019*"indigenous" + 0.019*"land" + '
  '0.019*"harriet" + 0.017*"tubman"'),
 (3,
  '0.085*"black" + 0.024*"panther" + 0.019*"learn" + 0.016*"watch" + '
  '0.015*"month" + 0.015*"party" + 0.014*"documentary" + 0.013*"community" + '
  '0.011*"movie" + 0.010*"tuskegee"'),
 (4,
  '0.057*"racism" + 0.035*"learn" + 0.018*"race" + 0.018*"wish" + '
  '0.015*"people" + 0.014*"today" + 0.013*"racial" + 0.013*"issue" + '
  '0.012*"social" + 0.010*"medium"'),
 (5,
  '0.054*"woman" + 0.024*"role" + 0.020*"african" + 0.020*"am

# model Q2

In [24]:
word_list_q2 = []
# Iterates through each response in the corpus.
for response in tqdm(data_clean_text_q2):
    # Passes that response through the pipeline and adds result to a new list.
    pr = nlp(response)
    word_list_q2.append(pr)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=19133.0), HTML(value='')))




In [47]:
# mapping of word IDs to words.
words_q2 = corpora.Dictionary(word_list_q2)

# Turns each document into a bag of words.
corpus_q2 = [words_q2.doc2bow(response) for response in word_list_q2]

In [48]:
# run latent dirichlet allocation model
lda_model_q2 = gensim.models.ldamodel.LdaModel(corpus=corpus_q2,
                                           id2word=words_q2,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [49]:
# view topic results
pprint(lda_model_q2.print_topics(num_topics=-1, num_words=10))

[(0,
  '0.056*"racism" + 0.055*"teach" + 0.049*"school" + 0.042*"history" + '
  '0.037*"good" + 0.036*"job" + 0.035*"teacher" + 0.029*"race" + '
  '0.024*"people" + 0.021*"way"'),
 (1,
  '0.046*"teacher" + 0.032*"class" + 0.023*"student" + 0.016*"book" + '
  '0.016*"read" + 0.015*"event" + 0.015*"topic" + 0.014*"help" + '
  '0.013*"racism" + 0.013*"understand"'),
 (2,
  '0.042*"united" + 0.042*"states" + 0.038*"course" + 0.032*"college" + '
  '0.024*"professor" + 0.014*"diversity" + 0.014*"ap" + 0.013*"american" + '
  '0.012*"class" + 0.012*"education"'),
 (3,
  '0.024*"holocaust" + 0.017*"enjoy" + 0.014*"field" + 0.013*"informative" + '
  '0.012*"trip" + 0.012*"chapter" + 0.011*"frederick" + 0.010*"museum" + '
  '0.010*"douglass" + 0.009*"speaker"'),
 (4,
  '0.047*"study" + 0.028*"woman" + 0.018*"black" + 0.017*"club" + '
  '0.016*"program" + 0.010*"social" + 0.010*"offer" + 0.010*"man" + '
  '0.010*"american" + 0.010*"class"'),
 (5,
  '0.113*"civil" + 0.112*"right" + 0.084*"movement"

# model Q3

In [50]:
word_list_q3 = []
# Iterates through each response in the corpus.
for response in tqdm(data_clean_text_q3):
    # Passes that response through the pipeline and adds result to a new list.
    pr = nlp(response)
    word_list_q3.append(pr)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=19133.0), HTML(value='')))




In [51]:
# mapping of word IDs to words.
words_q3 = corpora.Dictionary(word_list_q3)

# Turns each document into a bag of words.
corpus_q3 = [words_q3.doc2bow(response) for response in word_list_q3]

In [52]:
# run latent dirichlet allocation model
lda_model_q3 = gensim.models.ldamodel.LdaModel(corpus=corpus_q3,
                                           id2word=words_q3,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [53]:
# view topic results
pprint(lda_model_q3.print_topics(num_topics=-1, num_words=10))

[(0,
  '0.044*"  " + 0.026*"story" + 0.017*"hear" + 0.011*"documentary" + 0.010*"\n'
  ' " + 0.009*"accountable" + 0.009*"latinos" + 0.009*"hold" + '
  '0.008*"differently" + 0.007*"factual"'),
 (1,
  '0.095*"right" + 0.075*"civil" + 0.064*"movement" + 0.054*"figure" + '
  '0.033*"black" + 0.032*"war" + 0.026*"historical" + 0.020*"slavery" + '
  '0.011*"fight" + 0.011*"leader"'),
 (2,
  '0.058*"history" + 0.042*"truth" + 0.038*"tell" + 0.026*"textbook" + '
  '0.021*"story" + 0.020*"need" + 0.018*"want" + 0.015*"happen" + 0.015*"know" '
  '+ 0.015*"stop"'),
 (3,
  '0.033*"slave" + 0.032*"columbus" + 0.021*"christopher" + 0.014*"grade" + '
  '0.012*"native" + 0.011*"trade" + 0.010*"perfect" + 0.009*"genocide" + '
  '0.008*"century" + 0.008*"middle"'),
 (4,
  '0.088*"people" + 0.076*"white" + 0.046*"color" + 0.042*"book" + '
  '0.035*"black" + 0.028*"history" + 0.021*"perspective" + 0.021*"write" + '
  '0.019*"textbook" + 0.015*"read"'),
 (5,
  '0.056*"racism" + 0.039*"like" + 0.035*"scho

In [56]:
ent_list_q3 = []
# Iterates through each response in the corpus.
for response in tqdm(data_clean_text_q3):
    # Passes that response through the pipeline and adds result to a new list.
    pr = nlp(response)
    for ent in pr.ents:
      word_list_q3.append(ent.text, ent.label_)

# for ent in corpus_q3.ents:
#     print(ent.text, ent.label_)

# this section doesn't work because of pipes added to original nlp

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=19133.0), HTML(value='')))

AttributeError: ignored

In [None]:
# how to refine and improve:

# medium or large english tokenizer (nlp = spacy.load("en_core_web_md") or nlp = spacy.load("en_core_web_lg"))
# td-idf
# n-grams
# more or less topics
# more passes in model
# identifying named entities