In [1]:
import pandas as pd
import gensim.corpora as corpora
import spacy
import gensim

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv("../abortionNews.csv")

In [4]:
import re
df['content'] = df['content'].map(lambda x: re.sub('[,\.!?—]', '', x))
df['content'] = df['content'].map(lambda x: re.sub('\s+', ' ', x))
df['content'] = df['content'].map(lambda x: x.lower())

In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
data_words = list(sent_to_words(df['content']))

In [6]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

def remove_stopwords(texts):
    return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [9]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
data_words_nostops = remove_stopwords(data_words)

data_words_bigrams = make_bigrams(data_words_nostops)

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])



In [11]:
id2word = corpora.Dictionary(data_lemmatized)

texts = data_lemmatized

corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
from gensim.models import CoherenceModel
max_number_of_topics=20
max_coherence = {"num": 0, "coherence_score": 0, "model": None}
for topic_number in range(1, max_number_of_topics):
    print(topic_number)
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=topic_number)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    if(max_coherence["coherence_score"] < coherence_lda):
        max_coherence["coherence_score"] = coherence_lda
        max_coherence["num"] = topic_number
        max_coherence["model"] = lda_model

print(max_coherence)

1


In [None]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=16)

In [None]:
# number of topics
lda_model = max_coherence["model"]

In [None]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

In [35]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [37]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=df['content'])

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic

  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,13,0.9952,"trump, say, make, woman, abortion, go, people,...",donald j trump arrived at 1 world trade cente...
1,1,12,0.4947,"say, trump, woman, go, people, abortion, make,...",beijing a few months after lu qiumei gave birt...
2,2,13,0.9012,"trump, say, make, woman, abortion, go, people,...",good morning here’s what you need to know: • t...
3,3,7,0.3733,"say, abortion, woman, trump, state, make, peop...",dublin an assembly of irish citizens convened ...
4,4,9,0.3251,"say, trump, woman, abortion, people, year, sup...",many thousands of women are expected to conver...
...,...,...,...,...,...
3134,3134,12,0.4024,"say, trump, woman, go, people, abortion, make,...",john glenn who captured the nation’s attentio...
3135,3135,7,0.9960,"say, abortion, woman, trump, state, make, peop...",ohio gov john kasich on tuesday vetoed a cont...
3136,3136,12,0.8310,"say, trump, woman, go, people, abortion, make,...",warsaw the law and justice party rode to powe...
3137,3137,7,0.5267,"say, abortion, woman, trump, state, make, peop...",we’re now a month away from the inauguration ...


In [38]:
lda_model.print_topics()

[(0,
  '0.018*"say" + 0.012*"abortion" + 0.012*"woman" + 0.007*"trump" + 0.006*"law" + 0.005*"get" + 0.005*"right" + 0.005*"go" + 0.005*"court" + 0.004*"people"'),
 (1,
  '0.019*"say" + 0.016*"trump" + 0.007*"woman" + 0.007*"people" + 0.006*"make" + 0.004*"go" + 0.004*"abortion" + 0.004*"state" + 0.004*"right" + 0.004*"life"'),
 (2,
  '0.017*"say" + 0.012*"abortion" + 0.007*"woman" + 0.007*"trump" + 0.006*"people" + 0.005*"go" + 0.005*"also" + 0.005*"make" + 0.005*"year" + 0.005*"state"'),
 (3,
  '0.013*"say" + 0.012*"trump" + 0.006*"people" + 0.005*"make" + 0.005*"get" + 0.005*"go" + 0.005*"state" + 0.005*"even" + 0.005*"woman" + 0.004*"know"'),
 (4,
  '0.015*"say" + 0.011*"trump" + 0.008*"woman" + 0.007*"abortion" + 0.006*"go" + 0.006*"make" + 0.005*"people" + 0.005*"get" + 0.005*"want" + 0.004*"take"'),
 (5,
  '0.017*"say" + 0.011*"trump" + 0.011*"abortion" + 0.007*"woman" + 0.005*"get" + 0.005*"go" + 0.005*"right" + 0.005*"people" + 0.004*"year" + 0.004*"law"'),
 (6,
  '0.023*"say"

In [40]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.9988,"say, abortion, woman, trump, law, get, right, ...",i want to receive updates from partners and s...
1,1,0.9971,"say, trump, woman, people, make, go, abortion,...",organizers for the upcoming 2016 tribeca film...
2,2,0.9983,"say, abortion, woman, trump, people, go, also,...",jones beach state park in new york becomes ho...
3,3,0.9934,"say, trump, people, make, get, go, state, even...",comedian margaret cho angered an audience in n...
4,4,0.9973,"say, trump, woman, abortion, go, make, people,...",the reports of punk rock’s resurrection are gr...
5,5,0.9976,"say, trump, abortion, woman, get, go, right, p...",last week reverend dennis h holtschneider c m ...
6,6,0.9976,"say, trump, woman, go, abortion, take, think, ...",a newly released video from center for medical...
7,7,0.9986,"say, abortion, woman, trump, state, make, peop...",days after the women’s march on washington th...
8,8,0.998,"say, trump, woman, people, abortion, time, als...",the craziest thing about the “pizzagate” story...
9,9,0.9981,"say, trump, woman, abortion, people, year, sup...",surveying some of the sweeping social changes ...


In [46]:
sent_topics_sorteddf_mallet["Text"][7]

' days after the women’s march on washington the trump administration and republicans in congress launched attacks on public funding for women’s health care on monday president trump reinstated the global gag rule which strips all us funding from foreign aid groups that counsel patients on abortion rather than decrease abortion rates the move is likely to increase them devastate the global health system in the process and leave millions of women across the globe without access to either safe abortion or contraception and on tuesday afternoon house republicans voted to permanently ban us women too from receiving any federal financial assistance for abortion whether they are insured through public programs like medicaid or if they purchase private health insurance on the affordable care act exchanges the bill would codify existing restrictions that make abortion harder for poor women to get the text of the bill hr 7 introduced by rep christopher smith ( ) states that its purpose is to “p

In [48]:
df_dominant_topic[df_dominant_topic["Dominant_Topic"] == 7]

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
3,3,7,0.3733,"say, abortion, woman, trump, state, make, peop...",dublin an assembly of irish citizens convened ...
39,39,7,0.4093,"say, abortion, woman, trump, state, make, peop...",when republicans in kentucky seized total cont...
52,52,7,0.3287,"say, abortion, woman, trump, state, make, peop...",the white house concerned about the possible p...
64,64,7,0.9162,"say, abortion, woman, trump, state, make, peop...",one of the biggest american public health vict...
65,65,7,0.8560,"say, abortion, woman, trump, state, make, peop...",a doctor who performs abortions at a hospital ...
...,...,...,...,...,...
3067,3067,7,0.3652,"say, abortion, woman, trump, state, make, peop...",sen timothy m kaine ( ) is one of a few peopl...
3125,3125,7,0.7434,"say, abortion, woman, trump, state, make, peop...",while it’s pretty much a given that the affor...
3133,3133,7,0.8134,"say, abortion, woman, trump, state, make, peop...",ohio lawmakers passed a bill late tuesday tha...
3135,3135,7,0.9960,"say, abortion, woman, trump, state, make, peop...",ohio gov john kasich on tuesday vetoed a cont...


In [50]:
df_dominant_topic["Text"][3]

