In [56]:
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp
import pickle
import matplotlib.pyplot as plt

from data_exploration import process_articles
from sentiment_analysis import topic_values, article_sentiment

from bokeh.models import ColumnDataSource, OpenURL, TapTool, WheelZoomTool, HoverTool, LassoSelectTool, PanTool
from bokeh.plotting import figure, output_file, show
from bokeh.models.widgets import Panel, Tabs
from bokeh.embed import components


def make_plots(topic_dict, num_topics):
    """Build one Bokeh sentiment scatter plot per topic and collect the embed pieces.

    For every topic index in ``0..num_topics`` (inclusive) a figure is created in
    which each article is a circle at (positive score, negative score), sized by
    50x its topic probability and coloured by news source. Hovering shows the
    source, the scores and the headline; tapping a circle opens the article URL.

    Parameters
    ----------
    topic_dict : mapping
        ``topic_dict[topic]`` must provide parallel sequences under the keys
        ``'source'``, ``'pos'``, ``'neg'``, ``'topic_prob'``, ``'url'`` and
        ``'headline'``.
    num_topics : int
        Highest topic index to plot; indices ``0..num_topics`` are all plotted.

    Returns
    -------
    dict
        ``{topic: {'script': ..., 'div': ...}}`` holding the two values returned
        by ``bokeh.embed.components`` for that topic's figure.
    """
    components_dict = {topic: {'script': '', 'div': ''} for topic in range(num_topics+1)}

    a = 0.9
    # Fixed RGB palette. It is reused cyclically below so that a topic with more
    # distinct sources than palette entries still gets every source plotted
    # (zipping sources with the palette would silently drop the extras).
    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (0,100,0), (0, 0, 0), (100, 25, 200), (255, 255, 0), (0, 255, 255), (255, 0, 255), (128, 128, 128), (0, 0, 128), (240,230,140)]

    for topic in range(num_topics+1):
        print(topic)
        output_file("../web_app/bokeh_plots/topic"+str(topic)+".html")

        hover = HoverTool(
            tooltips=[
                ("source", "@site"),
                ("(pos,neg)", "(@pos, @neg)"),
                ("Headline", "@headline")
            ]
        )

        p = figure(plot_width=1200, plot_height=800,
                    tools=["tap, pan, wheel_zoom",hover], title="Topic: "+str(topic),
                  toolbar_location="right")

        # Convert each column once per topic instead of once per site.
        values = topic_dict[topic]
        site_labels = np.array(values['source'])
        pos_scores = np.array(values['pos'])
        neg_scores = np.array(values['neg'])
        topic_probs = np.array(values['topic_prob'])
        urls = np.array(values['url'])
        headlines = np.array(values['headline'])

        sources = np.unique(site_labels)
        for position, site in enumerate(sources):
            # Positions of the articles published by this site.
            indices = np.flatnonzero(site_labels == site)
            color = colors[position % len(colors)]

            source = ColumnDataSource(data=dict(
                pos=pos_scores[indices],
                neg=neg_scores[indices],
                # Marker size scales with how strongly the article belongs to the topic.
                size=[50*prob for prob in topic_probs[indices]],
                site=[site for _ in range(len(indices))],
                headline=headlines[indices],
                url=urls[indices]
            ))

            p.circle('pos', 'neg', color=color, alpha=a, size='size', source=source, legend=site)

        # Figure-level settings only need to be applied once per plot.
        p.xaxis.axis_label = "Positive Sentiment"
        p.yaxis.axis_label = "Negative Sentiment"

        # Clicking a circle opens the URL stored in that point's 'url' column.
        taptool = p.select(type=TapTool)
        taptool.callback = OpenURL(url="@url")

        script, div = components(p)
        components_dict[topic]['script'] = script
        components_dict[topic]['div'] = div

    return components_dict

In [2]:
# Load the scraped RSS articles and keep only the rows that have body text.
df = pd.read_csv('../data/rss_feeds_new_good.csv')
df = df[df['article_text'].notnull()]

# Restore the previously pickled LDA model; only unpickle files you produced yourself.
with open('../working_with_data/lda_model.pkl', 'rb') as f:
    lda_model = pickle.load(f)

# Number of topics in the model; drives the per-topic plot loop.
num_topics = lda_model.num_topics

In [3]:
# Build the per-topic dictionary that make_plots consumes, from the article
# frame and the loaded LDA model.
print('Making topics dictionary...')
topic_dict = topic_values(df, lda_model)

Making topics dictionary...
Processing Articles...
Getting Sentiment...
Getting article topics...
Creating Dictionary...


In [7]:
print('Making Plots...')
components_dict = make_plots(topic_dict, num_topics)

# Write through a context manager so the pickle file is flushed and closed
# deterministically instead of relying on garbage collection.
with open('../web_app/bokeh_plots/components_dict.pkl', 'wb') as f:
    pickle.dump(components_dict, f)

Making Plots...


In [36]:
from wordcloud import WordCloud
def make_clouds(topic_texts, lda_model):
    """Render and save one word-cloud PNG for the whole corpus and one per LDA topic.

    "Topic #0" is generated from the raw tokens of every document in
    ``topic_texts``; topics ``1..lda_model.num_topics`` are generated from the
    word probabilities the model reports for topic ``t-1``. Images are written to
    ``../web_app/static/img/wordclouds/wordcloud_topic<N>.png``.

    Parameters
    ----------
    topic_texts : iterable of iterable of str
        Tokenised documents; tokens are joined with spaces to form the corpus text.
    lda_model : object
        Must expose ``num_topics`` and
        ``show_topics(num_topics=..., num_words=..., formatted=False)`` returning,
        per topic, a pair whose second item is a sequence of ``(word, prob)`` pairs.

    Returns
    -------
    None
    """
    def _save_cloud(cloud, title, path):
        # Draw on a fresh figure and close it afterwards: otherwise every image
        # is stacked onto the same axes and none of them is ever released.
        fig = plt.figure()
        plt.imshow(cloud, interpolation="bilinear")
        plt.axis("off")
        plt.title(title)
        plt.savefig(path, dpi=300)
        plt.close(fig)

    corpus_text = ' '.join(' '.join(text) for text in topic_texts)
    _save_cloud(WordCloud(background_color="white", width=1200, height=800).generate(corpus_text),
                "Topic #0",
                '../web_app/static/img/wordclouds/wordcloud_topic0.png')

    # The topic/word table does not change between iterations, so fetch it once.
    lda_topics = lda_model.show_topics(num_topics=-1, num_words=100000,formatted=False)

    for t in range(1, lda_model.num_topics+1):
        print(t)
        # Plot number t corresponds to model topic t-1 (plot 0 is the whole corpus).
        topic_word_probs = {word_prob[0]: word_prob[1] for word_prob in lda_topics[t-1][1]}
        _save_cloud(WordCloud(background_color="white", width=1200, height=800).fit_words(topic_word_probs),
                    "Topic #" + str(t),
                    '../web_app/static/img/wordclouds/wordcloud_topic'+str(t)+'.png')

In [11]:
# Tokenise the corpus; process_articles returns two values, of which
# topic_texts is what make_clouds takes as its first argument.
print('Processing Articles...')
topic_texts, sentiment_texts = process_articles(df)

Processing Articles...
Making WordClouds...


NameError: name 'WordCloud' is not defined

In [37]:
# Render and save the word-cloud images; make_clouds prints each topic number as it goes.
print('Making WordClouds...')
make_clouds(topic_texts, lda_model)

Making WordClouds...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40


In [40]:
from gensim.summarization import summarize
from gensim.summarization import keywords
from newspaper import Article
import datetime

In [41]:
def get_article(url, max_attempts=10):
    """Download and parse a news article, retrying the download a few times.

    Parameters
    ----------
    url : str
        Address of the article to fetch.
    max_attempts : int, optional
        Maximum number of download attempts before giving up (default 10).

    Returns
    -------
    tuple
        ``(True, (article_text, headline, author, date_published))`` on success,
        ``(False, ())`` on any failure. ``author`` is whatever the parsed
        article exposes as ``authors`` (or None if unavailable) and
        ``date_published`` falls back to the current time when the article
        carries no publish date.
    """
    try:
        a = None
        for _ in range(max_attempts):
            # A fresh Article per attempt, as a failed download leaves html empty.
            a = Article(url)
            a.download()
            if a.html != '':
                break
        # Judge success by the outcome, not by the attempt counter, so a
        # download that succeeds on the final attempt is not discarded.
        if a is None or a.html == '' or not a.is_downloaded:
            print('Article would not download!')
            return False, ()
        a.parse()
    except Exception:
        # Always honour the (ok, payload) contract so callers can unpack safely.
        print('Article would not download!')
        return False, ()

    try:
        headline = a.title
    except Exception:
        return False, ()

    try:
        date_published = a.publish_date
        if date_published is None or date_published == '':
            date_published = datetime.datetime.now()
    except Exception:
        date_published = datetime.datetime.now()

    try:
        author = a.authors
    except Exception:
        author = None

    try:
        article_text = a.text
    except Exception:
        return False, ()

    return True, (article_text, headline, author, date_published)

In [42]:
def get_summary(article_text):
    '''Summarize an article with gensim's ``summarize``.

    Returns the summary text, or an empty string when summarization raises
    (best-effort: a failed summary must not abort the surrounding pipeline).
    '''
    try:
        return summarize(article_text)
    except Exception:
        # Deliberately not a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still propagate.
        return ''

In [44]:
# Manual check of get_article against a single live URL (requires network access).
result = get_article('http://www.msnbc.com/rachel-maddow-show/trump-sets-contest-credibility-he-simply-cannot-win')

  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
  attr = HTMLParser().unescape(attr)
 

In [51]:
import warnings
# Gensim gives annoying warnings.
# NOTE: with no category/module argument this suppresses every warning raised
# in the kernel from here on, not only gensim's.
warnings.filterwarnings('ignore')

In [88]:
url = 'http://www.breitbart.com/big-government/2017/05/26/climate-change-cohn-trump/'
result = get_article(url)

# Later cells need article_text, headline and summary. Stop here on anything
# other than a well-formed success tuple, rather than leaving those names
# undefined or holding values from an earlier run.
if not (isinstance(result, tuple) and len(result) == 2 and result[0]):
    raise RuntimeError('Could not download article: ' + url)

article_text, headline, author, date_published = result[1]
summary = get_summary(article_text)

In [89]:
# Show the summary text returned by get_summary for the downloaded article.
print(summary)

White House economic advisor Gary Cohn made the curious comment in Sicily on Friday that President Donald Trump’s views on climate change were “evolving.”
“Evolution” could mean moving towards the alarmist consensus of the left — or it could also mean toward a skeptical view, one more carefully informed by scientific and economic reality.
Again, that could mean Trump is growing skeptical of the alarmist, non-scientific view around which our public debate on climate change revolves.
It is also possible that Cohn was shaping his assessment of the president’s views to flatter his largely European audience, which believes in climate change the way people on other continents believe in religion.


In [68]:
def get_sentiment(word):
    """Average the SentiWordNet scores over every synset returned for ``word``.

    Parameters
    ----------
    word : str
        Word to look up via ``swn.senti_synsets``.

    Returns
    -------
    tuple
        ``(mean_pos, mean_neg, mean_obj)``: the mean positive, negative and
        objective scores over the word's synsets, or ``(0, 0, 0)`` when the
        lookup yields no synsets.
    """
    # Materialise the lookup once; it is needed both for the sums and the count.
    synsets = list(swn.senti_synsets(word))
    size = len(synsets)
    if size == 0:
        return 0, 0, 0

    mean_pos = sum(synset.pos_score() for synset in synsets) / size
    mean_neg = sum(synset.neg_score() for synset in synsets) / size
    mean_obj = sum(synset.obj_score() for synset in synsets) / size
    return mean_pos, mean_neg, mean_obj

In [76]:
import nltk
from nltk.corpus import sentiwordnet as swn
def get_article_sentiment(topic_texts, sentiment_texts):
    """Average word-level sentiment over the distinct words of the given texts.

    The distinct tokens across all entries of ``sentiment_texts`` are tagged
    one at a time with ``nltk.pos_tag``; only tokens tagged exactly ``JJ``,
    ``VB`` or ``RB`` are scored with ``get_sentiment``. Scored tokens whose
    positive and negative scores are both zero add nothing to the totals but
    still count in the divisor.

    ``topic_texts`` is accepted but not used by this function.

    Returns ``(pos, neg, obj)``; all zeros when no token has a relevant tag.
    """
    vocabulary = set()
    for doc_index in range(len(sentiment_texts)):
        vocabulary = vocabulary | set(sentiment_texts[doc_index])
    vocabulary = list(vocabulary)

    wanted_tags = ['JJ', 'VB', 'RB']

    total_pos = 0
    total_neg = 0
    total_obj = 0
    n_relevant = 0
    for token in vocabulary:
        for tagged_word, tag in nltk.pos_tag([token]):
            if tag not in wanted_tags:
                continue
            n_relevant += 1
            pos, neg, obj = get_sentiment(tagged_word)
            if pos == 0 and neg == 0:
                # No polarity information for this word; leave the totals alone.
                continue
            total_pos += pos
            total_neg += neg
            total_obj += obj

    if n_relevant != 0:
        total_pos = total_pos/n_relevant
        total_neg = total_neg/n_relevant
        total_obj = total_obj/n_relevant

    return total_pos, total_neg, total_obj

In [90]:
# Wrap the single downloaded article in a one-row DataFrame so it can be passed
# to process_articles, like the corpus frame earlier in the notebook.
# NOTE: this rebinds df, topic_texts and sentiment_texts, which earlier cells
# bound to the full corpus; re-run those cells before re-running the corpus steps.
data = {'article_text': article_text, 'headline': headline}
df = pd.DataFrame(data, index=[0])

topic_texts, sentiment_texts = process_articles(df)

In [91]:
# Aggregate positive / negative / objective scores for the single article.
pos, neg, obj = get_article_sentiment(topic_texts, sentiment_texts)

In [92]:
# Show the three aggregate scores computed in the previous cell.
print(pos, neg, obj)

0.1136050067216734 0.08394987049987049 0.5802229005562338


In [80]:
# Re-load the pickled LDA model from the same path used earlier in the notebook.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open('../working_with_data/lda_model.pkl', 'rb') as f:
    lda_model = pickle.load(f)

In [93]:
# Turn the article's tokens into a bag-of-words using the model's own
# dictionary, then index the model with it to get the article's topics.
article_bow = lda_model.id2word.doc2bow(topic_texts[0])
article_topics = lda_model[article_bow]

In [94]:
# Display the (topic id, probability) pairs inferred for the article.
article_topics

[(4, 0.14232095324779889),
 (15, 0.37401657863751547),
 (27, 0.026002139744159675),
 (32, 0.41432412790623541),
 (38, 0.040648909704769254)]