In [362]:
%load_ext autoreload
%autoreload 2
import sys
from os import listdir
import csv
import pprint
import pickle
import pandas as pd
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.io as pio
from plotly import tools
from helpers import functions
from sklearn.metrics import pairwise_distances

plotly.offline.init_notebook_mode(connected=True)
csv.field_size_limit(sys.maxsize)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


9223372036854775807

### Load the data:

In [2]:
# Paths to the processed article exports, one (NoDupesWithin, NoDupesAtAll)
# pair per news source. Despite the `_dir` suffix each name holds a CSV file path.
bbc_no_dupes_dir, bbc_no_dupesatall_dir = (
    'data/processed/2019_03_27_BBC_NoDupesWithin.csv',
    'data/processed/2019_03_27_BBC_NoDupesAtAll.csv',
)
fox_no_dupes_dir, fox_no_dupesatall_dir = (
    'data/processed/2019_03_27_FOX_NoDupesWithin.csv',
    'data/processed/2019_03_27_FOX_NoDupesAtAll.csv',
)
cnn_no_dupes_dir, cnn_no_dupesatall_dir = (
    'data/processed/2019_03_28_CNN_NoDupesWithin.csv',
    'data/processed/2019_03_28_CNN_NoDupesAtAll.csv',
)
rt_no_dupes_dir, rt_no_dupesatall_dir = (
    'data/processed/2019_03_26_RT_NoDupesWithin.csv',
    'data/processed/2019_03_26_RT_NoDupesAtAll.csv',
)

In [3]:
# Column names of interest in the article exports.
# 'paper_section_name' used to be listed twice; duplicate names are rejected by
# pandas when a list like this is passed as `names=` to read_csv, so each
# column now appears exactly once (first occurrence kept).
cols_to_keep = ['source',
                'paper_section_name',
                'source_url',
                'url',
                'canonical_link',
                'rss_link',
                'title',
                'rss_title',
                'text',
                'pull_date']

BBC:

In [4]:
%%time
# Load the BBC "NoDupesWithin" export into a DataFrame.
# NOTE(review): this frame is not referenced again in the visible part of the
# notebook (only the "NoDupesAtAll" frames are concatenated) - confirm it is needed.
df_bbc_clean_nodupes_csv = pd.read_csv(bbc_no_dupes_dir)

CPU times: user 9.16 s, sys: 1.48 s, total: 10.6 s
Wall time: 10.7 s


In [5]:
%%time
# Load the BBC "NoDupesAtAll" export; this frame is later concatenated into df_master.
df_bbc_clean_nodupes_atall_csv = pd.read_csv(bbc_no_dupesatall_dir)

CPU times: user 7.28 s, sys: 1.3 s, total: 8.58 s
Wall time: 8.68 s


Fox News:

In [6]:
%%time
# Load the Fox News "NoDupesWithin" export into a DataFrame.
# NOTE(review): this frame is not referenced again in the visible part of the
# notebook (only the "NoDupesAtAll" frames are concatenated) - confirm it is needed.
df_fox_clean_nodupes_csv = pd.read_csv(fox_no_dupes_dir)

CPU times: user 3.31 s, sys: 514 ms, total: 3.83 s
Wall time: 3.84 s


In [7]:
%%time
# Load the Fox News "NoDupesAtAll" export; this frame is later concatenated into df_master.
df_fox_clean_nodupes_atall_csv = pd.read_csv(fox_no_dupesatall_dir)

CPU times: user 3.36 s, sys: 860 ms, total: 4.22 s
Wall time: 4.28 s


RT:

In [8]:
%%time
# Load the RT "NoDupesWithin" export into a DataFrame.
# NOTE(review): this frame is not referenced again in the visible part of the
# notebook (only the "NoDupesAtAll" frames are concatenated) - confirm it is needed.
df_rt_clean_nodupes_csv = pd.read_csv(rt_no_dupes_dir)

CPU times: user 4.72 s, sys: 1.51 s, total: 6.23 s
Wall time: 6.46 s


In [9]:
%%time
# Load the RT "NoDupesAtAll" export; this frame is later concatenated into df_master.
df_rt_clean_nodupes_atall_csv = pd.read_csv(rt_no_dupesatall_dir)

CPU times: user 3.21 s, sys: 654 ms, total: 3.87 s
Wall time: 3.9 s


Loading the CNN CSV files with pandas takes too long because they are so bulky. Let's use the `csv` module instead:

In [10]:
# %%time
# df_cnn_clean_nodupes_csv = pd.read_csv(cnn_no_dupes_dir, names=['text'])

In [11]:
# %%time
# df_cnn_clean_nodupes_atall_csv = pd.read_csv(cnn_no_dupesatall_dir, names=cols_to_keep)

In [12]:
%%time
# Read only one column (position 4) of the CNN "NoDupesWithin" export.
# NOTE(review): position 4 is presumably the article 'text' column - confirm
# against the file header.
# newline='' is what the csv module documentation requires for files handed to
# csv.reader, so that newlines inside quoted fields are handled by the parser.
with open(cnn_no_dupes_dir, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # Skip header (no-op if the file is empty)
    cnn_clean_nodupes_texts = [row[4] for row in csv_reader]

CPU times: user 3min 1s, sys: 16.1 s, total: 3min 17s
Wall time: 3min 20s


In [13]:
len(cnn_clean_nodupes_texts)

6050

In [14]:
%%time
# Read only one column (position 4) of the CNN "NoDupesAtAll" export.
# NOTE(review): position 4 is presumably the article 'text' column - confirm
# against the file header.
# newline='' is what the csv module documentation requires for files handed to
# csv.reader, so that newlines inside quoted fields are handled by the parser.
with open(cnn_no_dupesatall_dir, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # Skip header (no-op if the file is empty)
    cnn_clean_nodupes_atall_texts = [row[4] for row in csv_reader]

CPU times: user 2min 3s, sys: 12 s, total: 2min 15s
Wall time: 2min 17s


In [15]:
len(cnn_clean_nodupes_atall_texts)

3942

In [16]:
%%time
# Combine the BBC, Fox News and RT "NoDupesAtAll" frames into one DataFrame
# with a fresh 0..n-1 index (ignore_index=True).
# CNN is not part of this concat: its texts were read separately with the csv
# module into cnn_clean_nodupes_atall_texts.
frames = [df_bbc_clean_nodupes_atall_csv, df_fox_clean_nodupes_atall_csv, df_rt_clean_nodupes_atall_csv]
df_master = pd.concat(frames, ignore_index=True)

CPU times: user 86.1 ms, sys: 270 ms, total: 356 ms
Wall time: 416 ms


In [17]:
df_master

Unnamed: 0,source_url,url,title,movies,text,keywords,meta_keywords,tags,authors,publish_date,...,rss_link,rss_id,rss_published,rss_published_parsed,rss_feedburner_origlink,paper_section_name,source,source_detail,pull_type,pull_date
0,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47063405,How has business been affected by Brexit so far?,[],With two months to go until the UK is due to l...,[],[''],set([]),"[u'Economics Correspondent', u'Dharshini David']",,...,https://www.bbc.co.uk/news/business-47063405,https://www.bbc.co.uk/news/business-47063405,"Thu, 31 Jan 2019 00:03:21 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
1,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47066873,China's factory activity shrinks as slowdown w...,[],Chinese factory activity contracted for a seco...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47066873,https://www.bbc.co.uk/news/business-47066873,"Thu, 31 Jan 2019 02:53:39 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
2,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47057870,MPs say fast fashion brands inaction on ethics...,[],"Fashion retailers JD Sports, Sports Direct and...",[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47057870,https://www.bbc.co.uk/news/business-47057870,"Thu, 31 Jan 2019 00:12:15 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
3,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47055188,Brexit: Car investment halves as industry hits...,[],Investment in the UK car sector almost halved ...,[],[''],set([]),"[u'Business Reporter', u'Russell Hotten', u'Bb...",,...,https://www.bbc.co.uk/news/business-47055188,https://www.bbc.co.uk/news/business-47055188,"Thu, 31 Jan 2019 00:04:41 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
4,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47065972,Facebook users continue to grow despite privac...,[],Facebook users have continued to rise despite ...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47065972,https://www.bbc.co.uk/news/business-47065972,"Wed, 30 Jan 2019 22:25:12 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
5,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47062145,Fed puts future rate rises on hold as pledges ...,[],The Federal Reserve has indicated it won't rai...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47062145,https://www.bbc.co.uk/news/business-47062145,"Wed, 30 Jan 2019 21:17:01 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
6,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47063403,Bosses' group head in abrupt departure,[],The boss of one of the UK's largest business l...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47063403,https://www.bbc.co.uk/news/business-47063403,"Wed, 30 Jan 2019 18:19:35 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
7,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47062142,Foxconn reconsiders Wisconsin factory plans,[],"Foxconn, which raised hopes of a US manufactur...",[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47062142,https://www.bbc.co.uk/news/business-47062142,"Wed, 30 Jan 2019 19:41:24 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
8,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47062146,Tesla reports profit as issues stabilise,[],Tesla made a profit of $139.5m (£106.4m) in th...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47062146,https://www.bbc.co.uk/news/business-47062146,"Wed, 30 Jan 2019 23:51:34 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
9,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47060676,Barclays shifts billions of pounds to Dublin b...,[],Barclays is moving €190bn (£166bn) of assets t...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47060676,https://www.bbc.co.uk/news/business-47060676,"Wed, 30 Jan 2019 16:37:31 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31


### Topic Modelling
(based on https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/):

In [18]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [19]:
%%time
# Basic text cleaning: run the project helper `functions.clean_text_string`
# (with keep_dbl_newline=False) over the DataFrame's 'text' column and over the
# separately loaded CNN texts.
clean_kwargs = dict(keep_dbl_newline=False)
df_master['clean_text'] = df_master['text'].apply(functions.clean_text_string, **clean_kwargs)
cnn_nodupes_atall_texts_cleaned = [
    functions.clean_text_string(raw_text, **clean_kwargs)
    for raw_text in cnn_clean_nodupes_atall_texts
]

CPU times: user 11.6 s, sys: 189 ms, total: 11.8 s
Wall time: 11.8 s


In [20]:
# Stop words to strip later: NLTK's English list plus corpus-specific noise.
# "bbc radio live" appears often in the texts, and 'may' turns up a lot
# without carrying much meaning, so those are added as well.
extra_stop_words = ['also', 'bbc', 'radio', 'live', 'may']
stop_words = stopwords.words('english') + extra_stop_words

In [21]:
# Gather every cleaned text into one flat list: the DataFrame rows first,
# followed by the separately cleaned CNN texts.
data = [*df_master.clean_text.values.tolist(), *cnn_nodupes_atall_texts_cleaned]

In [22]:
# Extra text cleaning, just in case.
# Patterns are raw strings: '\S' and '\s' inside normal string literals are
# invalid escape sequences, which Python warns about.
email_re = re.compile(r'\S*@\S*\s?')   # e-mail-like tokens (anything around an '@')
whitespace_re = re.compile(r'\s+')     # runs of whitespace, including new lines

# Same three steps as before, applied per text in the same order:
# remove e-mails, collapse whitespace to a single space, drop single quotes.
data = [
    whitespace_re.sub(' ', email_re.sub('', sent)).replace("'", "")
    for sent in data
]

In [23]:
data_words = list(functions.sent_to_words(data))

#### Remove Stopwords, Make Bigrams/Trigrams and Lemmatize:

In [24]:
# Define functions for stopwords, bigrams, trigrams
def remove_stopwords(texts):
    """Tokenize each document and drop the tokens listed in ``stop_words``.

    Each ``doc`` is converted with ``str()`` and re-tokenized with gensim's
    ``simple_preprocess`` before filtering, exactly as before.

    Parameters
    ----------
    texts : iterable of documents (anything ``str()`` can render)

    Returns
    -------
    list of list of str : one filtered token list per input document
    """
    # Build a set once per call: membership tests on the stop-word list were
    # a linear scan for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Run every document in ``texts`` through the module-level ``bigram_mod``.

    Returns a list with one ``bigram_mod[doc]`` result per input document.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``.

    NOTE(review): in the visible notebook the code that builds ``trigram_mod``
    is commented out, so calling this raises NameError unless it is defined
    elsewhere.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(trigram_mod[bigram_mod[tokens]])
    return phrased_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document is joined with spaces, passed through the module-level
    spaCy pipeline ``nlp``, and reduced to the lemmas of the tokens whose
    ``pos_`` is in ``allowed_postags``.
    Tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
    allowed_postags : iterable of str, POS tags to keep. The default is now an
        immutable tuple instead of a shared mutable list; lists are still accepted.

    Returns
    -------
    list of list of str : lemmas per document, in input order
    """
    keep_pos = frozenset(allowed_postags)  # constant-time membership per token
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in keep_pos])
    return texts_out

In [25]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [26]:
%%time
# Build the bigram model (the trigram lines below are commented out, so no
# trigram model is created in this cell).
# NOTE(review): Phrases is trained on `data_words` (stop words still present),
# while the model is later applied to `data_words_nostops` - confirm this is intended.
bigram = gensim.models.Phrases(data_words, min_count=3, threshold=70) # a higher threshold yields fewer phrases.
#trigram = gensim.models.Phrases(bigram[data_words], threshold=70)  

# Wrap the trained Phrases model in a Phraser, which is the faster way to
# apply it to sentences.
bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)

CPU times: user 45.8 s, sys: 151 ms, total: 46 s
Wall time: 46 s


In [27]:
%%time
# Form Bigrams:
data_words_bigrams = make_bigrams(data_words_nostops)

CPU times: user 14.3 s, sys: 146 ms, total: 14.5 s
Wall time: 14.5 s


In [28]:
%%time
# Initialize the spaCy 'en' model with the parser and NER components disabled
# (for efficiency); the remaining pipeline supplies the POS tags used below.
# Install the model with: python3 -m spacy download en
# NOTE(review): `nlp` is rebound to a blank pipeline in the Vectorization
# section further down; re-running lemmatization after that cell would use
# the blank pipeline instead of this one.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Lemmatize the bigram-merged documents, keeping only noun, adj, verb, adv tokens
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

CPU times: user 25min 21s, sys: 1min 1s, total: 26min 22s
Wall time: 6min 40s


#### Create the Dictionary and Corpus needed for Topic Modeling:

In [29]:
%%time
# Build the gensim Dictionary from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# `texts` is kept as an alias of the lemmatized documents.
texts = data_lemmatized

# Bag-of-words corpus: one doc2bow() result per document.
corpus = list(map(id2word.doc2bow, texts))

CPU times: user 7.2 s, sys: 126 ms, total: 7.32 s
Wall time: 7.32 s


In [30]:
import time

#### Building LDA Model:

In [31]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one LDA model per topic count and compute its c_v coherence.

    Topic counts are taken from ``range(start, limit, step)``. Progress and
    per-model wall-clock time are printed.

    Parameters:
    ----------
    dictionary : Gensim dictionary; used as the LDA ``id2word`` mapping and by
                 the coherence model (previously the global ``id2word`` was
                 passed to LdaModel regardless of this argument)
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between topic counts

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Separate timer name: the old code reused `start`, shadowing the parameter.
        started_at = time.time()
        print('Topic modelling for num_topics=', num_topics)
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,  # fixed seed for repeatable runs
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        finished_at = time.time()
        print('Finished num_topics', num_topics, 'with time', finished_at - started_at)

    return model_list, coherence_values

In [32]:
%%time

# Can take a long time to run.
model_list2, coherence_values2 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=10, step=1)

Topic modelling for num_topics= 2


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 2 with time 306.1885087490082
Topic modelling for num_topics= 3
Finished num_topics 3 with time 310.4450078010559
Topic modelling for num_topics= 4
Finished num_topics 4 with time 334.1452827453613
Topic modelling for num_topics= 5
Finished num_topics 5 with time 369.00795006752014
Topic modelling for num_topics= 6
Finished num_topics 6 with time 367.96039485931396
Topic modelling for num_topics= 7
Finished num_topics 7 with time 403.13715291023254
Topic modelling for num_topics= 8
Finished num_topics 8 with time 440.6737549304962
Topic modelling for num_topics= 9
Finished num_topics 9 with time 476.2478747367859
CPU times: user 2h 43min 57s, sys: 7min 54s, total: 2h 51min 51s
Wall time: 50min 7s


In [33]:
%%time
model_list3, coherence_values3 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=10, limit=15, step=1)

Topic modelling for num_topics= 10


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 10 with time 513.1146211624146
Topic modelling for num_topics= 11
Finished num_topics 11 with time 587.534184217453
Topic modelling for num_topics= 12
Finished num_topics 12 with time 571.5289018154144
Topic modelling for num_topics= 13
Finished num_topics 13 with time 599.4308750629425
Topic modelling for num_topics= 14
Finished num_topics 14 with time 628.6747941970825
CPU times: user 2h 35min 3s, sys: 8min 45s, total: 2h 43min 49s
Wall time: 48min 20s


In [34]:
%%time
model_list4, coherence_values4 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=15, limit=21, step=1)

Topic modelling for num_topics= 15


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 15 with time 691.0951118469238
Topic modelling for num_topics= 16
Finished num_topics 16 with time 705.7277450561523
Topic modelling for num_topics= 17
Finished num_topics 17 with time 764.0610220432281
Topic modelling for num_topics= 18
Finished num_topics 18 with time 802.133266210556
Topic modelling for num_topics= 19
Finished num_topics 19 with time 842.1322269439697
Topic modelling for num_topics= 20
Finished num_topics 20 with time 836.864264011383
CPU times: user 4h 1min 16s, sys: 13min 38s, total: 4h 14min 54s
Wall time: 1h 17min 22s


In [57]:
%%time
model_list5, coherence_values5 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=21, limit=25, step=1)

Topic modelling for num_topics= 21


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 21 with time 836.3872618675232
Topic modelling for num_topics= 22
Finished num_topics 22 with time 882.4118032455444
Topic modelling for num_topics= 23
Finished num_topics 23 with time 941.8384079933167
Topic modelling for num_topics= 24
Finished num_topics 24 with time 1067.5023818016052
CPU times: user 2h 51min 41s, sys: 10min 31s, total: 3h 2min 12s
Wall time: 1h 2min 8s


In [58]:
%%time
model_list6, coherence_values6 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=25, limit=30, step=1)

Topic modelling for num_topics= 25


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 25 with time 1114.7858200073242
Topic modelling for num_topics= 26
Finished num_topics 26 with time 982.3471877574921
Topic modelling for num_topics= 27
Finished num_topics 27 with time 983.6640148162842
Topic modelling for num_topics= 28
Finished num_topics 28 with time 1026.3176009654999
Topic modelling for num_topics= 29
Finished num_topics 29 with time 1049.0155730247498
CPU times: user 3h 42min 50s, sys: 12min 13s, total: 3h 55min 4s
Wall time: 1h 25min 56s


In [67]:
%%time
model_list7, coherence_values7 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=30, limit=46, step=1)

Topic modelling for num_topics= 30


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 30 with time 1263.7574751377106
Topic modelling for num_topics= 31
Finished num_topics 31 with time 1279.618304014206
Topic modelling for num_topics= 32
Finished num_topics 32 with time 1066.8624303340912
Topic modelling for num_topics= 33
Finished num_topics 33 with time 1150.6826491355896
Topic modelling for num_topics= 34
Finished num_topics 34 with time 1294.0996840000153
Topic modelling for num_topics= 35
Finished num_topics 35 with time 1459.2808079719543
Topic modelling for num_topics= 36
Finished num_topics 36 with time 1467.4793231487274
Topic modelling for num_topics= 37
Finished num_topics 37 with time 1598.359689950943
Topic modelling for num_topics= 38
Finished num_topics 38 with time 1523.7713379859924
Topic modelling for num_topics= 39
Finished num_topics 39 with time 1295.1958329677582
Topic modelling for num_topics= 40
Finished num_topics 40 with time 1332.480339050293
Topic modelling for num_topics= 41
Finished num_topics 41 with time 1398.60217595

In [68]:
models = model_list2 + model_list3 + model_list4 + model_list5 + model_list6 + model_list7
coherence_values = coherence_values2 + coherence_values3 + coherence_values4 + coherence_values5 + coherence_values6 + coherence_values7

In [69]:
%%time
# Save every trained model under models/, with its topic count in the file name.
# NOTE(review): the topic count is not zero-padded, so these file names do not
# sort numerically (e.g. '...topics10.model' sorts before '...topics2.model').
# Any code that reloads them via a sorted directory listing must re-order by
# model.num_topics.
for model in models:
    name = 'models/2019_04_16_optimal_model_topics' + str(model.num_topics) + '.model'
    model.save(name)

CPU times: user 3.65 s, sys: 3.9 s, total: 7.55 s
Wall time: 10.1 s


In [33]:
cohvals_path = 'models/2019_04_16_coherences_values.txt'

In [31]:
%%time
# Load models:
model_dir = 'models/'
model_prefix = '2019_04_16_optimal_model_topics'
models = [
    gensim.models.LdaModel.load(model_dir + file)
    for file in sorted(listdir(model_dir))
    if file.startswith(model_prefix) and file.endswith('.model')
]
# The saved file names carry an unpadded topic count, so the sorted directory
# listing is lexicographic ('...topics10' before '...topics2'). Order the
# models by their actual topic count so that list position i corresponds to
# the i-th smallest number of topics.
models.sort(key=lambda lda: lda.num_topics)

CPU times: user 4.28 s, sys: 2.04 s, total: 6.31 s
Wall time: 7.66 s


In [32]:
%%time
# Recalculate c_v coherence for every loaded model, in the order of `models`.
coherence_values = []
for model_num, model in enumerate(models):
    print('Model Num:', model_num)  # progress indicator: position in `models`
    coherencemodel = CoherenceModel(model=model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

Model Num: 0
Model Num: 1
Model Num: 2
Model Num: 3
Model Num: 4
Model Num: 5
Model Num: 6
Model Num: 7
Model Num: 8
Model Num: 9
Model Num: 10
Model Num: 11
Model Num: 12
Model Num: 13
Model Num: 14
Model Num: 15
Model Num: 16
Model Num: 17
Model Num: 18
Model Num: 19
Model Num: 20
Model Num: 21
Model Num: 22
Model Num: 23
Model Num: 24
Model Num: 25
Model Num: 26
Model Num: 27
Model Num: 28
Model Num: 29
Model Num: 30
Model Num: 31
Model Num: 32
Model Num: 33
Model Num: 34
Model Num: 35
Model Num: 36
Model Num: 37
Model Num: 38
Model Num: 39
Model Num: 40
Model Num: 41
Model Num: 42
Model Num: 43
CPU times: user 5min 59s, sys: 53.9 s, total: 6min 53s
Wall time: 1h 17min 51s


In [34]:
coherence_values

[0.30473636911922386,
 0.3249027080728624,
 0.36982381703272493,
 0.37701630145289894,
 0.3854396728864671,
 0.4329426205990265,
 0.4216982813809473,
 0.4555881561662689,
 0.4648887682306257,
 0.48768835708871144,
 0.46718835265743025,
 0.4684259779219651,
 0.4914104782543593,
 0.5022809284997043,
 0.5222553455726212,
 0.48111508767007305,
 0.4986223190092909,
 0.510688908508308,
 0.5174032245139688,
 0.531568668993731,
 0.5038390875972081,
 0.5378038250959394,
 0.48129993778376196,
 0.5140223690687207,
 0.5028178343016804,
 0.4574762825080305,
 0.4771254938124307,
 0.49710498803257114,
 0.4910268470164026,
 0.4620912283112936,
 0.48919822358988385,
 0.4959023379074466,
 0.5056126999180991,
 0.49691247796758997,
 0.5051982593916104,
 0.49407509555352913,
 0.4917995221636159,
 0.4768102656090244,
 0.4990408325751924,
 0.5088295606208432,
 0.48737351666448814,
 0.501472764690317,
 0.47604787833479184,
 0.4870424415545807]

In [38]:
%%time
with open(cohvals_path, 'wb') as fp:   #Pickling
    pickle.dump(coherence_values, fp)

CPU times: user 382 µs, sys: 810 µs, total: 1.19 ms
Wall time: 964 µs


In [39]:
## Load Coherence values:
with open(cohvals_path, 'rb') as fp:
    coherence_values = pickle.load(fp)

In [40]:
len(coherence_values)

44

In [41]:
# Topic counts for the x axis: one per coherence value, starting at 2 topics
# (the smallest model trained). Derived from the data instead of hard-coding
# the upper bound 46.
x = np.arange(2, 2 + len(coherence_values))

In [113]:
# Line plot of coherence score against number of topics.
trace = go.Scatter(
    line=dict(width=1),
    x=x,
    y=coherence_values
)

# Annotate the highest coherence value; computed from the data rather than
# hard-coded so the label cannot go stale.
best_idx = int(np.argmax(coherence_values))
best_x = int(x[best_idx])
best_y = float(coherence_values[best_idx])

layout = go.Layout(
    title=go.layout.Title(
        text='Topic Coherence',
        xref='paper',
        y=.88,
        font=dict(size=20)
    ),
    xaxis=dict(
        dtick=1,
        tick0=0,
        tickfont=dict(size=8),
        title='Number of Topics',
        titlefont=dict(size=14),
        tickangle=-45,
    ),
    yaxis=dict(
        dtick=.02,
        tick0=0,
        title='CV Coherence',
        tickfont=dict(size=10)
    ),
    annotations=[
        dict(
            x=best_x,
            y=best_y,
            xref='x',
            yref='y',
            text='({}, {:.7f})'.format(best_x, best_y),
            showarrow=True,
            arrowhead=3,
            ax=0,
            ay=-40
        )]
)

# The trace list is passed inline: assigning it to `data` would overwrite the
# list of cleaned texts that other cells pass as `texts=data`.
fig1 = go.Figure(data=[trace], layout=layout)
# Was `iplot(fig)`; no `fig` is defined - the figure built here is `fig1`.
plotly.offline.iplot(fig1)

pio.write_image(fig1, 'papers/presentation/topic_coherence.pdf')

In [131]:
df_cnn = pd.DataFrame(columns=df_master.columns)

In [132]:
df_cnn['clean_text'] = cnn_nodupes_atall_texts_cleaned

In [134]:
df_cnn['source'] = 'cnn'

In [166]:
cnn_nodupes_atall_texts_cleaned[-1]

''

In [140]:
print(len(cnn_nodupes_atall_texts_cleaned))
print(len(df_master))

3942
11603


In [143]:
df_master_wcnn = pd.concat([df_master, df_cnn], ignore_index=True)

In [145]:
df_master_wcnn.to_csv('data/processed/2019_04_17_AllWCNN.csv', index=False)

### Adding dominant topics to data:

In [149]:
optimal_model = models[21] ## Model with 23 topics

In [150]:
%%time
def format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data):
    """Build a table with the dominant topic of every document in ``corpus``.

    For each document the topic with the highest proportion is selected (the
    first one wins on ties) and stored together with its proportion, rounded
    to 4 decimals, and the words returned by ``ldamodel.show_topic``.

    The default arguments are bound when this ``def`` is executed, so they
    refer to whatever ``optimal_model``, ``corpus`` and ``data`` were at that
    moment; pass the arguments explicitly to avoid stale values.

    Parameters
    ----------
    ldamodel : LDA model. ``ldamodel[corpus]`` must yield, per document, a
        sequence whose first element is the list of (topic_id, proportion)
        pairs (presumably the result shape for models built with
        ``per_word_topics=True`` - TODO confirm).
    corpus : bag-of-words corpus the model is applied to
    texts : sequence appended as the last column, aligned by position

    Returns
    -------
    pandas.DataFrame with columns 'Dominant_Topic', 'Perc_Contribution',
    'Topic_Keywords' followed by one unnamed column holding ``texts``.
    'Dominant_Topic' holds integer ids; a document without any topic gets a
    row of missing values so rows stay aligned with ``texts``.
    """
    records = []
    keywords_by_topic = {}  # show_topic() result is per topic, so compute it once

    for doc_result in ldamodel[corpus]:
        doc_topics = doc_result[0]
        if not doc_topics:
            # Keep a placeholder row; skipping it would shift every later
            # document against `texts`.
            records.append((float('nan'), float('nan'), None))
            continue
        # max() returns the first maximal pair, matching a stable descending sort.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once: appending row by row copies the whole frame every
    # time, and DataFrame.append no longer exists in pandas 2.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

CPU times: user 10 µs, sys: 1e+03 ns, total: 11 µs
Wall time: 17.9 µs


In [151]:
%%time
# Compute the dominant topic of every document with the selected model.
# NOTE(review): `data` is also assigned lists of plotly traces in the plotting
# cells of this notebook; make sure it still holds the cleaned texts when this
# cell runs, otherwise the 'Text' column will not contain the documents.
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format: move the index into a 'Document_No' column and give all five
# columns their final names.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

CPU times: user 37min 43s, sys: 3min 12s, total: 40min 55s
Wall time: 10min 45s


In [156]:
topic_modeling_df = pd.concat([df_master_wcnn, df_dominant_topic[['Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords']]], axis=1)

In [175]:
topic_modeling_df_dropped = topic_modeling_df.drop(df_master_wcnn[df_master_wcnn.clean_text == ''].index)

In [178]:
topic_modeling_df_dropped.to_csv('data/processed/2019_04_17_AllWCNN_WTopics_noemptystr.csv', index=False)

#### Visualize Topics:

In [196]:
# Boolean row masks, one per news source, based on the 'source' column.
source_col = topic_modeling_df_dropped.source
cnn_filter = source_col.eq('cnn')
fox_filter = source_col.eq('fox')
rt_filter = source_col.eq('rt')
bbc_filter = source_col.eq('bbc')

In [557]:
# One histogram of dominant-topic counts per news source on a 2x2 grid.
# Removed from the previous version: an unused `x = np.random.randn(500)` and
# unused `data` / `layout` objects, which silently overwrote the `x` and
# `data` variables used by other cells.
trace1, trace2, trace3, trace4 = [
    go.Histogram(x=topic_modeling_df_dropped[source_filter]['Dominant_Topic'],
                 opacity=0.75, name=source_label,
                 marker=dict(line=dict(width=.8)))
    for source_label, source_filter in [('CNN', cnn_filter),
                                        ('Fox News', fox_filter),
                                        ('RT', rt_filter),
                                        ('BBC', bbc_filter)]
]

fig2 = tools.make_subplots(rows=2, cols=2)
# Fill the grid row by row: (1,1), (1,2), (2,1), (2,2).
for position, source_trace in enumerate([trace1, trace2, trace3, trace4]):
    fig2.append_trace(source_trace, position // 2 + 1, position % 2 + 1)

# All four subplots share the same axis settings; only the titles differ.
axis_layout = {}
for axis_num in range(1, 5):
    x_axis = dict(tickmode='linear', ticks='outside', tick0=0, dtick=1,
                  tickfont=dict(size=7))
    y_axis = dict(range=[0, 250], dtick=25, tickfont=dict(size=7))
    if axis_num in (3, 4):   # bottom row carries the x-axis title
        x_axis['title'] = 'Topic'
    if axis_num in (1, 3):   # left column carries the y-axis title
        y_axis['title'] = 'Count'
    axis_layout['xaxis{}'.format(axis_num)] = x_axis
    axis_layout['yaxis{}'.format(axis_num)] = y_axis

fig2['layout'].update(title='Topic Distribution by Source', **axis_layout)

plotly.offline.iplot(fig2, filename='basic histogram')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



In [558]:
pio.write_image(fig2, 'papers/presentation/topic_distribution.svg')

ValueError: 
For some reason plotly.py was unable to communicate with the
local orca server process, even though the server process seems to be running.

Please review the process and connection information below:

orca status
-----------
    state: running
    executable: /usr/local/bin/orca
    version: 1.2.1
    port: 51908
    pid: 18277
    command: ['/usr/local/bin/orca', 'serve', '-p', '51908', '--plotly', '/Users/alexander/anaconda3/lib/python3.6/site-packages/plotly/package_data/plotly.min.js', '--graph-only', '--mathjax', 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js']
    



### Interpret Topics:

In [231]:
#%%time
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word)
vis

In [555]:
# Inspect the documents whose dominant topic is 17, strongest contribution first.
topic_filter = topic_modeling_df_dropped['Dominant_Topic'].eq(17)
inspect_cols = ['title', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'clean_text']
lookattopic = (
    topic_modeling_df_dropped
    .loc[topic_filter, inspect_cols]
    .sort_values(by='Topic_Perc_Contrib', ascending=False)
)
lookattopic.head(100)

Unnamed: 0,title,Dominant_Topic,Topic_Perc_Contrib,Keywords,clean_text
7854,Trump backs big ‘protests against ex-dictator ...,17.0,0.6455,"trump, president, congress, white_house, admin...",Venezuela’s opposition leader and self-declare...
8162,Cuba slams US for ‘meddling’ accusation on Ven...,17.0,0.6416,"trump, president, congress, white_house, admin...",Cuban Foreign Minister Bruno Rodriguez has rej...
7932,US threatens ‘serious consequences’ if Venezue...,17.0,0.6329,"trump, president, congress, white_house, admin...",National Security Advisor John Bolton has thre...
12990,,17.0,0.6078,"trump, president, congress, white_house, admin...",Washington (CNN) President Donald Trump on Mon...
8126,Maduro slams Trump in English (VIDEO) — RT Wor...,17.0,0.6041,"trump, president, congress, white_house, admin...",Venezuelan leader Nicolas Maduro addressed Don...
7851,US sanctions Nicaragua oil company over Venezu...,17.0,0.6018,"trump, president, congress, white_house, admin...","Washington has imposed sanctions on Albanisa, ..."
3269,Venezuela crisis: Juan Guaidó to return after ...,17.0,0.6001,"trump, president, congress, white_house, admin...",Venezuela's self-proclaimed interim president ...
398,Venezuela crisis: US sending aid 'at Guaidó's ...,17.0,0.5915,"trump, president, congress, white_house, admin...",The US says it is sending aid to crisis-hit Ve...
8951,US is openly pushing Venezuela’s army into a coup,17.0,0.5733,"trump, president, congress, white_house, admin...",Forced regime change has apparently become Was...
106,"Venezuela opposition 'has met military', says ...",17.0,0.5715,"trump, president, congress, white_house, admin...",Venezuela's opposition leader Juan Guaidó has ...


In [552]:
peak_idx = 7854
print(lookattopic.Keywords[peak_idx])
print(lookattopic.clean_text[peak_idx])

trump, president, congress, white_house, administration, mueller, country, special_counsel, government, russia
Venezuela’s opposition leader and self-declared ‘interim president’ Juan Guaido chatted by phone with US President Donald Trump, with Trump giving the thumbs up to “large protests” against “former dictator Maduro.” The phone call occurred Wednesday as anti-government demonstrators in Venezuela were about to take to the streets for the second time in a week. Trump “noted the importance” of the protests against “former dictator Maduro,” and vowed “strong support for Venezuela’s fight to regain its democracy,” according to White House Press Secretary Sarah Sanders. Nicolas Maduro remains Venezuela’s elected leader, despite a number of countries declaring their support for Guaidó’s claim as the acting head. Trump, meanwhile, congratulated Guaidó on his “historic assumption of the presidency.” Neither President Maduro or predecessor Hugo Chavez ever received a phone call from a US 

In [413]:
# Hand-assigned labels for the topics of the selected model; the position in
# this list is the topic id (0..22).
topic_labels = [
    'Unknown',
    'Sports',
    'Political Investigation (Cohen/Virginia)',
    'Military Nuclear Technology',
    'Film Industry',
    'Cybersecurity',
    'Entertainment Industry',
    'Health',
    'Brexit',
    'Space Industry',
    'US Campaign Politics',
    'Medical Research',
    'Economy',
    'Emergencies',
    'Religious Intolerance',
    'Family Affairs',
    'International Affairs (Middle East and South Asia)',
    'International Affairs (Latin America)',
    'Corporate News',
    'Fashion/Issues of Race',
    'Crime',
    'Social Media',
    'Nature and Environment',
]
# Mapping topic id -> label.
topic_translation = dict(enumerate(topic_labels))

In [414]:
%%time

# Save Topic Dictionary:
topic_dict_path = 'models/2019_04_16_Topic_Dictionary(23_topics)'
with open(topic_dict_path, 'wb') as fp:   #Pickling
    pickle.dump(topic_translation, fp)

CPU times: user 502 µs, sys: 1.37 ms, total: 1.87 ms
Wall time: 2.81 ms


In [350]:
%%time
topic_modeling_df_dropped['Topic'] = topic_modeling_df_dropped['Dominant_Topic'].map(topic_translation)

CPU times: user 4.25 ms, sys: 770 µs, total: 5.02 ms
Wall time: 4.49 ms


In [354]:
# Save results:
topic_modeling_df_dropped.to_csv('data/processed/2019_04_17_AllWCNN_WTopics_noemptystr_inttopics.csv', index=False)

In [550]:
# Output the topics-probability table: one CSV row per (topic, token, weight).
# show_topics() returns only 10 topics unless told otherwise, so the number of
# topics is passed explicitly to export every topic of the model.
model_topics = sorted(
    optimal_model.show_topics(num_topics=optimal_model.num_topics,
                              formatted=False, num_words=20),
    key=lambda topic_entry: topic_entry[0],  # order rows by topic id
)
# newline='' is what the csv module documentation requires for files handed to
# csv writers (avoids extra blank lines on some platforms).
with open('papers/presentation/topics_probs.csv', mode='w', newline='') as csv_file:
    fieldnames = ['Topic', 'Token', 'Weight']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for topic_num, words in model_topics:
        topic = topic_translation[topic_num]  # human-readable topic label
        for word, prop in words:
            writer.writerow({'Topic': topic, 'Token': word, 'Weight': prop})

In [356]:
# TODO: Histogram of topic counts by source

### Vectorization:

In [159]:
%%time
# https://stackoverflow.com/questions/42094180/spacy-how-to-load-google-news-word2vec-vectors
import gensim
import spacy

# Path to google news vectors
google_news_path = "data/word_embeddings/GoogleNews-vectors-negative300.bin.gz"

# Load google news vecs in gensim
model = gensim.models.KeyedVectors.load_word2vec_format(google_news_path, binary=True)

# Init blank english spacy nlp object
nlp = spacy.blank('en')

# Loop through range of all indexes, get words associated with each index.
# The words in the keys list will correspond to the order of the google embed matrix
keys = []
for idx in range(3000000):
    keys.append(model.index2word[idx])

# Set the vectors for our nlp object to the google news vectors
nlp.vocab.vectors = spacy.vocab.Vectors(data=model.syn0, keys=keys)

CPU times: user 2min 57s, sys: 17.9 s, total: 3min 15s
Wall time: 3min 19s


In [355]:
%%time
# Vectorize texts:
vectorized_texts = nlp(topic_modeling_df_dropped['clean_text'][0]).vector
idx = 1
for text in topic_modeling_df_dropped[1:]['clean_text']:
    vector = nlp(text).vector
    vectorized_texts = np.vstack([vectorized_texts, vector])

CPU times: user 25min 49s, sys: 11min 58s, total: 37min 48s
Wall time: 12min 40s


In [365]:
%%time
# Calculate cosine similarity matrix (n_jobs = -1 means use all CPU cores):
cos_sim_mat = (pairwise_distances(vectorized_texts, metric='cosine', n_jobs = -1))

CPU times: user 4.38 s, sys: 1min 46s, total: 1min 51s
Wall time: 1min 59s


In [369]:
cos_sim_mat.shape

(15500, 15500)

In [494]:
def make_bias_df(cnn_idx_arr, target_source_idx_arr, target_article_category, topic, cos_sim_mat):
    """Collect the matrix entries between two groups of articles in long form.

    Every combination of a row position from ``cnn_idx_arr`` and a column position
    from ``target_source_idx_arr`` is read from ``cos_sim_mat`` and returned as one
    row of the result. This is the vectorized replacement for the earlier
    row-by-row ``DataFrame.append`` implementation.

    Parameters
    ----------
    cnn_idx_arr : 1-D sequence of int
        Row positions in ``cos_sim_mat`` (the baseline articles).
    target_source_idx_arr : 1-D sequence of int
        Column positions in ``cos_sim_mat`` (the articles compared against).
    target_article_category : scalar
        Label copied into every row; callers pass the target source name.
    topic : scalar
        Topic identifier copied into every row.
    cos_sim_mat : 2-D numpy.ndarray
        Pairwise matrix to read values from.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_article_category``, ``topic`` and ``cos_similarity``, with
        ``len(cnn_idx_arr) * len(target_source_idx_arr)`` rows in row-major order
        (all targets for the first baseline article, then the second, ...).
        Empty index arrays give an empty frame with the same columns.
    """
    # np.ix_ picks the (rows x columns) sub-block directly, without first copying
    # every full row of the matrix.
    block = cos_sim_mat[np.ix_(cnn_idx_arr, target_source_idx_arr)]

    similarity_df = pd.DataFrame({'cos_similarity': block.flatten()})
    # Scalars are broadcast to every row; inserting at position 0 twice reproduces
    # the column order target_article_category, topic, cos_similarity.
    similarity_df.insert(0, 'topic', topic)
    similarity_df.insert(0, 'target_article_category', target_article_category)
    return similarity_df

In [381]:
# Source that every other source is compared against in the cells below.
baseline_source = 'cnn'
# Distinct topic labels and news sources present in the modelling frame.
topics = topic_modeling_df_dropped['Topic'].unique()
sources = topic_modeling_df_dropped['source'].unique()

In [379]:
cos_bias_df = pd.DataFrame(columns=['Topic', 'Baseline Source', 'Baseline Article Index', 'Target Source', 'Target Source Article Index', 'Bias'])

In [402]:
# NOTE(review): this looks like an unfinished draft of the per-topic comparison;
# `all_topic_matrices` is never filled, and the loop only recomputes index arrays.
# The cell that iterates over `chosen_topics` further down does the actual work.
all_topic_matrices = []
all_topic_indx = np.arange(0, 23)
print(all_topic_indx)
for topic_idx in all_topic_indx:
    # Rows whose dominant topic is the current topic id.
    topic_filter = (topic_modeling_df_dropped.Dominant_Topic == all_topic_indx[topic_idx])
    idx_thistopic = topic_modeling_df_dropped[topic_filter].index.values
    
    # Split rows by source: the baseline source versus every other source.
    baseline_filter = (topic_modeling_df_dropped.source == baseline_source)
    othersource_filter = (topic_modeling_df_dropped.source != baseline_source)
    
    # Index labels of this topic's articles, separately for baseline and other sources.
    idx_baseline = topic_modeling_df_dropped[topic_filter & baseline_filter].index.values
    idx_other = topic_modeling_df_dropped[topic_filter & othersource_filter].index.values
    
    # NOTE(review): bare name that is not assigned anywhere in this cell; evaluating it
    # raises NameError unless another cell defined it. Presumably a placeholder.
    baseline_mat
    
#     for current_topic_idx in idx_thistopic:
#         other_sources_idxs = 
#         print(current_topic_idx)


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]


In [399]:
# Scratch check for topic id 0 only: index labels of the rows selected by the two masks.
topic_filter = (topic_modeling_df_dropped.Dominant_Topic == 0)
idx_thistopic = topic_modeling_df_dropped[topic_filter].index.values
# NOTE(review): despite the `baseline_*` names, `!=` selects the rows that are NOT from
# `baseline_source`; the other cells build `baseline_filter` with `==` -- confirm which
# one is intended before reusing `idx_baseline` from this cell.
baseline_filter = (topic_modeling_df_dropped.source != baseline_source)
idx_baseline = topic_modeling_df_dropped[topic_filter & baseline_filter].index.values


In [412]:
# Human-readable label for each of the 23 topic ids (0-22).
# NOTE(review): this cell repeats, entry for entry, the `topic_translation` mapping
# assigned in the earlier topic-labelling cell; keep a single definition.
topic_translation = {
    0: 'Unknown',
    1: 'Sports',
    2: 'Political Investigation (Cohen/Virginia)',
    3: 'Military Nuclear Technology',
    4: 'Film Industry',
    5: 'Cybersecurity',
    6: 'Entertainment Industry',
    7: 'Health',
    8: 'Brexit',
    9: 'Space Industry',
    10: 'US Campaign Politics',
    11: 'Medical Research',
    12: 'Economy',
    13: 'Emergencies',
    14: 'Religious Intolerance',
    15: 'Family Affairs',
    16: 'International Affairs (Middle East and South Asia)',
    17: 'International Affairs (Latin America)',
    18: 'Corporate News',
    19: 'Fashion/Issues of Race',
    20: 'Crime',
    21: 'Social Media',
    22: 'Nature and Environment',
}

In [519]:
%%time
chosen_topics = [1, 2, 10, 12, 13, 15, 18]
all_dfs = []
baseline_source = 'cnn'
for topic_num in chosen_topics:
    print('TOPIC NUM', topic_num)
    topic_filter = (topic_modeling_df_dropped.Dominant_Topic == topic_num)
    idx_thistopic = topic_modeling_df_dropped[topic_filter].index.values
    
    baseline_filter = (topic_modeling_df_dropped.source == baseline_source)
    othersource_filter = (topic_modeling_df_dropped.source != baseline_source)
    
    idx_baseline = topic_modeling_df_dropped[topic_filter & baseline_filter].index.values
    idx_other = topic_modeling_df_dropped[topic_filter & othersource_filter].index.values

    other_sources_df = topic_modeling_df_dropped[topic_filter & othersources_filter]
    for source in other_sources_df.source.unique():
        print('SOURCE:', source)
        other_source_filter = (other_sources_df.source == source)
        other_source_df = other_sources_df[other_source_filter]  # Other source in this topic
        
        other_source_idx = other_source_df.index.values

        bias_df = make_bias_df(cnn_idx_arr=idx_baseline, 
                           target_source_idx_arr=other_source_idx, 
                           target_article_category=source,
                           topic=topic_num, 
                           cos_sim_mat=cos_sim_mat)
        print(len(bias_df))
        all_dfs.append(bias_df)

TOPIC NUM 1
SOURCE: bbc
14673
SOURCE: fox
11859
SOURCE: rt
20234
TOPIC NUM 2
SOURCE: bbc
329175
SOURCE: fox
454608
SOURCE: rt
317394
TOPIC NUM 10
SOURCE: bbc
4508
SOURCE: fox
23667
SOURCE: rt
10465
TOPIC NUM 12
SOURCE: bbc
72688
SOURCE: fox
24072
SOURCE: rt
51684
TOPIC NUM 13
SOURCE: bbc
17017
SOURCE: fox
23205
SOURCE: rt
21182
TOPIC NUM 15
SOURCE: bbc
525
SOURCE: fox
987
SOURCE: rt
336
TOPIC NUM 18
SOURCE: bbc
2904
SOURCE: fox
1628
SOURCE: rt
704
CPU times: user 549 ms, sys: 211 ms, total: 760 ms
Wall time: 747 ms


In [520]:
dfs = pd.concat(all_dfs)

In [522]:
dfs.groupby(['topic', 'target_article_category']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
topic,target_article_category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1,bbc,14673.0,0.169127,0.075379,0.020208,0.114543,0.156483,0.21019,0.529444
1,fox,11859.0,0.191606,0.057559,0.020436,0.152316,0.185982,0.226783,0.561031
1,rt,20234.0,0.160198,0.052404,0.017078,0.125263,0.156119,0.190256,0.493792
2,bbc,329175.0,0.151207,0.045357,0.01386,0.119517,0.145975,0.177093,0.473316
2,fox,454608.0,0.163898,0.055034,0.011135,0.123986,0.158297,0.197952,0.463171
2,rt,317394.0,0.155853,0.048168,0.015631,0.121813,0.150385,0.183787,0.424137
10,bbc,4508.0,0.114493,0.046113,0.018553,0.078551,0.109964,0.143592,0.322399
10,fox,23667.0,0.119737,0.052337,0.013602,0.081506,0.110652,0.149507,0.385575
10,rt,10465.0,0.122466,0.050253,0.023657,0.084743,0.114,0.151175,0.34766
12,bbc,72688.0,0.152829,0.045116,0.017159,0.121037,0.148329,0.179523,0.373009


In [525]:
from statsmodels.stats.weightstats import ttest_ind

In [539]:
def _topic_source_values(topic, source):
    """Return the `cos_similarity` column of `dfs` for one (topic, target source) pair."""
    selected = (dfs.topic == topic) & (dfs.target_article_category == source)
    return dfs[selected].cos_similarity


# Naming: data<source>_<topic>, where source 1 = bbc, 2 = rt, 3 = fox and
# topic 1 = Sports (id 1), 2 = Political Investigation (id 2), 3 = Family Affairs (id 15).

# Sports:
data1_1 = _topic_source_values(1, 'bbc')
data2_1 = _topic_source_values(1, 'rt')
data3_1 = _topic_source_values(1, 'fox')

# Political Investigation:
data1_2 = _topic_source_values(2, 'bbc')
data2_2 = _topic_source_values(2, 'rt')
data3_2 = _topic_source_values(2, 'fox')

# Family Affairs:
data1_3 = _topic_source_values(15, 'bbc')
data2_3 = _topic_source_values(15, 'rt')
data3_3 = _topic_source_values(15, 'fox')

# Two-sided t-test with unequal variances, Sports vs Political Investigation,
# printed per source in the order bbc, rt, fox.
for sports_values, politics_values in ((data1_1, data1_2),
                                       (data2_1, data2_2),
                                       (data3_1, data3_2)):
    print(ttest_ind(sports_values, politics_values, alternative='two-sided', usevar='unequal'))

(28.569813463633917, 6.66627851194099e-175, 15149.283945987134)
(11.48994175404949, 1.803216809039804e-30, 22467.251482507054)
(51.809587918965775, 0.0, 12430.22179165013)


In [541]:
# Difference in mean `cos_similarity` between two topics, one printed line per pair.
# Each group lists the sources in the order BBC, RT, FOX.

# Sports vs Political Investigation:
sports_vs_politics = [(data1_1, data1_2), (data2_1, data2_2), (data3_1, data3_2)]
# Political Investigation vs Family Affairs:
politics_vs_family = [(data1_2, data1_3), (data2_2, data2_3), (data3_2, data3_3)]

for first_topic, second_topic in sports_vs_politics + politics_vs_family:
    print(np.mean(first_topic) - np.mean(second_topic))



0.017919614911079407
0.004344955086708069
0.02770853042602539
-0.025405123829841614
-0.018779411911964417
-0.0314674973487854


In [516]:
bias_df_cnn_fox.groupby(['topic', 'target_article_category']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
topic,target_article_category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1,fox,11859.0,0.191606,0.057559,0.020436,0.152316,0.185982,0.226783,0.561031


In [517]:
11859.0 * 3

35577.0

In [514]:
dfs.groupby(['topic', 'target_article_category']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
topic,target_article_category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1,bbc,46766.0,0.170963,0.063009,0.017078,0.128123,0.163974,0.20557,0.561031
1,fox,46766.0,0.170963,0.063009,0.017078,0.128123,0.163974,0.20557,0.561031
1,rt,46766.0,0.170963,0.063009,0.017078,0.128123,0.163974,0.20557,0.561031
2,bbc,1101177.0,0.157782,0.050619,0.011135,0.121825,0.151773,0.187499,0.473316
2,fox,1101177.0,0.157782,0.050619,0.011135,0.121825,0.151773,0.187499,0.473316
2,rt,1101177.0,0.157782,0.050619,0.011135,0.121825,0.151773,0.187499,0.473316
10,bbc,38640.0,0.119865,0.051136,0.013602,0.082083,0.111532,0.149225,0.385575
10,fox,38640.0,0.119865,0.051136,0.013602,0.082083,0.111532,0.149225,0.385575
10,rt,38640.0,0.119865,0.051136,0.013602,0.082083,0.111532,0.149225,0.385575
12,bbc,148444.0,0.170088,0.052384,0.017159,0.132183,0.164504,0.202935,0.42231


In [449]:
%%time
bias_df_cnn_fox = make_bias_df(cnn_idx_arr=cnn_topic1_idx, 
                       target_source_idx_arr=fox_topic1_idx, 
                       target_article_category='fox',
                       topic=1, 
                       cos_sim_mat=cos_sim_mat)

OUTER LOOP 0
OUTER LOOP 1
OUTER LOOP 2
OUTER LOOP 3
OUTER LOOP 4
OUTER LOOP 5
OUTER LOOP 6
OUTER LOOP 7
OUTER LOOP 8
OUTER LOOP 9
OUTER LOOP 10
OUTER LOOP 11
OUTER LOOP 12
OUTER LOOP 13
OUTER LOOP 14
OUTER LOOP 15
OUTER LOOP 16
OUTER LOOP 17
OUTER LOOP 18
OUTER LOOP 19
OUTER LOOP 20
OUTER LOOP 21
OUTER LOOP 22
OUTER LOOP 23
OUTER LOOP 24
OUTER LOOP 25
OUTER LOOP 26
OUTER LOOP 27
OUTER LOOP 28
OUTER LOOP 29
OUTER LOOP 30
OUTER LOOP 31
OUTER LOOP 32
OUTER LOOP 33
OUTER LOOP 34
OUTER LOOP 35
OUTER LOOP 36
OUTER LOOP 37
OUTER LOOP 38
OUTER LOOP 39
OUTER LOOP 40
OUTER LOOP 41
OUTER LOOP 42
OUTER LOOP 43
OUTER LOOP 44
OUTER LOOP 45
OUTER LOOP 46
OUTER LOOP 47
OUTER LOOP 48
OUTER LOOP 49
OUTER LOOP 50
OUTER LOOP 51
OUTER LOOP 52
OUTER LOOP 53
OUTER LOOP 54
OUTER LOOP 55
OUTER LOOP 56
OUTER LOOP 57
OUTER LOOP 58
OUTER LOOP 59
OUTER LOOP 60
OUTER LOOP 61
OUTER LOOP 62
OUTER LOOP 63
OUTER LOOP 64
OUTER LOOP 65
OUTER LOOP 66
CPU times: user 1min 10s, sys: 535 ms, total: 1min 11s
Wall time: 1min 1

In [450]:
bias_df_cnn_fox

Unnamed: 0,cnn_article,target_article,target_article_category,topic,cos_similarity
0,11780,3701,fox,1,0.171282
1,11780,3702,fox,1,0.188764
2,11780,3705,fox,1,0.253975
3,11780,3706,fox,1,0.136744
4,11780,3708,fox,1,0.300441
5,11780,3709,fox,1,0.163745
6,11780,3711,fox,1,0.201322
7,11780,3712,fox,1,0.213863
8,11780,3716,fox,1,0.200604
9,11780,3718,fox,1,0.232074


In [495]:
%%time
test_df = make_bias_df(cnn_idx_arr=cnn_topic1_idx, 
                       target_source_idx_arr=fox_topic1_idx, 
                       target_article_category='fox',
                       topic=1, 
                       cos_sim_mat=cos_sim_mat)

CPU times: user 5.5 ms, sys: 2.16 ms, total: 7.67 ms
Wall time: 6.22 ms


In [496]:
test_df

Unnamed: 0,target_article_category,topic,cos_similarity
0,fox,1,0.171282
1,fox,1,0.188764
2,fox,1,0.253975
3,fox,1,0.136744
4,fox,1,0.300441
5,fox,1,0.163745
6,fox,1,0.201322
7,fox,1,0.213863
8,fox,1,0.200604
9,fox,1,0.232074


In [486]:
filt1.flatten()

array([0.17128193, 0.18876427, 0.25397456, ..., 0.12662601, 0.1573745 ,
       0.18759167], dtype=float32)

In [488]:
cnn_topic1_idx

array([11780, 11782, 11883, 11917, 11936, 11939, 11970, 11993, 12095,
       12182, 12183, 12283, 12391, 12415, 12419, 12488, 12642, 12738,
       12832, 12942, 13026, 13122, 13199, 13200, 13229, 13241, 13286,
       13319, 13320, 13321, 13348, 13358, 13385, 13673, 13686, 13758,
       13833, 13923, 13926, 13928, 13930, 14040, 14098, 14171, 14213,
       14259, 14283, 14388, 14482, 14483, 14567, 14568, 14664, 14665,
       14713, 14714, 14779, 14790, 15034, 15042, 15195, 15197, 15198,
       15275, 15355, 15356, 15462])

In [385]:
cos_sim_mat.shape

(15500, 15500)

In [386]:
15500 ** 2

240250000