In [362]:
%load_ext autoreload
%autoreload 2
import sys
from os import listdir
import csv
import pprint
import pickle
import pandas as pd
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.io as pio
from plotly import tools
from helpers import functions
from sklearn.metrics import pairwise_distances

plotly.offline.init_notebook_mode(connected=True)
csv.field_size_limit(sys.maxsize)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


9223372036854775807

### Load the data:

In [2]:
# Paths of the pre-processed article CSVs, two per news source (BBC, Fox, CNN, RT).
# NOTE: despite the `_dir` suffix, every variable holds the path of a single CSV
# file, not a directory; they are passed to pd.read_csv / open() below.
# Presumably "NoDupesWithin" = duplicates removed within one source and
# "NoDupesAtAll" = duplicates removed across all sources -- TODO confirm.
bbc_no_dupes_dir = 'data/processed/2019_03_27_BBC_NoDupesWithin.csv'
bbc_no_dupesatall_dir = 'data/processed/2019_03_27_BBC_NoDupesAtAll.csv'
fox_no_dupes_dir = 'data/processed/2019_03_27_FOX_NoDupesWithin.csv'
fox_no_dupesatall_dir = 'data/processed/2019_03_27_FOX_NoDupesAtAll.csv'
cnn_no_dupes_dir = 'data/processed/2019_03_28_CNN_NoDupesWithin.csv'
cnn_no_dupesatall_dir = 'data/processed/2019_03_28_CNN_NoDupesAtAll.csv'
rt_no_dupes_dir = 'data/processed/2019_03_26_RT_NoDupesWithin.csv'
rt_no_dupesatall_dir = 'data/processed/2019_03_26_RT_NoDupesAtAll.csv'

In [3]:
# Columns of interest in the article CSVs.
# Each name appears exactly once: a duplicated entry ('paper_section_name' used
# to be listed twice) yields a duplicated column when used for selection and is
# rejected by pd.read_csv(names=...).
cols_to_keep = ['source',
                'paper_section_name',
                'source_url',
                'url',
                'canonical_link',
                'rss_link',
                'title',
                'rss_title',
                'text',
                'pull_date']

BBC:

In [4]:
%%time
df_bbc_clean_nodupes_csv = pd.read_csv(bbc_no_dupes_dir)

CPU times: user 9.16 s, sys: 1.48 s, total: 10.6 s
Wall time: 10.7 s


In [5]:
%%time
df_bbc_clean_nodupes_atall_csv = pd.read_csv(bbc_no_dupesatall_dir)

CPU times: user 7.28 s, sys: 1.3 s, total: 8.58 s
Wall time: 8.68 s


Fox News:

In [6]:
%%time
df_fox_clean_nodupes_csv = pd.read_csv(fox_no_dupes_dir)

CPU times: user 3.31 s, sys: 514 ms, total: 3.83 s
Wall time: 3.84 s


In [7]:
%%time
df_fox_clean_nodupes_atall_csv = pd.read_csv(fox_no_dupesatall_dir)

CPU times: user 3.36 s, sys: 860 ms, total: 4.22 s
Wall time: 4.28 s


RT:

In [8]:
%%time
df_rt_clean_nodupes_csv = pd.read_csv(rt_no_dupes_dir)

CPU times: user 4.72 s, sys: 1.51 s, total: 6.23 s
Wall time: 6.46 s


In [9]:
%%time
df_rt_clean_nodupes_atall_csv = pd.read_csv(rt_no_dupesatall_dir)

CPU times: user 3.21 s, sys: 654 ms, total: 3.87 s
Wall time: 3.9 s


Loading the CNN CSV files with pandas takes too long because they are too bulky. Let's use the `csv` module instead:

In [10]:
# %%time
# df_cnn_clean_nodupes_csv = pd.read_csv(cnn_no_dupes_dir, names=['text'])

In [11]:
# %%time
# df_cnn_clean_nodupes_atall_csv = pd.read_csv(cnn_no_dupesatall_dir, names=cols_to_keep)

In [12]:
%%time
# Read only the article text of every CNN row ("NoDupesWithin" file).
cnn_clean_nodupes_texts = []
# newline='' lets the csv module do its own line-ending handling, which is what
# the csv documentation requires for quoted fields containing line breaks
# (article bodies are multi-line text).
with open(cnn_no_dupes_dir, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # Skip header (no-op on an empty file)
    # Field 4 is taken as the article text -- assumes the CNN file has the same
    # column layout as the other sources; TODO confirm.
    cnn_clean_nodupes_texts.extend(row[4] for row in csv_reader)

CPU times: user 3min 1s, sys: 16.1 s, total: 3min 17s
Wall time: 3min 20s


In [13]:
len(cnn_clean_nodupes_texts)

6050

In [14]:
%%time
# Read only the article text of every CNN row ("NoDupesAtAll" file).
cnn_clean_nodupes_atall_texts = []
# newline='' lets the csv module do its own line-ending handling, which is what
# the csv documentation requires for quoted fields containing line breaks
# (article bodies are multi-line text).
with open(cnn_no_dupesatall_dir, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # Skip header (no-op on an empty file)
    # Field 4 is taken as the article text -- assumes the CNN file has the same
    # column layout as the other sources; TODO confirm.
    cnn_clean_nodupes_atall_texts.extend(row[4] for row in csv_reader)

CPU times: user 2min 3s, sys: 12 s, total: 2min 15s
Wall time: 2min 17s


In [15]:
len(cnn_clean_nodupes_atall_texts)

3942

In [16]:
%%time
# Smash all data together:
frames = [df_bbc_clean_nodupes_atall_csv, df_fox_clean_nodupes_atall_csv, df_rt_clean_nodupes_atall_csv]
df_master = pd.concat(frames, ignore_index=True)

CPU times: user 86.1 ms, sys: 270 ms, total: 356 ms
Wall time: 416 ms


In [17]:
df_master

Unnamed: 0,source_url,url,title,movies,text,keywords,meta_keywords,tags,authors,publish_date,...,rss_link,rss_id,rss_published,rss_published_parsed,rss_feedburner_origlink,paper_section_name,source,source_detail,pull_type,pull_date
0,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47063405,How has business been affected by Brexit so far?,[],With two months to go until the UK is due to l...,[],[''],set([]),"[u'Economics Correspondent', u'Dharshini David']",,...,https://www.bbc.co.uk/news/business-47063405,https://www.bbc.co.uk/news/business-47063405,"Thu, 31 Jan 2019 00:03:21 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
1,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47066873,China's factory activity shrinks as slowdown w...,[],Chinese factory activity contracted for a seco...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47066873,https://www.bbc.co.uk/news/business-47066873,"Thu, 31 Jan 2019 02:53:39 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
2,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47057870,MPs say fast fashion brands inaction on ethics...,[],"Fashion retailers JD Sports, Sports Direct and...",[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47057870,https://www.bbc.co.uk/news/business-47057870,"Thu, 31 Jan 2019 00:12:15 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
3,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47055188,Brexit: Car investment halves as industry hits...,[],Investment in the UK car sector almost halved ...,[],[''],set([]),"[u'Business Reporter', u'Russell Hotten', u'Bb...",,...,https://www.bbc.co.uk/news/business-47055188,https://www.bbc.co.uk/news/business-47055188,"Thu, 31 Jan 2019 00:04:41 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
4,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47065972,Facebook users continue to grow despite privac...,[],Facebook users have continued to rise despite ...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47065972,https://www.bbc.co.uk/news/business-47065972,"Wed, 30 Jan 2019 22:25:12 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
5,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47062145,Fed puts future rate rises on hold as pledges ...,[],The Federal Reserve has indicated it won't rai...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47062145,https://www.bbc.co.uk/news/business-47062145,"Wed, 30 Jan 2019 21:17:01 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
6,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47063403,Bosses' group head in abrupt departure,[],The boss of one of the UK's largest business l...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47063403,https://www.bbc.co.uk/news/business-47063403,"Wed, 30 Jan 2019 18:19:35 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
7,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47062142,Foxconn reconsiders Wisconsin factory plans,[],"Foxconn, which raised hopes of a US manufactur...",[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47062142,https://www.bbc.co.uk/news/business-47062142,"Wed, 30 Jan 2019 19:41:24 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
8,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47062146,Tesla reports profit as issues stabilise,[],Tesla made a profit of $139.5m (£106.4m) in th...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47062146,https://www.bbc.co.uk/news/business-47062146,"Wed, 30 Jan 2019 23:51:34 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31
9,https://www.bbc.com/news,https://www.bbc.co.uk/news/business-47060676,Barclays shifts billions of pounds to Dublin b...,[],Barclays is moving €190bn (£166bn) of assets t...,[],[''],set([]),[],,...,https://www.bbc.co.uk/news/business-47060676,https://www.bbc.co.uk/news/business-47060676,"Wed, 30 Jan 2019 16:37:31 GMT","time.struct_time(tm_year=2019, tm_mon=1, tm_md...",NotFound,bbc_news_business,bbc,bbc_news_business,custom,2019-01-31


### Topic Modelling
(based on https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/):

In [18]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [19]:
%%time
# Basic text cleaning: run the project's clean_text_string helper over every
# article body, both for the pandas-loaded sources and for the CNN text list.
df_master['clean_text'] = df_master['text'].apply(
    functions.clean_text_string,
    keep_dbl_newline=False,
)
cnn_nodupes_atall_texts_cleaned = [
    functions.clean_text_string(raw_text, keep_dbl_newline=False)
    for raw_text in cnn_clean_nodupes_atall_texts
]

CPU times: user 11.6 s, sys: 189 ms, total: 11.8 s
Wall time: 11.8 s


In [20]:
# Stop words to strip out later: NLTK's English list plus a few corpus-specific
# additions. "bbc radio live" appears often, and 'may' also turns up a lot
# without carrying much meaning.
extra_stop_words = ['also', 'bbc', 'radio', 'live', 'may']
stop_words = stopwords.words('english') + extra_stop_words

In [21]:
# Convert our text strings to list:
data = df_master.clean_text.values.tolist()
data.extend(cnn_nodupes_atall_texts_cleaned)

In [22]:
# Extra text cleaning, just in case.
# The patterns are raw strings: '\S' / '\s' inside a normal string literal are
# invalid escape sequences and trigger Deprecation/SyntaxWarnings on newer Pythons.
email_pattern = re.compile(r'\S*@\S*\s?')   # e-mail addresses (plus one trailing space)
whitespace_pattern = re.compile(r'\s+')      # newlines and any other whitespace runs

# Single pass per document, applied in the same order as before:
# 1) remove e-mails, 2) collapse whitespace to one space, 3) drop single quotes.
data = [
    whitespace_pattern.sub(' ', email_pattern.sub('', sent)).replace("'", "")
    for sent in data
]

In [23]:
data_words = list(functions.sent_to_words(data))

#### Remove Stopwords, Make Bigrams/Trigrams and Lemmatize:

In [24]:
# Define functions for stopwords, bigrams, trigrams
def remove_stopwords(texts):
    """Re-tokenise every document and drop stop words.

    Each element of `texts` is converted with ``str()`` and tokenised again by
    gensim's ``simple_preprocess``; tokens found in the global ``stop_words``
    collection are removed.

    Returns a list with one token list per input document.
    """
    # Hash-based lookup instead of scanning the stop word list for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set]
            for doc in texts]

def make_bigrams(texts):
    """Run every tokenised document through the global ``bigram_mod`` phraser.

    Returns a list holding, for each document in `texts`, the result of
    ``bigram_mod[document]``.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to every document.

    Relies on the globals ``bigram_mod`` and ``trigram_mod``. The only visible
    construction of ``trigram_mod`` in this notebook is commented out, so that
    name must be defined before this function is called on a non-empty input.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(trigram_mod[bigram_mod[tokens]])
    return phrased_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    See https://spacy.io/api/annotation for the tag set.

    Parameters
    ----------
    texts : iterable of token lists, one per document. The tokens of each
        document are joined with single spaces before being parsed.
    allowed_postags : iterable of coarse POS tags (``token.pos_`` values);
        only tokens carrying one of these tags are kept. Lists are still
        accepted; the default is a tuple so no mutable object is shared
        between calls.

    Returns
    -------
    list of lists with the ``lemma_`` of every kept token, one inner list per
    input document, in input order.

    Uses the global spaCy pipeline ``nlp``.
    """
    keep_pos = set(allowed_postags)
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the documents through the pipeline in batches, which is
    # faster than one nlp() call per document and yields the docs in order.
    return [[token.lemma_ for token in doc if token.pos_ in keep_pos]
            for doc in nlp.pipe(joined_docs)]

In [25]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [26]:
%%time
# Build the bigram and trigram models
# NOTE: the phrase model is fitted on `data_words` (stop words still present),
# while the next cell applies it to `data_words_nostops`.
# min_count=3 ignores pairs seen fewer than 3 times; threshold=70 is the score
# a pair must reach to be merged into one token.
bigram = gensim.models.Phrases(data_words, min_count=3, threshold=70) # higher threshold fewer phrases.
# Trigram model is currently disabled, so `trigram_mod` is never created here
# and make_trigrams() cannot be used as-is.
#trigram = gensim.models.Phrases(bigram[data_words], threshold=70)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)

CPU times: user 45.8 s, sys: 151 ms, total: 46 s
Wall time: 46 s


In [27]:
%%time
# Form Bigrams:
data_words_bigrams = make_bigrams(data_words_nostops)

CPU times: user 14.3 s, sys: 146 ms, total: 14.5 s
Wall time: 14.5 s


In [28]:
%%time
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

CPU times: user 25min 21s, sys: 1min 1s, total: 26min 22s
Wall time: 6min 40s


#### Create the Dictionary and Corpus needed for Topic Modeling:

In [29]:
%%time
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

CPU times: user 7.2 s, sys: 126 ms, total: 7.32 s
Wall time: 7.32 s


In [30]:
import time

#### Building LDA Model:

In [31]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every value in ``range(start, limit, step)``.

    Parameters:
    ----------
    dictionary : Gensim dictionary (used both to train the models and to score them)
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Separate name for the timer so the `start` parameter is not shadowed.
        started_at = time.time()
        print('Topic modelling for num_topics=', num_topics)
        # Train with the dictionary that was passed in rather than whatever
        # global `id2word` happens to exist in the notebook.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        finished_at = time.time()
        print('Finished num_topics', num_topics, 'with time', finished_at - started_at)

    return model_list, coherence_values

In [32]:
%%time

# Can take a long time to run.
model_list2, coherence_values2 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=10, step=1)

Topic modelling for num_topics= 2


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 2 with time 306.1885087490082
Topic modelling for num_topics= 3
Finished num_topics 3 with time 310.4450078010559
Topic modelling for num_topics= 4
Finished num_topics 4 with time 334.1452827453613
Topic modelling for num_topics= 5
Finished num_topics 5 with time 369.00795006752014
Topic modelling for num_topics= 6
Finished num_topics 6 with time 367.96039485931396
Topic modelling for num_topics= 7
Finished num_topics 7 with time 403.13715291023254
Topic modelling for num_topics= 8
Finished num_topics 8 with time 440.6737549304962
Topic modelling for num_topics= 9
Finished num_topics 9 with time 476.2478747367859
CPU times: user 2h 43min 57s, sys: 7min 54s, total: 2h 51min 51s
Wall time: 50min 7s


In [33]:
%%time
model_list3, coherence_values3 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=10, limit=15, step=1)

Topic modelling for num_topics= 10


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 10 with time 513.1146211624146
Topic modelling for num_topics= 11
Finished num_topics 11 with time 587.534184217453
Topic modelling for num_topics= 12
Finished num_topics 12 with time 571.5289018154144
Topic modelling for num_topics= 13
Finished num_topics 13 with time 599.4308750629425
Topic modelling for num_topics= 14
Finished num_topics 14 with time 628.6747941970825
CPU times: user 2h 35min 3s, sys: 8min 45s, total: 2h 43min 49s
Wall time: 48min 20s


In [34]:
%%time
model_list4, coherence_values4 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=15, limit=21, step=1)

Topic modelling for num_topics= 15


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 15 with time 691.0951118469238
Topic modelling for num_topics= 16
Finished num_topics 16 with time 705.7277450561523
Topic modelling for num_topics= 17
Finished num_topics 17 with time 764.0610220432281
Topic modelling for num_topics= 18
Finished num_topics 18 with time 802.133266210556
Topic modelling for num_topics= 19
Finished num_topics 19 with time 842.1322269439697
Topic modelling for num_topics= 20
Finished num_topics 20 with time 836.864264011383
CPU times: user 4h 1min 16s, sys: 13min 38s, total: 4h 14min 54s
Wall time: 1h 17min 22s


In [57]:
%%time
model_list5, coherence_values5 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=21, limit=25, step=1)

Topic modelling for num_topics= 21


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 21 with time 836.3872618675232
Topic modelling for num_topics= 22
Finished num_topics 22 with time 882.4118032455444
Topic modelling for num_topics= 23
Finished num_topics 23 with time 941.8384079933167
Topic modelling for num_topics= 24
Finished num_topics 24 with time 1067.5023818016052
CPU times: user 2h 51min 41s, sys: 10min 31s, total: 3h 2min 12s
Wall time: 1h 2min 8s


In [58]:
%%time
model_list6, coherence_values6 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=25, limit=30, step=1)

Topic modelling for num_topics= 25


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 25 with time 1114.7858200073242
Topic modelling for num_topics= 26
Finished num_topics 26 with time 982.3471877574921
Topic modelling for num_topics= 27
Finished num_topics 27 with time 983.6640148162842
Topic modelling for num_topics= 28
Finished num_topics 28 with time 1026.3176009654999
Topic modelling for num_topics= 29
Finished num_topics 29 with time 1049.0155730247498
CPU times: user 3h 42min 50s, sys: 12min 13s, total: 3h 55min 4s
Wall time: 1h 25min 56s


In [67]:
%%time
model_list7, coherence_values7 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=30, limit=46, step=1)

Topic modelling for num_topics= 30


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Finished num_topics 30 with time 1263.7574751377106
Topic modelling for num_topics= 31
Finished num_topics 31 with time 1279.618304014206
Topic modelling for num_topics= 32
Finished num_topics 32 with time 1066.8624303340912
Topic modelling for num_topics= 33
Finished num_topics 33 with time 1150.6826491355896
Topic modelling for num_topics= 34
Finished num_topics 34 with time 1294.0996840000153
Topic modelling for num_topics= 35
Finished num_topics 35 with time 1459.2808079719543
Topic modelling for num_topics= 36
Finished num_topics 36 with time 1467.4793231487274
Topic modelling for num_topics= 37
Finished num_topics 37 with time 1598.359689950943
Topic modelling for num_topics= 38
Finished num_topics 38 with time 1523.7713379859924
Topic modelling for num_topics= 39
Finished num_topics 39 with time 1295.1958329677582
Topic modelling for num_topics= 40
Finished num_topics 40 with time 1332.480339050293
Topic modelling for num_topics= 41
Finished num_topics 41 with time 1398.60217595

In [68]:
models = model_list2 + model_list3 + model_list4 + model_list5 + model_list6 + model_list7
coherence_values = coherence_values2 + coherence_values3 + coherence_values4 + coherence_values5 + coherence_values6 + coherence_values7

In [69]:
%%time
for model in models:
    name = 'models/2019_04_16_optimal_model_topics' + str(model.num_topics) + '.model'
    model.save(name)

CPU times: user 3.65 s, sys: 3.9 s, total: 7.55 s
Wall time: 10.1 s


In [33]:
cohvals_path = 'models/2019_04_16_coherences_values.txt'

In [31]:
%%time
# Load models:
models = []
directory_list = sorted(listdir('models/'))
for filename in directory_list:
    if filename.startswith('2019_04_16_optimal_model_topics') and filename.endswith('.model'):
        models.append(gensim.models.LdaModel.load('models/' + filename))

# File names are not zero-padded, so the lexicographic listing puts e.g.
# '...topics10.model' before '...topics2.model'. Order the models by their
# actual number of topics so that list position i corresponds to the i-th
# smallest topic count (as the coherence plot and `models[...]` lookups assume).
models.sort(key=lambda lda_model: lda_model.num_topics)

CPU times: user 4.28 s, sys: 2.04 s, total: 6.31 s
Wall time: 7.66 s


In [32]:
%%time
# Recalculate coherence scores: one c_v value per loaded model, in list order.
coherence_values = []
for model_num, model in enumerate(models):
    print('Model Num:', model_num)
    scorer = CoherenceModel(model=model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_values.append(scorer.get_coherence())

Model Num: 0
Model Num: 1
Model Num: 2
Model Num: 3
Model Num: 4
Model Num: 5
Model Num: 6
Model Num: 7
Model Num: 8
Model Num: 9
Model Num: 10
Model Num: 11
Model Num: 12
Model Num: 13
Model Num: 14
Model Num: 15
Model Num: 16
Model Num: 17
Model Num: 18
Model Num: 19
Model Num: 20
Model Num: 21
Model Num: 22
Model Num: 23
Model Num: 24
Model Num: 25
Model Num: 26
Model Num: 27
Model Num: 28
Model Num: 29
Model Num: 30
Model Num: 31
Model Num: 32
Model Num: 33
Model Num: 34
Model Num: 35
Model Num: 36
Model Num: 37
Model Num: 38
Model Num: 39
Model Num: 40
Model Num: 41
Model Num: 42
Model Num: 43
CPU times: user 5min 59s, sys: 53.9 s, total: 6min 53s
Wall time: 1h 17min 51s


In [34]:
coherence_values

[0.30473636911922386,
 0.3249027080728624,
 0.36982381703272493,
 0.37701630145289894,
 0.3854396728864671,
 0.4329426205990265,
 0.4216982813809473,
 0.4555881561662689,
 0.4648887682306257,
 0.48768835708871144,
 0.46718835265743025,
 0.4684259779219651,
 0.4914104782543593,
 0.5022809284997043,
 0.5222553455726212,
 0.48111508767007305,
 0.4986223190092909,
 0.510688908508308,
 0.5174032245139688,
 0.531568668993731,
 0.5038390875972081,
 0.5378038250959394,
 0.48129993778376196,
 0.5140223690687207,
 0.5028178343016804,
 0.4574762825080305,
 0.4771254938124307,
 0.49710498803257114,
 0.4910268470164026,
 0.4620912283112936,
 0.48919822358988385,
 0.4959023379074466,
 0.5056126999180991,
 0.49691247796758997,
 0.5051982593916104,
 0.49407509555352913,
 0.4917995221636159,
 0.4768102656090244,
 0.4990408325751924,
 0.5088295606208432,
 0.48737351666448814,
 0.501472764690317,
 0.47604787833479184,
 0.4870424415545807]

In [38]:
%%time
with open(cohvals_path, 'wb') as fp:   #Pickling
    pickle.dump(coherence_values, fp)

CPU times: user 382 µs, sys: 810 µs, total: 1.19 ms
Wall time: 964 µs


In [39]:
## Load Coherence values:
with open(cohvals_path, 'rb') as fp:
    coherence_values = pickle.load(fp)

In [40]:
len(coherence_values)

44

In [41]:
x = np.arange(2,46)

In [113]:
# Line plot of c_v coherence against the number of topics.
trace = go.Scatter(
    line = dict(width=1),
    x = x,
    y = coherence_values
)

layout = go.Layout(
    title=go.layout.Title(
        text='Topic Coherence',
        xref='paper',
        y=.88,
        font=dict(size=20)
    ),
    xaxis=dict(
        dtick=1,
        tick0=0,
        tickfont=dict(size=8),
        title='Number of Topics',
        titlefont=dict(size=14),
        tickangle=-45,
    ),
    yaxis=dict(
        dtick=.02,
        tick0=0,
        title='CV Coherence',
        tickfont=dict(size=10)
    ),
    # Arrow marking the best score found (23 topics).
    annotations=[
        dict(
            x=23,
            y=0.5378038,
            xref='x',
            yref='y',
            text='(23, 0.5378038)',
            showarrow=True,
            arrowhead=3,
            ax=0,
            ay=-40
        )]
)

# Pass the trace list directly instead of rebinding the global `data`, which
# holds the cleaned document texts and is handed to format_topics_sentences later.
fig1 = go.Figure(data=[trace], layout=layout)
# Display the figure built above (`fig` was never defined in this notebook).
plotly.offline.iplot(fig1)

pio.write_image(fig1, 'papers/presentation/topic_coherence.pdf')

In [131]:
df_cnn = pd.DataFrame(columns=df_master.columns)

In [132]:
df_cnn['clean_text'] = cnn_nodupes_atall_texts_cleaned

In [134]:
df_cnn['source'] = 'cnn'

In [166]:
cnn_nodupes_atall_texts_cleaned[-1]

''

In [140]:
print(len(cnn_nodupes_atall_texts_cleaned))
print(len(df_master))

3942
11603


In [143]:
df_master_wcnn = pd.concat([df_master, df_cnn], ignore_index=True)

In [145]:
df_master_wcnn.to_csv('data/processed/2019_04_17_AllWCNN.csv', index=False)

### Adding dominant topics to data:

In [149]:
# Pick the model with 23 topics by its topic count rather than by list
# position, so the choice does not depend on the order in which the models
# were loaded. Raises StopIteration if no such model is present.
optimal_model = next(lda_model for lda_model in models if lda_model.num_topics == 23)

In [150]:
%%time
def format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data):
    """Find the dominant topic of every document in `corpus`.

    Parameters
    ----------
    ldamodel : topic model. ``ldamodel[corpus]`` must yield, per document, a
        sequence whose first element is a list of ``(topic_id, probability)``
        pairs, and ``ldamodel.show_topic(topic_id)`` must return
        ``(word, weight)`` pairs.
    corpus : the documents to score, in the form accepted by ``ldamodel[...]``.
    texts : the original texts, one per document, appended as the last column.

    Returns
    -------
    DataFrame with one row per document and the columns 'Dominant_Topic',
    'Perc_Contribution' (rounded to 4 decimals), 'Topic_Keywords'
    (comma-separated words of the dominant topic) and an unnamed column
    holding `texts`.

    Notes
    -----
    The default arguments are evaluated once, when this cell is run, so they
    refer to whatever `optimal_model`, `corpus` and `data` were at that moment.
    Pass the arguments explicitly to avoid stale values.
    """
    missing = float('nan')
    keywords_by_topic = {}
    records = []
    for doc_topics in ldamodel[corpus]:
        topic_probs = doc_topics[0]
        if not topic_probs:
            # Keep one row per document so rows stay aligned with `texts`.
            records.append((missing, missing, missing))
            continue
        # First topic with the highest probability (same pick as a stable
        # descending sort followed by taking element 0).
        topic_num, prop_topic = max(topic_probs, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num))
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one go; appending row by row copies the whole frame
    # on every iteration and DataFrame.append no longer exists in pandas 2.x.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    # Stored as float so that documents without any topic can hold NaN.
    sent_topics_df['Dominant_Topic'] = sent_topics_df['Dominant_Topic'].astype('float64')

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

CPU times: user 10 µs, sys: 1e+03 ns, total: 11 µs
Wall time: 17.9 µs


In [151]:
%%time
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

CPU times: user 37min 43s, sys: 3min 12s, total: 40min 55s
Wall time: 10min 45s


In [156]:
topic_modeling_df = pd.concat([df_master_wcnn, df_dominant_topic[['Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords']]], axis=1)

In [175]:
topic_modeling_df_dropped = topic_modeling_df.drop(df_master_wcnn[df_master_wcnn.clean_text == ''].index)

In [178]:
topic_modeling_df_dropped.to_csv('data/processed/2019_04_17_AllWCNN_WTopics_noemptystr.csv', index=False)

#### Visualize Topics:

In [196]:
cnn_filter = (topic_modeling_df_dropped.source == 'cnn')
fox_filter = (topic_modeling_df_dropped.source == 'fox')
rt_filter = (topic_modeling_df_dropped.source == 'rt')
bbc_filter = (topic_modeling_df_dropped.source == 'bbc')

In [216]:
def _source_histogram(source_filter, label):
    """Histogram trace of the dominant topics of the rows selected by `source_filter`."""
    return go.Histogram(x=topic_modeling_df_dropped[source_filter]['Dominant_Topic'],
                        opacity=0.75, name=label,
                        marker=dict(
                            line=dict(width=.8)))


# One histogram per news source. The unused `x`, `data` and `layout`
# assignments of the earlier version were dropped: they were never passed to
# the figure and overwrote globals used by other cells.
trace1 = _source_histogram(cnn_filter, 'CNN')
trace2 = _source_histogram(fox_filter, 'Fox News')
trace3 = _source_histogram(rt_filter, 'RT')
trace4 = _source_histogram(bbc_filter, 'BBC')

# 2x2 grid: panel k uses xaxis<k>/yaxis<k>, filled row by row.
fig2 = tools.make_subplots(rows=2, cols=2)
axis_settings = {}
for panel, panel_trace in enumerate([trace1, trace2, trace3, trace4], start=1):
    grid_row, grid_col = divmod(panel - 1, 2)
    fig2.append_trace(panel_trace, grid_row + 1, grid_col + 1)

    x_axis = dict(tickmode='linear', ticks='outside', tick0=0, dtick=1,
                  tickfont=dict(size=7))
    y_axis = dict(range=[0, 250], dtick=25, tickfont=dict(size=7))
    if grid_row == 1:
        # Only the bottom row carries the x-axis title.
        x_axis['title'] = 'Topic'
    if grid_col == 0:
        # Only the left column carries the y-axis title.
        y_axis['title'] = 'Count'
    axis_settings['xaxis%d' % panel] = x_axis
    axis_settings['yaxis%d' % panel] = y_axis

fig2['layout'].update(title='Topic Distribution by Source', **axis_settings)

plotly.offline.iplot(fig2, filename='basic histogram')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



In [217]:
pio.write_image(fig2, 'papers/presentation/topic_distribution.pdf')

### Interpret Topics:

In [231]:
#%%time
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word)
vis

In [342]:
# Articles whose dominant topic is 23, strongest topic contribution first.
topic_filter = topic_modeling_df_dropped['Dominant_Topic'] == 23
inspect_cols = ['title', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'clean_text']
lookattopic = (
    topic_modeling_df_dropped
    .loc[topic_filter, inspect_cols]
    .sort_values(by=['Topic_Perc_Contrib'], ascending=False)
)
lookattopic.head(40)

Unnamed: 0,title,Dominant_Topic,Topic_Perc_Contrib,Keywords,clean_text


In [341]:
peak_idx = 2151
print(lookattopic.Keywords[peak_idx])
print(lookattopic.clean_text[peak_idx])

water, find, climate_change, wind, human, scientist, animal, food, storm, fish
More than a hundred insect species that are new to science have been discovered on an Indonesian island. Found in remote rainforests, the tiny beetles appear to have been overlooked for decades. All 103 belong to the same group - weevils. Scientists have named the creatures after Star Wars and Asterix characters, including Yoda, a green shiny beetle, and Obelix, a rather rotund specimen. Others have been named after scientists, including Charles Darwin, and DNA pioneers, Francis Crick and James Watson. The beetles are only a few millimetres in length. Only a single member of their insect group has been found before on Sulawesi - as long ago as 1885. The island, known for its exotic wildlife, including birds and monkeys, is covered by lowland rainforests, although much of this has been cleared. The researchers say there may be more of the beetles out there. "Our survey is not yet complete and possibly we have

In [343]:
# Hand-assigned, human-readable label for each topic id (0-22) of the
# 23-topic model. These strings are mapped onto the data and written to disk,
# so spelling matters ('Space Industy' corrected to 'Space Industry').
topic_translation = {0: 'Unknown',
                     1: 'Sports',
                     2: 'Political Investigation (Cohen/Virginia)',
                     3: 'Military Nuclear Technology',
                     4: 'Film Industry',
                     5: 'Cybersecurity',
                     6: 'Entertainment Industry',
                     7: 'Health',
                     8: 'Brexit',
                     9: 'Space Industry',
                     10: 'US Campaign Politics',
                     11: 'Medical Research',
                     12: 'Economy',
                     13: 'Emergencies',
                     14: 'Religious Intolerance',
                     15: 'Family Affairs',
                     16: 'International Affairs (Middle East and South Asia)',
                     17: 'International Affairs (Latin America)',
                     18: 'Corporate News',
                     19: 'Fashion/Issues of Race',
                     20: 'Crime',
                     21: 'Social Media',
                     22: 'Nature and Environment'}

In [345]:
%%time

# Save Topic Dictionary:
topic_dict_path = 'models/2019_04_16_Topic_Dictionary(23_topics)'
with open(topic_dict_path, 'wb') as fp:   #Pickling
    pickle.dump(topic_translation, fp)

CPU times: user 260 µs, sys: 1.27 ms, total: 1.53 ms
Wall time: 2.57 ms


In [350]:
%%time
topic_modeling_df_dropped['Topic'] = topic_modeling_df_dropped['Dominant_Topic'].map(topic_translation)

CPU times: user 4.25 ms, sys: 770 µs, total: 5.02 ms
Wall time: 4.49 ms


In [354]:
# Save results:
topic_modeling_df_dropped.to_csv('data/processed/2019_04_17_AllWCNN_WTopics_noemptystr_inttopics.csv', index=False)

In [356]:
# TODO: Histogram of topic counts by source

### Vectorization:

In [159]:
%%time
# https://stackoverflow.com/questions/42094180/spacy-how-to-load-google-news-word2vec-vectors
import gensim
import spacy

# Path to google news vectors
google_news_path = "data/word_embeddings/GoogleNews-vectors-negative300.bin.gz"

# Load google news vecs in gensim
# NOTE: this rebinds the global `model` (previously used as a loop variable for LDA models).
model = gensim.models.KeyedVectors.load_word2vec_format(google_news_path, binary=True)

# Init blank english spacy nlp object
# NOTE: this replaces the `nlp` pipeline that was used for lemmatization earlier.
nlp = spacy.blank('en')

# Words in index order, i.e. in the order of the rows of the embedding matrix.
# Taken from the model itself instead of assuming exactly 3,000,000 entries.
keys = list(model.index2word)

# Set the vectors for our nlp object to the google news vectors
nlp.vocab.vectors = spacy.vocab.Vectors(data=model.syn0, keys=keys)

CPU times: user 2min 57s, sys: 17.9 s, total: 3min 15s
Wall time: 3min 19s


In [355]:
%%time
# Vectorize texts: one spaCy document vector per row of `clean_text`, stacked into
# a 2-D array whose rows follow the order of the frame.
# The vectors are collected first and stacked once. Calling np.vstack inside the
# loop re-copies the whole array for every article, which is quadratic in the
# number of rows. Iterating the full column also avoids mixing a label lookup
# (`[0]`) with a positional slice (`[1:]`).
vectorized_texts = np.vstack(
    [nlp(text).vector for text in topic_modeling_df_dropped['clean_text']]
)

CPU times: user 25min 49s, sys: 11min 58s, total: 37min 48s
Wall time: 12min 40s


In [365]:
%%time
# Calculate cosine similarity matrix (n_jobs = -1 means use all CPU cores).
# pairwise_distances(metric='cosine') returns the cosine *distance*, i.e.
# 1 - cosine similarity, so the result is flipped to hold the similarity that the
# variable name (and the 'cos_similarity' column built from it) promises.
# The flip is done in place to avoid allocating a second n x n matrix.
cos_sim_mat = pairwise_distances(vectorized_texts, metric='cosine', n_jobs=-1)
cos_sim_mat *= -1
cos_sim_mat += 1

CPU times: user 4.38 s, sys: 1min 46s, total: 1min 51s
Wall time: 1min 59s


In [369]:
# Shape of the pairwise matrix computed from `vectorized_texts` (rows, columns).
cos_sim_mat.shape

(15500, 15500)

In [370]:
def make_similarity_df(cnn_idx_arr, target_source_idx_arr, target_article_category, topic, cos_sim_mat):
    """
    Build a long-format table of pairwise scores between two sets of articles.

    One row is produced for every (cnn_idx, target_idx) pair, with `cnn_idx_arr`
    as the outer loop and `target_source_idx_arr` as the inner loop.

    Parameters
    ----------
    cnn_idx_arr : iterable of int
        First-axis indices into `cos_sim_mat`.
    target_source_idx_arr : iterable of int
        Second-axis indices into `cos_sim_mat`.
    target_article_category : object
        Stored unchanged in the 'target_article_category' column of every row.
    topic : object
        Stored unchanged in the 'topic' column of every row.
    cos_sim_mat : 2-D array
        Matrix indexed as ``cos_sim_mat[cnn_idx, target_idx]``.

    Returns
    -------
    pandas.DataFrame
        Columns 'cnn_article', 'target_article', 'target_article_category',
        'topic' and 'cos_similarity'. The frame is empty (but keeps these
        columns) when either index collection is empty.
    """
    columns = ['cnn_article', 'target_article',
               'target_article_category', 'topic', 'cos_similarity']
    records = [
        {'cnn_article': cnn_idx,
         'target_article': target_idx,
         'target_article_category': target_article_category,
         'topic': topic,
         'cos_similarity': cos_sim_mat[cnn_idx, target_idx]}
        for cnn_idx in cnn_idx_arr
        for target_idx in target_source_idx_arr
    ]
    # Build the frame once from all records: appending row by row copies the
    # whole frame for every pair, and DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=columns)

In [381]:
# Source used as the reference point, plus the distinct topic labels and sources
# that occur in the frame.
baseline_source = 'cnn'
sources = topic_modeling_df_dropped['source'].unique()
topics = topic_modeling_df_dropped['Topic'].unique()

In [379]:
# Empty frame that only fixes the column layout of the bias table.
cos_bias_df = pd.DataFrame(
    columns=[
        'Topic',
        'Baseline Source',
        'Baseline Article Index',
        'Target Source',
        'Target Source Article Index',
        'Bias',
    ]
)

In [391]:
# Work in progress: for every topic id, collect the row labels of the articles
# whose dominant topic is that id.
# NOTE(review): this cell does not parse as saved -- the `idx_baseline` and
# `other_sources_idxs` assignments below have no right-hand side. The printed
# output presumably comes from an earlier revision of the cell; fill in or remove
# those two lines before re-running.
# NOTE(review): `all_topic_matrices` is created here but never appended to in
# this cell.
all_topic_matrices = []
# Topic ids 0..22.
all_topic_indx = np.arange(0, 23)
print(all_topic_indx)
for topic_idx in all_topic_indx:
    # all_topic_indx[topic_idx] equals topic_idx because the array is arange(0, 23).
    topic_filter = (topic_modeling_df_dropped.Dominant_Topic == all_topic_indx[topic_idx])
    idx_thistopic = topic_modeling_df_dropped[topic_filter].index.values
    
    # This mask does not depend on topic_idx, so every iteration rebuilds the same value.
    baseline_filter = (topic_modeling_df_dropped.source == baseline_source)
    # FIXME: missing right-hand side (SyntaxError).
    idx_baseline = 
    
    for current_topic_idx in idx_thistopic:
        # FIXME: missing right-hand side (SyntaxError).
        other_sources_idxs = 
        # Prints one line per article in the current topic.
        print(current_topic_idx)


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
22
27
28
30
32
33
35
36
37
41
42
44
46
47
49
52
53
54
55
57
61
62
64
65
66
67
76
78
84
86
89
90
91
92
93
94
95
97
98
109
110
112
113
114
116
119
129
131
146
154
156
157
163
165
167
170
178
182
193
196
202
204
210
211
213
214
217
218
226
230
251
252
255
256
257
260
265
269
273
277
278
279
281
289
291
296
301
304
312
316
318
320
321
327
329
330
332
346
347
362
366
371
374
385
390
391
392
393
396
399
400
401
402
404
407
411
412
413
414
415
417
419
420
422
423
426
431
432
434
438
440
441
443
449
450
452
453
454
455
457
460
461
462
468
473
476
477
478
480
483
496
498
499
500
513
518
525
528
530
531
532
533
535
536
538
539
541
543
544
547
548
549
551
555
556
558
560
561
562
563
565
609
611
612
616
619
620
624
625
626
629
630
634
639
645
646
648
652
656
660
669
671
673
674
675
686
689
691
711
713
715
716
717
719
724
725
727
729
740
741
743
744
745
746
748
751
752
753
756
757
758
761
765
767
769
772
773
774
777
778
784
792


11668
11672
11676
11678
11681
11682
11683
11684
11686
11688
11689
11695
11696
11697
11699
11705
11707
11716
11723
11728
11729
11731
11732
11733
11734
11735
11736
11737
11738
11739
11741
11742
11743
11747
11748
11749
11750
11752
11754
11755
11759
11762
11764
11776
11778
11779
11781
11783
11785
11786
11787
11788
11789
11791
11792
11793
11794
11796
11799
11803
11804
11805
11806
11808
11811
11812
11814
11815
11816
11826
11836
11842
11845
11849
11851
11852
11853
11856
11857
11873
11877
11882
11885
11887
11889
11890
11891
11893
11897
11898
11899
11903
11904
11905
11909
11910
11913
11914
11915
11916
11918
11919
11920
11922
11931
11934
11940
11946
11948
11951
11955
11958
11959
11961
11962
11963
11964
11965
11967
11973
11976
11981
11982
11983
11984
11986
11987
11988
11990
11991
11994
11999
12000
12001
12005
12007
12008
12012
12017
12019
12020
12023
12024
12027
12030
12042
12052
12057
12062
12063
12065
12076
12077
12078
12086
12087
12088
12089
12096
12097
12100
12103
12108
12110
12111
12112
1211

2010
2011
2129
2132
2133
2134
2137
2215
2291
2292
2294
2295
2319
2387
2388
2389
2391
2407
2411
2458
2471
2472
2473
2475
2529
2535
2538
2539
2540
2541
2542
2548
2596
2597
2599
2600
2601
2617
2629
2693
2695
2696
2769
2782
2785
2787
2855
2861
2864
2865
2891
2958
2963
2985
3035
3039
3062
3093
3095
3098
3105
3107
3111
3151
3174
3180
3181
3182
3183
3223
3234
3235
3236
3237
3238
3239
3278
3279
3280
3281
3283
3284
3350
3353
3415
3419
3420
3421
3422
3423
3486
3487
3491
3556
3558
3559
3562
3568
3596
3612
3614
3615
3625
3701
3702
3705
3706
3708
3709
3711
3712
3716
3718
3720
3722
3793
3805
3825
3827
3828
3832
3835
3840
3842
3851
3920
3928
3960
3963
3973
3974
3976
3977
4021
4022
4029
4056
4057
4058
4139
4144
4244
4248
4249
4347
4351
4454
4456
4458
4459
4561
4563
4566
4567
4570
4572
4683
4688
4689
4693
4695
4803
4822
4832
4892
4897
4920
4921
4922
5021
5022
5023
5024
5025
5029
5031
5126
5127
5128
5131
5133
5134
5135
5229
5230
5336
5337
5341
5342
5344
5347
5349
5350
5442
5443
5534
5540
5541
5637
5638


8167
8180
8181
8182
8186
8193
8196
8199
8200
8206
8210
8216
8220
8249
8261
8291
8293
8301
8305
8327
8328
8329
8369
8381
8391
8395
8401
8404
8430
8440
8443
8444
8447
8448
8456
8470
8471
8502
8506
8508
8513
8516
8520
8533
8539
8540
8561
8573
8594
8596
8606
8612
8616
8625
8631
8660
8663
8701
8703
8729
8750
8771
8785
8800
8804
8810
8818
8822
8838
8844
8848
8854
8862
8869
8873
8876
8878
8889
8893
8899
8916
8931
8938
8956
8966
8979
8983
8988
9006
9009
9011
9015
9017
9029
9034
9041
9070
9073
9076
9088
9094
9098
9102
9104
9107
9130
9138
9139
9147
9151
9157
9177
9182
9189
9190
9191
9198
9200
9205
9207
9210
9231
9234
9236
9238
9242
9251
9260
9268
9278
9289
9297
9300
9301
9309
9310
9312
9335
9340
9348
9354
9357
9368
9376
9383
9399
9404
9407
9408
9428
9435
9441
9458
9460
9461
9476
9491
9503
9507
9511
9518
9538
9548
9555
9565
9588
9591
9598
9602
9614
9622
9636
9641
9644
9663
9685
9702
9708
9717
9729
9733
9736
9737
9746
9750
9757
9760
9773
9782
9785
9786
9792
9793
9799
9805
9812
9814
9820
9829
9873


12254
12332
12351
12389
12409
12423
12462
12463
12464
12539
12565
12639
12647
12668
12681
12683
12686
12687
12798
12901
12908
12998
12999
13048
13149
13157
13167
13178
13214
13277
13329
13381
13457
13464
13466
13467
13468
13505
13528
13544
13554
13561
13641
13645
13663
13714
13740
13767
13843
13875
13877
13880
13966
14005
14022
14027
14029
14042
14066
14139
14145
14235
14236
14316
14332
14333
14400
14413
14446
14448
14450
14453
14522
14533
14621
14632
14633
14634
14637
14639
14641
14650
14739
14741
14743
14744
14747
14765
14813
14830
14836
14846
14859
14939
14944
14977
14996
15072
15121
15133
15170
15182
15184
15251
15328
15339
15375
15394
15418
15420
15433
15435
58
192
307
474
537
824
867
1449
1482
1483
1901
1943
2089
2871
2919
3008
3128
3462
3951
4108
4154
4355
4617
4733
4833
4836
5140
5224
5489
5585
6122
6190
6461
6660
6888
7089
9004
10011
11720
11721
11974
12102
12186
12352
12505
12743
13371
13469
13558
13624
13678
14025
14578
14636
14903
14907
14948
14978
15181
15253
73
79
268
275

386
387
388
394
442
445
451
459
514
520
522
524
526
527
529
571
592
594
599
600
604
607
608
610
649
653
661
680
695
698
699
700
701
702
703
705
708
709
720
730
770
800
803
807
808
809
812
862
891
900
903
906
907
912
973
997
1006
1007
1009
1013
1018
1019
1123
1124
1127
1129
1158
1204
1206
1258
1264
1265
1300
1316
1317
1321
1368
1377
1380
1381
1382
1383
1384
1385
1386
1388
1460
1466
1468
1470
1472
1473
1510
1537
1538
1539
1541
1548
1550
1552
1600
1605
1644
1648
1649
1707
1708
1709
1711
1714
1715
1716
1724
1793
1797
1798
1803
1804
1815
1890
1893
1896
1897
1900
1902
1904
1962
1963
1964
1967
1968
1970
1971
1972
1973
2021
2044
2048
2053
2055
2056
2057
2058
2059
2072
2073
2074
2075
2127
2143
2148
2153
2158
2159
2163
2207
2208
2223
2224
2227
2231
2232
2239
2241
2243
2244
2287
2299
2302
2305
2335
2337
2338
2341
2343
2344
2346
2347
2348
2349
2351
2352
2355
2432
2434
2435
2511
2527
2557
2559
2567
2570
2584
2635
2638
2644
2646
2647
2652
2654
2702
2734
2739
2744
2746
2747
2779
2789
2803
2818
2819
2

6639
6641
6645
6690
6695
6698
6699
6702
6737
6751
6805
6845
6924
6960
6990
7004
7006
7007
7010
7075
7180
7187
7258
7262
7264
7329
7335
7341
7374
7397
7410
7427
7432
7436
7522
7527
7528
7529
7535
7538
7578
7638
7674
7722
7728
7729
7741
7808
7817
7820
7827
7835
7852
7853
7885
7908
7920
7934
7978
8041
8114
8121
8137
8147
8160
8208
8213
8215
8234
8237
8241
8242
8246
8248
8324
8363
8389
8410
8423
8424
8425
8433
8436
8451
8461
8464
8491
8494
8496
8501
8512
8531
8553
8570
8571
8577
8582
8583
8590
8592
8593
8595
8618
8619
8635
8639
8649
8652
8653
8664
8667
8702
8731
8740
8748
8763
8765
8776
8787
8792
8796
8819
8825
8831
8835
8841
8858
8860
8879
8880
8896
8902
8906
8913
8920
8925
8934
8936
8939
8948
8958
8972
8974
8981
8989
8993
9032
9035
9036
9037
9039
9060
9063
9071
9079
9084
9085
9095
9109
9117
9144
9153
9155
9158
9165
9173
9212
9220
9229
9230
9239
9245
9254
9258
9288
9307
9318
9323
9325
9329
9343
9361
9386
9391
9393
9400
9421
9422
9425
9433
9438
9443
9448
9450
9454
9480
9486
9487
9493
9499


1269
1458
1464
1471
1540
1647
1657
1799
1889
1895
1899
2054
2071
2222
2235
2345
2350
2429
2512
2564
2566
2634
2639
2650
2740
2792
2897
2898
2900
2911
3117
3119
3304
3306
3397
3399
3455
3517
3530
3652
3656
3779
3955
4060
4150
4202
4212
4320
4420
4608
4808
5201
5339
5372
5499
5592
5600
5740
5888
6023
6024
6029
6138
6334
6512
6680
6823
6945
6953
7024
7361
7371
7503
7667
7670
7790
7837
8308
8478
8699
9119
9311
9474
9506
9646
9754
9822
9893
10413
10429
10462
11414
11658
11921
11943
12067
12133
12160
12161
12353
12410
12764
12770
12860
12888
12989
13024
13104
13282
13292
13427
13442
13508
13521
13529
13627
13734
13868
14461
14465
14552
14662
14677
14755
14756
14763
14942
14976
14980
14989
15106
15260
15336
15338
15417
15486
59
266
349
361
572
672
902
925
1389
1395
1586
1741
1762
2493
2618
3133
3148
3289
3443
3514
3698
3871
4111
4219
4341
4446
4889
5086
5097
5275
5299
5424
5880
5944
6129
6396
7110
7525
7594
7695
7698
7876
8263
8692
8764
9132
9204
9851
10097
11099
11420
11515
11670
12006
12084

In [390]:
# Row labels of the articles whose dominant topic equals the entry at position 21
# of `all_topic_indx`.
is_topic_21 = topic_modeling_df_dropped['Dominant_Topic'] == all_topic_indx[21]
topic_modeling_df_dropped[is_topic_21].index.values

array([   72,    82,   174,   267,   334,   491,   545,   728,   766,
         982,  1459,  1546,  1719,  1860,  1861,  2016,  2424,  2488,
        2649,  2840,  2972,  3254,  3980,  4039,  4063,  4148,  4837,
        5354,  5451,  5545,  5839,  6015,  6287,  6288,  6565,  6893,
        6932,  7151,  7312,  7330,  7607,  7943,  8132,  8169,  8323,
        8396,  8488,  8598,  8624,  8865,  8977,  9163,  9525,  9527,
        9584,  9876,  9960, 10086, 10089, 10326, 10353, 10397, 10636,
       10760, 10844, 10882, 10988, 11115, 11165, 11257, 11344, 11652,
       11923, 12246, 12298, 12358, 12359, 12508, 13279, 13737, 13794,
       13867, 13970, 14146, 14208, 14238, 14239, 14269, 14284, 14372,
       14745, 14835, 15183])

In [382]:
# Number of articles published by the baseline source.
int((topic_modeling_df_dropped['source'] == baseline_source).sum())

3897

In [383]:
# Number of articles published by any source other than the baseline.
int((topic_modeling_df_dropped['source'] != baseline_source).sum())

11603

In [384]:
# Non-baseline article count times baseline article count (the outputs of the two
# cells above): the number of baseline/non-baseline article pairs.
11603 * 3897

45216891

In [385]:
# Shape of the pairwise matrix (rows, columns).
cos_sim_mat.shape

(15500, 15500)

In [386]:
# Total number of entries in the (15500, 15500) pairwise matrix.
15500 ** 2

240250000