In [1]:
%load_ext autoreload
%autoreload 2

import unicodecsv as csv
import datetime
import feedparser as fp
import newspaper
import helpers
from newspaper import Source, Article
import multiprocessing 
import pandas as pd
import numpy as np
import dask as da
from sklearn.metrics import pairwise_distances
import multiprocessing
from multiprocessing import cpu_count

In [2]:
%%time
# Load GoogleNews word2vec-trained word embeddings:
# (https://stackoverflow.com/questions/42094180/spacy-how-to-load-google-news-word2vec-vectors)
import gensim
import spacy

# Path to google news vectors
google_news_path = "data/word_embeddings/GoogleNews-vectors-negative300.bin.gz"

# Load google news vecs in gensim
model = gensim.models.KeyedVectors.load_word2vec_format(google_news_path, binary=True)

# Init blank english spacy nlp object
nlp = spacy.blank('en')

# Loop through range of all indexes, get words associated with each index.
# The words in the keys list will correspond to the order of the google embed matrix
keys = []
for idx in range(3000000):
    keys.append(model.index2word[idx])

# Set the vectors for our nlp object to the google news vectors
nlp.vocab.vectors = spacy.vocab.Vectors(data=model.syn0, keys=keys)

CPU times: user 3min 17s, sys: 11 s, total: 3min 28s
Wall time: 3min 32s




In [3]:
%%time
topic_modeling_df = pd.read_csv('data/processed/2019_02_08_with_topics.csv', index_col=0)

CPU times: user 22.8 s, sys: 13.5 s, total: 36.2 s
Wall time: 3min 26s


In [390]:
print(max(pd.to_datetime(topic_modeling_df.publish_date)))
print(min(pd.to_datetime(topic_modeling_df.publish_date)))

2019-04-02 00:00:00
2012-02-24 18:57:14


In [10]:
doc_1 = nlp(topic_modeling_df['clean_text'][0])
doc_2 = nlp(topic_modeling_df['clean_text'][1])
doc_3 = nlp(topic_modeling_df['clean_text'][2])

In [122]:
doc_1.similarity(doc_3)

0.9233324746108649

In [40]:
test_mat = np.stack([doc_1.vector, doc_2.vector, doc_3.vector])

In [87]:
test_mat

array([[ 1.23407552e-02,  3.27097550e-02,  1.53541816e-02,
         5.97806275e-02, -5.11604510e-02, -1.46879405e-02,
         1.25913471e-02, -6.19061217e-02,  7.19979778e-02,
         5.98698668e-02, -3.60981375e-02, -5.48369773e-02,
        -6.62904978e-03,  1.36261294e-02, -7.50808492e-02,
         4.07561064e-02,  2.83969548e-02,  4.98225875e-02,
        -1.74383912e-03, -3.41415629e-02, -3.01887859e-02,
         2.03090087e-02,  7.46628176e-03, -1.64851211e-02,
         4.73413877e-02,  5.49676595e-03, -6.09366782e-02,
         3.77761498e-02,  1.04446271e-02,  1.77859142e-02,
         6.75854599e-03, -2.32983641e-02, -2.88163163e-02,
        -1.48553820e-02,  2.41592359e-02, -8.83720070e-03,
         3.13260267e-03, -1.32394535e-02,  2.46355273e-02,
         3.89034636e-02,  7.02875480e-02, -2.11812146e-02,
         4.15381864e-02, -1.82093948e-03, -2.95656361e-02,
        -3.35123017e-02, -2.45068539e-02,  3.28406021e-02,
        -2.01430381e-03,  1.49084665e-02,  2.47009136e-0

In [5]:
%%time
# Vectorize texts:
vectorized_texts = nlp(topic_modeling_df['clean_text'][0]).vector
idx = 1
for text in topic_modeling_df[1:]['clean_text']:
    vector = nlp(text).vector
    vectorized_texts = np.vstack([vectorized_texts, vector])

CPU times: user 4min, sys: 21.5 s, total: 4min 21s
Wall time: 2min 19s


In [6]:
vectorized_texts.shape

(3327, 300)

In [7]:
%%time
# Calculate cosine similarity matrix (n_jobs = -1 means use all CPU cores):
cos_sim_mat = (1 - pairwise_distances(vectorized_texts, metric='cosine', n_jobs = -1))

CPU times: user 1.08 s, sys: 2.31 s, total: 3.39 s
Wall time: 8 s


In [8]:
# Check shape:
cos_sim_mat.shape

(3327, 3327)

In [9]:
# Save the similarity matrix, just in case:
np.save('data/processed/2019_03_20_cosine_similarity_matrix', cos_sim_mat)

### Calculate averages of distances, conditional on sources and topics:

In [10]:
rt_df = topic_modeling_df[topic_modeling_df['source'] == 'rt']
cnn_df = topic_modeling_df[topic_modeling_df['source'] == 'cnn']
fox_df = topic_modeling_df[topic_modeling_df['source'] == 'fox']
bbc_df = topic_modeling_df[topic_modeling_df['source'] == 'bbc']

In [15]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
import plotly
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode
init_notebook_mode(connected=True)

x = np.random.randn(500)
trace1 = go.Histogram(x=cnn_df['Dominant_Topic'], opacity=0.75, name='CNN')
trace2 = go.Histogram(x=fox_df['Dominant_Topic'], opacity=0.75, name='Fox News')
trace3 = go.Histogram(x=rt_df['Dominant_Topic'], opacity=0.75, name='RT')
trace4 = go.Histogram(x=bbc_df['Dominant_Topic'], opacity=0.75, name='BBC')

data = [trace1, trace2, trace3, trace4]
layout = go.Layout(
    xaxis=dict(
        tickmode='linear',
        ticks='outside',
        tick0=0,
        dtick=1
    ),
    title='Topic Distributions by Source'
)
#fig = go.Figure(layout=layout)
fig = tools.make_subplots(rows=2, cols=2)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig.append_trace(trace3, 2, 1)
fig.append_trace(trace4, 2, 2)
fig['layout'].update(title='Topic Distribution by Source',
                     xaxis1=dict(
                        tickmode='linear',
                        ticks='outside',
                        tick0=0,
                        dtick=1,
                         tickfont=dict(
            size=7
        )
                     ), 
                    yaxis1=dict(range=[0, 250], title='Count', 
                  dtick=25, tickfont=dict(
            size=7)),
         xaxis2=dict(
        tickmode='linear',
        ticks='outside',
        tick0=0,
        dtick=1,
                         tickfont=dict(
            size=7
        )
    ),yaxis2=dict(range=[0, 250], 
                  dtick=25, tickfont=dict(
            size=7)),
         xaxis3=dict(
        tickmode='linear',
        ticks='outside',
        tick0=0,
        dtick=1,title='Topic',
                         tickfont=dict(
            size=7
        )
    ),yaxis3=dict(range=[0, 250], title='Count', 
                  dtick=25, tickfont=dict(
            size=7)),
                     xaxis4=dict(
        tickmode='linear',
        ticks='outside',
        tick0=0,
        dtick=1,
         title='Topic',
                         tickfont=dict(
            size=7
        )
    ),yaxis4=dict(range=[0, 250], 
                  dtick=25, tickfont=dict(
            size=7)))

plotly.offline.iplot(fig, filename='basic histogram')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



In [16]:
topic_modeling_df.groupby(['Dominant_Topic']).count()

Unnamed: 0_level_0,source_url,url,title,movies,text,keywords,meta_keywords,tags,authors,publish_date,...,rss_published_parsed,rss_feedburner_origlink,paper_section_name,source,source_detail,pull_type,pull_date,clean_text,Topic_Perc_Contrib,Keywords
Dominant_Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,51,51,51,51,51,51,51,51,51,50,...,51,51,51,51,51,51,51,51,51,51
1.0,170,170,170,170,170,170,170,170,170,170,...,170,170,170,170,170,170,170,170,170,170
2.0,92,92,92,92,92,92,92,92,92,87,...,92,92,92,92,92,92,92,92,92,92
3.0,147,147,147,147,147,147,147,147,147,146,...,147,147,147,147,147,147,147,147,147,147
4.0,58,58,58,58,58,58,58,58,58,54,...,58,58,58,58,58,58,58,58,58,58
5.0,106,106,106,106,106,106,106,106,106,103,...,106,106,106,106,106,106,106,106,106,106
6.0,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
7.0,339,339,339,339,339,339,339,339,339,335,...,339,339,339,339,339,339,339,339,339,339
8.0,133,133,133,133,133,133,133,133,133,131,...,133,133,133,133,133,133,133,133,133,133
9.0,52,52,52,52,52,52,52,52,52,52,...,52,52,52,52,52,52,52,52,52,52


In [86]:
topic_0_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 0]  # EU
topic_1_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 1]  # Venezuela politics
topic_2_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 2]  # Social Media
topic_3_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 3]  # Russia/China/Middle East
topic_4_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 4]  # Film
topic_5_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 5]  # Science
topic_6_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 6]  # Academia
topic_7_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 7]  # Crime # Try this
topic_8_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 8]  # IT Companies # Try this
topic_9_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 9]  # Migration/Religion ?
topic_10_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 10]  # Market News
topic_11_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 11]  # Sports
topic_12_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 12]  #Virginia Scandal
topic_13_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 13]  #?
topic_14_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 14]  #?
topic_15_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 15]  # Healthcare
topic_17_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 17]  # US Politics # Try this
topic_18_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 18]  # ?
topic_19_df = topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 19]  # Sports & Entertainment

In [84]:
for item in topic_modeling_df[topic_modeling_df['Dominant_Topic'] == 19]['title'].head(10):
    print(item)

David Bowie: Rare Ziggy Stardust footage hailed as the 'holy grail'
Senator Rand Paul wins damages after neighbour attack
TV producer fired calling NFL's Tom Brady a 'known cheater'
Maroon 5 letting Super Bowl show speak for itself
Gwyneth Paltrow's rep disputes ski-crash suit
13 great places to celebrate Lunar New Year
Roger Goodell: NFL Teams Would Sign Colin Kaepernick If He Could Help Them Win
NFL on blown Saints call: 'Our officials are human'
Banana Island: Luxury resort off Doha coast of Qatar
Hamilton Sevens: Classy Fiji condemns USA to third straight final defeat


In [85]:
(topic_19_df.groupby('source').count())

Unnamed: 0_level_0,source_url,url,title,movies,text,keywords,meta_keywords,tags,authors,publish_date,...,rss_published_parsed,rss_feedburner_origlink,paper_section_name,source_detail,pull_type,pull_date,clean_text,Dominant_Topic,Topic_Perc_Contrib,Keywords
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bbc,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
cnn,52,52,52,52,52,52,52,52,52,51,...,52,52,52,52,52,52,52,52,52,52
fox,82,82,82,82,82,82,82,82,82,75,...,82,82,82,82,82,82,82,82,82,82
rt,25,25,25,25,25,25,25,25,25,25,...,25,25,25,25,25,25,25,25,25,25


In [17]:
# Load the LDA model:
optimal_model = gensim.models.LdaModel.load('models/2019_02_08_optimal_model.model')

In [29]:
optimal_model.print_topics()

[(0,
  '0.023*"france" + 0.018*"french" + 0.017*"plane" + 0.015*"aircraft" + 0.014*"macron" + 0.012*"flight" + 0.011*"italy" + 0.011*"soldier" + 0.011*"german" + 0.011*"germany"'),
 (1,
  '0.024*"country" + 0.018*"government" + 0.016*"venezuela" + 0.013*"president" + 0.008*"maduro" + 0.008*"support" + 0.008*"leader" + 0.008*"military" + 0.007*"state" + 0.006*"people"'),
 (2,
  '0.019*"post" + 0.012*"show" + 0.011*"write" + 0.011*"twitter" + 0.011*"tweet" + 0.010*"photo" + 0.009*"social_media" + 0.009*"picture" + 0.008*"claim" + 0.008*"appear"'),
 (3,
  '0.027*"russia" + 0.018*"russian" + 0.014*"chinese" + 0.014*"china" + 0.013*"syria" + 0.013*"military" + 0.010*"country" + 0.009*"official" + 0.008*"iran" + 0.008*"moscow"'),
 (4,
  '0.030*"film" + 0.015*"star" + 0.014*"show" + 0.007*"actor" + 0.007*"award" + 0.007*"movie" + 0.006*"viewer" + 0.006*"character" + 0.006*"director" + 0.006*"gold"'),
 (5,
  '0.018*"study" + 0.013*"year" + 0.010*"find" + 0.009*"researcher" + 0.008*"increase" +

In [91]:
print(topic_modeling_df.clean_text[150])

Britain's new research agency has announced it will spend £279m on new non-EU international collaborations. The bulk of the money will go towards projects with scientists in developing countries to support humanitarian efforts. But critics say that political priorities may be being funded at the potential cost of "blue skies" study. The type of funding marks a new approach to increasing support for basic research that benefits society. The chief executive of the UK Research and Innovation (UKRI) agency, which funds the science, Prof Sir Mark Walport, said: "From tackling climate change to preventing and treating infectious diseases, the search for knowledge is a global endeavour that requires collaboration between the world's best minds." But James Wilsdon, who is professor of research policy at the University of Sheffield, described the announcements as "a touch disingenuous". While supportive of the overall goals of UKRI's international funding, he said: "Let's call a spade a spade. 

### Statistical Analysis


In [46]:
cnn_topic7_idx = topic_7_df[topic_7_df['source'] == 'cnn'].index.values
fox_topic7_idx = topic_7_df[topic_7_df['source'] == 'fox'].index.values
rt_topic7_idx = topic_7_df[topic_7_df['source'] == 'rt'].index.values

In [47]:
print(cnn_topic7_idx.shape)
print(fox_topic7_idx.shape)

(93,)
(120,)


In [294]:
def make_similarity_df(cnn_idx_arr, target_source_idx_arr, target_article_category, topic, cos_sim_mat):
    """
    """
    similarity_df = pd.DataFrame(columns=['cnn_article', 'target_article', 
                                          'target_article_category', 'topic', 'cos_similarity'])
    for cnn_idx in cnn_idx_arr:
        for target_idx in target_source_idx_arr:
            row = {'cnn_article': cnn_idx,
                   'target_article': target_idx,
                   'target_article_category': target_article_category,
                   'topic': topic,
                   'cos_similarity': cos_sim_mat[cnn_idx, target_idx]}
            similarity_df = similarity_df.append(row, ignore_index=True)
    return similarity_df

In [303]:
%%time
topic7_similarity_cnn_fox_df = make_similarity_df(cnn_idx_arr=cnn_topic7_idx, 
                                                  target_source_idx_arr=fox_topic7_idx, 
                                                  target_article_category='fox',
                                                  topic=7, 
                                                  cos_sim_mat=cos_sim_mat)
topic7_similarity_cnn_rt_df = make_similarity_df(cnn_idx_arr=cnn_topic7_idx, 
                                                  target_source_idx_arr=rt_topic7_idx, 
                                                  target_article_category='rt',
                                                  topic=7, 
                                                  cos_sim_mat=cos_sim_mat)

CPU times: user 1min 36s, sys: 259 ms, total: 1min 36s
Wall time: 1min 36s


In [309]:
master_df = topic7_similarity_cnn_fox_df.append(topic7_similarity_cnn_rt_df, ignore_index=True)

In [536]:
cnn_topic17_idx = topic_17_df[topic_17_df['source'] == 'cnn'].index.values
fox_topic17_idx = topic_17_df[topic_17_df['source'] == 'fox'].index.values
rt_topic17_idx = topic_17_df[topic_17_df['source'] == 'rt'].index.values

In [540]:
%%time
topic17_similarity_cnn_fox_df = make_similarity_df(cnn_idx_arr=cnn_topic17_idx, 
                                                  target_source_idx_arr=fox_topic17_idx, 
                                                  target_article_category='fox',
                                                  topic=17, 
                                                  cos_sim_mat=cos_sim_mat)
topic17_similarity_cnn_rt_df = make_similarity_df(cnn_idx_arr=cnn_topic17_idx, 
                                                  target_source_idx_arr=rt_topic17_idx, 
                                                  target_article_category='rt',
                                                  topic=17, 
                                                  cos_sim_mat=cos_sim_mat)

CPU times: user 47.5 s, sys: 222 ms, total: 47.7 s
Wall time: 47.8 s


In [544]:
master_df = topic7_similarity_cnn_fox_df
master_df = master_df.append(topic7_similarity_cnn_rt_df, ignore_index=True)
master_df = master_df.append(topic17_similarity_cnn_fox_df, ignore_index=True)
master_df = master_df.append(topic17_similarity_cnn_rt_df, ignore_index=True)

In [545]:
master_df = master_df.append(topic17_similarity_cnn_rt_df, ignore_index=True)

In [554]:
master_df.groupby(['topic', 'target_article_category']).describe()#.to_csv('topic7_17_cnn_rt_fox_sumstats.csv', index=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity,cos_similarity
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
topic,target_article_category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
7,fox,11160.0,0.829557,0.053827,0.546681,0.799794,0.835679,0.867431,0.983914
7,rt,6789.0,0.839205,0.048338,0.622709,0.811862,0.844044,0.872818,0.970934
17,fox,3848.0,0.883838,0.053208,0.616143,0.859033,0.89344,0.919283,0.986398
17,rt,11856.0,0.880365,0.048125,0.603045,0.857095,0.889097,0.914101,0.9775


In [568]:
import plotly.io as pio
fox_df = master_df[master_df['target_article_category'] == 'fox']
rt_df = master_df[master_df['target_article_category'] == 'rt']
twosample_results_fox = scipy.stats.ttest_ind(fox_df[fox_df['topic']==7].cos_similarity,
                                              fox_df[fox_df['topic']==17].cos_similarity,
                                             equal_var=False)

twosample_results_rt = scipy.stats.ttest_ind(rt_df[rt_df['topic']==7].cos_similarity,
                                              rt_df[rt_df['topic']==17].cos_similarity,
                                             equal_var=False)
fox_mean_dif = 0.883838 - 0.829557
rt_mean_dif = 0.880365 - 0.839205

matrix_twosample = [
    ['', 'Mean Difference', 'Test Statistic', 'p-value'],
    ['CNN vs Fox News', fox_mean_dif, twosample_results_fox[0], twosample_results_fox[1]],
    ['CNN vs RT', rt_mean_dif, twosample_results_rt[0], twosample_results_rt[1]]
]

twosample_table = FF.create_table(matrix_twosample, index=True)
plotly.offline.iplot(twosample_table, filename='twosample-table')
#pio.write_image(plotly.offline.iplot(twosample_table, filename='twosample-table'), 'images/fig1.png')
print(twosample_results_fox)
print(twosample_results_rt)


plotly.tools.FigureFactory.create_table is deprecated. Use plotly.figure_factory.create_table



Ttest_indResult(statistic=-54.40850503534507, pvalue=0.0)
Ttest_indResult(statistic=-56.03708842489034, pvalue=0.0)


In [555]:
twosample_results_fox = scipy.stats.ttest_ind(fox_topic7_15_df[fox_topic7_15_df['topic'] == 7].cos_similarity,
                                              fox_topic7_15_df[fox_topic7_15_df['topic'] == 15].cos_similarity,
                                              equal_var=False)

(0.9675233364105225, 2.802596928649634e-44)



p-value may not be accurate for N > 5000.



In [333]:
sources_to_cat = {'cnn': 0, 'fox': 1, 'rt': 2}

In [338]:
master_df['target_article_category_code'] = pd.Categorical(master_df.target_article_category).codes

In [362]:
fox_topic7_15_df = master_df[master_df['target_article_category'] == 'fox']
fox_topic7_15_df.head()

Unnamed: 0,cnn_article,target_article,target_article_category,topic,cos_similarity,target_article_category_code
0,230,439,fox,7,0.875211,0
1,230,441,fox,7,0.949309,0
2,230,451,fox,7,0.858029,0
3,230,467,fox,7,0.86677,0
4,230,480,fox,7,0.839578,0


In [380]:
import scipy
import plotly.plotly as py
import plotly
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
from scipy.stats import shapiro

plotly.offline.init_notebook_mode(connected=True)
print(shapiro(fox_topic7_15_df[fox_topic7_15_df['topic'] == 7].cos_similarity))

twosample_results_fox = scipy.stats.ttest_ind(fox_topic7_15_df[fox_topic7_15_df['topic'] == 7].cos_similarity,
                                              fox_topic7_15_df[fox_topic7_15_df['topic'] == 15].cos_similarity,
                                              equal_var=False)
matrix_twosample = [
    ['', 'Test Statistic', 'p-value'],
    ['Sample Data', twosample_results_fox[0], twosample_results_fox[1]]
]

twosample_table = FF.create_table(matrix_twosample, index=True)
plotly.offline.iplot(twosample_table, filename='twosample-table')

(0.9675233364105225, 2.802596928649634e-44)



p-value may not be accurate for N > 5000.


plotly.tools.FigureFactory.create_table is deprecated. Use plotly.figure_factory.create_table



In [379]:
twosample_results_fox

Ttest_indResult(statistic=12.752880048403087, pvalue=1.1026747934686911e-33)

In [323]:
import statsmodels.api as sm
#from statsmodels.api import add_constant

In [340]:
X = master_df[['target_article_category_code', 'topic']]
y = master_df['cos_similarity']

In [342]:
X

Unnamed: 0,const,target_article_category_code,topic
0,1.0,0,7
1,1.0,0,7
2,1.0,0,7
3,1.0,0,7
4,1.0,0,7
5,1.0,0,7
6,1.0,0,7
7,1.0,0,7
8,1.0,0,7
9,1.0,0,7


In [344]:
X = sm.add_constant(X)
est = sm.OLS(y, X.astype(float)).fit()

In [346]:
est.summary()

0,1,2,3
Dep. Variable:,cos_similarity,R-squared:,0.021
Model:,OLS,Adj. R-squared:,0.021
Method:,Least Squares,F-statistic:,198.8
Date:,"Mon, 11 Feb 2019",Prob (F-statistic):,3.82e-86
Time:,01:02:01,Log-Likelihood:,28871.0
No. Observations:,18777,AIC:,-57740.0
Df Residuals:,18774,BIC:,-57710.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.8527,0.002,477.103,0.000,0.849,0.856
target_article_category_code,0.0101,0.001,12.806,0.000,0.009,0.012
topic,-0.0033,0.000,-14.398,0.000,-0.004,-0.003

0,1,2,3
Omnibus:,1718.526,Durbin-Watson:,1.372
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2471.063
Skew:,-0.728,Prob(JB):,0.0
Kurtosis:,4.02,Cond. No.,36.3


In [311]:
master_df

Unnamed: 0,cnn_article,target_article,target_article_category,topic,cos_similarity
0,230,439,fox,7,0.875211
1,230,441,fox,7,0.949309
2,230,451,fox,7,0.858029
3,230,467,fox,7,0.866770
4,230,480,fox,7,0.839578
5,230,482,fox,7,0.841156
6,230,484,fox,7,0.819665
7,230,485,fox,7,0.866371
8,230,486,fox,7,0.817849
9,230,492,fox,7,0.860888


In [304]:
topic7_similarity_cnn_fox_df

Unnamed: 0,cnn_article,target_article,target_article_category,topic,cos_similarity
0,230,439,fox,7,0.875211
1,230,441,fox,7,0.949309
2,230,451,fox,7,0.858029
3,230,467,fox,7,0.866770
4,230,480,fox,7,0.839578
5,230,482,fox,7,0.841156
6,230,484,fox,7,0.819665
7,230,485,fox,7,0.866371
8,230,486,fox,7,0.817849
9,230,492,fox,7,0.860888


In [300]:
%%time
topic7_similarity_cnn_rt_df = pd.DataFrame(columns=['cnn_article', 'target_article', 'target_article_category', 'cos_similarity'])
for cnn_idx in cnn_topic7_idx:
    for rt_idx in rt_topic7_idx:
        row = {'cnn_article': cnn_idx,
               'target_article': rt_idx,
               'target_article_category': 'rt',
               'cos_similarity': cos_sim_mat[cnn_idx, rt_idx]}
        topic7_similarity_cnn_rt_df = topic7_similarity_cnn_rt_df.append(row, ignore_index=True)

CPU times: user 29.2 s, sys: 103 ms, total: 29.3 s
Wall time: 29.4 s


In [302]:
topic7_similarity_cnn_rt_df

Unnamed: 0,cnn_article,target_article,target_article_category,cos_similarity
0,230,571,rt,0.792108
1,230,575,rt,0.842052
2,230,578,rt,0.851453
3,230,592,rt,0.871155
4,230,598,rt,0.892496
5,230,607,rt,0.878043
6,230,624,rt,0.883142
7,230,626,rt,0.903254
8,230,632,rt,0.866110
9,230,634,rt,0.880412


In [296]:
topic7_similarity_cnn_rt_df

Unnamed: 0,cnn_article,target_article,target_article_category,cos_similarity
0,230,571,rt,0.792108
1,230,575,rt,0.842052
2,230,578,rt,0.851453
3,230,592,rt,0.871155
4,230,598,rt,0.892496
5,230,607,rt,0.878043
6,230,624,rt,0.883142
7,230,626,rt,0.903254
8,230,632,rt,0.866110
9,230,634,rt,0.880412


In [290]:
topic7_similarity_cnn_fox_rt_df = topic7_similarity_cnn_fox_df.append(topic7_similarity_cnn_rt_df)

In [293]:
topic7_similarity_cnn_fox_rt_df

Unnamed: 0,cnn_article,target_article,target_article_category,cos_similarity
0,230,439,fox,0.875211
1,230,441,fox,0.949309
2,230,451,fox,0.858029
3,230,467,fox,0.866770
4,230,480,fox,0.839578
5,230,482,fox,0.841156
6,230,484,fox,0.819665
7,230,485,fox,0.866371
8,230,486,fox,0.817849
9,230,492,fox,0.860888


In [264]:
topic7_cossim_df = pd.DataFrame(data=cnn_topic7_idx, columns=['cnn_article'])
topic7_cossim_df.set_index('cnn_article', inplace=True)

In [265]:
topic7_cossim_df

230
245
258
287
305
320
344
359
402
405
414
