# News Headline Analysis

In this project we're analyzing news headlines produced by two journalists:

- A finance reporter from the Business Insider who covers market updates (Akin Oyedele)
- A celebrity reporter from the Huffington Post (Carly Ledbetter)

Our objective is to find similarity and differences between the ways through which these reporters structure their headlines.

In [1]:
from pattern.en import parsetree

In [2]:
s = parsetree('The cat sat on the mat.')
for sentence in s:
    for chunk in sentence.chunks:
        print chunk.type, [(w.string, w.type) for w in chunk.words]

NP [(u'The', u'DT'), (u'cat', u'NN')]
VP [(u'sat', u'VBD')]
PP [(u'on', u'IN')]
NP [(u'the', u'DT'), (u'mat', u'NN')]


![The cat sat on the mat](https://raw.githubusercontent.com/AYLIEN/headline_analysis/master/parsetree.png)

In [3]:
import cPickle as pickle
author1 = pickle.load( open( "author1.p", "rb" ) )
author1[0]

{u'title': u"One corner of the real-estate market might've peaked"}

In [4]:
for story in author1:
    story["title_length"] = len(story["title"])
    story["title_chunks"] = [chunk.type for chunk in parsetree(story["title"])[0].chunks]
    story["title_chunks_length"] = len(story["title_chunks"])

In [5]:
author1[0]

{u'title': u"One corner of the real-estate market might've peaked",
 'title_chunks': [u'NP', u'PP', u'NP', u'VP'],
 'title_chunks_length': 4,
 'title_length': 52}

In [6]:
import pandas as pd

df1 = pd.DataFrame.from_dict(author1)

In [7]:
df1.describe()

Unnamed: 0,title_chunks_length,title_length
count,700.0,700.0
mean,5.691429,57.73
std,3.762884,28.035283
min,1.0,9.0
25%,2.0,35.0
50%,5.0,53.0
75%,7.0,77.0
max,30.0,188.0


In [8]:
import difflib
v1 = author1[3]["title_chunks"]
v2 = author1[1]["title_chunks"]
sm=difflib.SequenceMatcher(None,v1,v2)
print v1, v2, sm.ratio()

[u'NP', u'NP', u'VP', u'NP', u'NP', u'VP', u'PP'] [u'NP', u'VP', u'NP', u'PP', u'NP', u'NP'] 0.615384615385


In [9]:
import numpy as np
chunks = [author["title_chunks"] for author in author1]
m = np.zeros((700,700))
for i, chunkx in enumerate(chunks):
    for j, chunky in enumerate(chunks):
        sm=difflib.SequenceMatcher(None,chunkx,chunky)
        m[i][j] = sm.ratio()

In [10]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)

In [11]:
tsne = tsne_model.fit_transform(m)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 700 / 700
[t-SNE] Mean sigma: 0.000000
[t-SNE] Error after 83 iterations with early exaggeration: 13.379313
[t-SNE] Error after 144 iterations: 0.633875


In [12]:
from sklearn.cluster import MiniBatchKMeans

kmeans_model = MiniBatchKMeans(n_clusters=5, init='k-means++', n_init=1, 
                         init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
kmeans = kmeans_model.fit(m)
kmeans_clusters = kmeans.predict(m)
kmeans_distances = kmeans.transform(m)

In [13]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", 
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5", 
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f", 
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

output_notebook()
plot_author1 = bp.figure(plot_width=900, plot_height=700, title="Author1",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_author1.scatter(x=tsne[:,0], y=tsne[:,1],
                    color=colormap[kmeans_clusters],
                    source=bp.ColumnDataSource({
                        "chunks": [x["title_chunks"] for x in author1], 
                        "title": [x["title"] for x in author1],
                        "cluster": kmeans_clusters
                    }))

hover = plot_author1.select(dict(type=HoverTool))
hover.tooltips={"chunks": "@chunks (title: \"@title\")", "cluster": "@cluster"}
show(plot_author1)

In [14]:
author2 = pickle.load( open( "author2.p", "rb" ) )
for story in author2:
    story["title_length"] = len(story["title"])
    story["title_chunks"] = [chunk.type for chunk in parsetree(story["title"])[0].chunks]
    story["title_chunks_length"] = len(story["title_chunks"])

In [15]:
pd.DataFrame.from_dict(author2).describe()

Unnamed: 0,title_chunks_length,title_length
count,700.0,700.0
mean,5.452857,62.532857
std,1.896252,9.996154
min,1.0,35.0
25%,4.0,57.0
50%,5.0,62.0
75%,7.0,68.0
max,13.0,96.0


In [16]:
chunks_joint = [author["title_chunks"] for author in (author1+author2)]
print len(chunks_joint)
m_joint = np.zeros((1400,1400))
for i, chunkx in enumerate(chunks):
    for j, chunky in enumerate(chunks):
        sm=difflib.SequenceMatcher(None,chunkx,chunky)
        m_joint[i][j] = sm.ratio()

1400


In [17]:
set1= [author["title_chunks"] for author in author1]
set2= [author["title_chunks"] for author in author2]
list_new = [itm for itm in set1 if itm in set2]
len(list_new)

347

In [18]:
tsne_joint = tsne_model.fit_transform(m_joint)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1400
[t-SNE] Computed conditional probabilities for sample 1400 / 1400
[t-SNE] Mean sigma: 0.000000
[t-SNE] Error after 83 iterations with early exaggeration: 12.344255
[t-SNE] Error after 139 iterations: 0.789386


In [19]:
plot_joint = bp.figure(plot_width=900, plot_height=700, title="Author1 vs. Author2",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_joint.scatter(x=tsne_joint[:,0], y=tsne_joint[:,1],
                    color=colormap[([0] * 700 + [1] * 700)],
                    source=bp.ColumnDataSource({
                        "chunks": [x["title_chunks"] for x in author1] + [x["title_chunks"] for x in author2], 
                        "title": [x["title"] for x in author1] + [x["title"] for x in author2]
                    }))

hover = plot_joint.select(dict(type=HoverTool))
hover.tooltips={"chunks": "@chunks (title: \"@title\")"}
show(plot_joint)