In [62]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook, tqdm
import matplotlib.pyplot as plt

import spacy
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances_argmin_min

In [None]:
# Load the large English pipeline. The analysis below relies on pre-trained
# 300-dimensional word vectors, which the small model (en_core_web_sm) does not ship.
# Install once with: python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

In [4]:
from newsapi import NewsApiClient
from datetime import datetime as dt
from datetime import timedelta

In [5]:
# Read the NewsAPI key from the environment so the credential is never stored in the
# notebook. Any key that was previously committed here should be revoked and rotated.
import os

key = os.environ.get('NEWSAPI_KEY')
if not key:
    raise RuntimeError('Set the NEWSAPI_KEY environment variable to your NewsAPI key.')
newsapi = NewsApiClient(api_key=key)

In [6]:
def get_past_articles(past):
    """Build consecutive one-day time windows going back from the current time.

    Parameters
    ----------
    past : int
        Upper bound (exclusive) of the day offsets. Offsets 1 .. past - 1 are
        generated, so the result holds ``past - 1`` windows (none when past <= 1).

    Returns
    -------
    dict
        Maps each window start to its window end. Both are ``str(datetime)``
        values; the window for offset ``k`` runs from ``now - k days`` to
        ``now - (k - 1) days``.
    """
    # Read the clock once: calling dt.now() for every boundary makes the edges
    # drift by a few microseconds, so adjacent windows would not line up exactly.
    now = dt.now()
    past_articles = dict()
    for past_days in range(1, past):
        from_day = str(now - timedelta(days=past_days))
        to_day = str(now - timedelta(days=past_days - 1))
        past_articles[from_day] = to_day
    return past_articles

def get_artciles(query, past=30):
    """Collect English articles matching `query` from NewsAPI, window by window.

    The time windows come from ``get_past_articles(past)``. For every window,
    result pages 1 to 5 are requested in order (sorted by relevancy) and paging
    for that window stops at the first empty page.

    Returns a flat list with the article entries of all fetched pages.
    """
    collected = []
    day_windows = get_past_articles(past)
    for window_start, window_end in tqdm_notebook(day_windows.items()):
        for page_number in tqdm_notebook(range(1, 6)):
            response = newsapi.get_everything(
                q=query,
                language='en',
                from_param=window_start,
                to=window_end,
                sort_by='relevancy',
                page=page_number,
            )
            page_articles = response['articles']
            # Nothing on this page: the window is exhausted.
            if len(page_articles) == 0:
                break
            collected += page_articles
    return collected

In [48]:
# First day of the download window, as a 'YYYY-MM-DD' string
# (it is parsed with the '%Y-%m-%d' format wherever it is used).
start_date = '2020-01-04'

In [41]:
# Sanity check of the date arithmetic: the day after start_date,
# rendered back into the same 'YYYY-MM-DD' format.
end_date = dt.strptime(start_date, '%Y-%m-%d') + timedelta(days=1)
end_date.strftime("%Y-%m-%d") 

'2020-01-05'

In [81]:
# Download up to five result pages for each of the 30 days starting at start_date.
all_articles = []

# Parse the start date once; it does not change between iterations.
first_day = dt.strptime(start_date, '%Y-%m-%d')

for i in tqdm(range(0, 30)):
    # Each request covers from_date up to the following day.
    from_date = (first_day + timedelta(days=i)).strftime("%Y-%m-%d")
    end_date = (first_day + timedelta(days=i + 1)).strftime("%Y-%m-%d")

    for p in range(1, 6):
        articles = newsapi.get_everything(
            sources='bbc-news,the-verge',
            domains='bbc.co.uk,techcrunch.com',
            from_param=from_date,
            to=end_date,
            language='en',
            sort_by='relevancy',
            page=p)['articles']
        # An empty page means this day's results are exhausted, so skip the
        # remaining page requests (same rule as in get_artciles).
        if len(articles) == 0:
            break
        all_articles.extend(articles)

100%|██████████| 30/30 [00:16<00:00,  1.84it/s]


In [82]:
# One row per downloaded article; the trailing expression shows (rows, columns).
df_articles = pd.DataFrame(all_articles)
df_articles.shape

(3000, 8)

In [83]:
# Keep a single row per distinct 'content' value; the other columns are not compared.
df_articles = df_articles.drop_duplicates(subset='content')

In [87]:
# Save the de-duplicated articles to disk, without writing the index column.
df_articles.to_csv('../data/news_api.csv', index=False)

# 1. Load Data

In [None]:
# Load the dataset used for the rest of the analysis.
# NOTE(review): this reads nyt_data_20200202.csv, not the news_api.csv file
# written earlier in this notebook — confirm this is the intended input.
df = pd.read_csv('../data/nyt_data_20200202.csv')

In [None]:
# Parse the publication timestamps, then discard rows that are exact duplicates.
df = (
    df
    .assign(pub_date=pd.to_datetime(df['pub_date']))
    .drop_duplicates()
)

In [None]:
# Preview the first rows to sanity-check the loaded data.
df.head()

In [None]:
# (rows, columns) of the frame at this point.
df.shape

# 2. Text Analysis

To give meaning to individual words and, consequently, to whole sentences, we’ll use spaCy’s pre-trained word-embedding models — more specifically, spaCy’s large model (en_core_web_lg), which has pre-trained word vectors for 685k English words. Alternatively, you could use any pre-trained word representation model (Word2Vec, FastText, GloVe…).

By default, SpaCy considers a sentence’s vector as the average between every word’s vector. It’s a simplistic approach that doesn’t take into account the order of words to determine a sentence’s vector.

In [None]:
# Embed every headline with spaCy.
# sent_vecs maps headline text -> document vector; docs keeps the parsed Doc objects.
sent_vecs = {}
docs = []

for headline in tqdm_notebook(df['headline']):
    doc = nlp(headline)
    docs.append(doc)
    # Key by the headline text itself. Using the literal string 'headline' as the
    # key would overwrite one single entry on every iteration.
    sent_vecs[headline] = doc.vector

# Parallel lists: sentences[k] is the headline whose vector is vectors[k].
sentences = list(sent_vecs.keys())
vectors = list(sent_vecs.values())

# 3. Clustering

The epsilon (eps) parameter sets the maximum distance between two samples for them to be considered part of the same neighborhood. If eps is too large, fewer clusters will be formed; if it is too small, most of the points will be classified as not belonging to any cluster (label -1), which also results in few clusters.

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

In [None]:
# Sweep eps from 0.001 up to (but excluding) 1 in steps of 0.001 and record,
# for each value, how many distinct labels DBSCAN assigns.
x = np.array(vectors)

n_classes = {}

for i in tqdm_notebook(np.arange(0.001, 1, 0.001)):
    dbscan = DBSCAN(metric='cosine', min_samples=1, eps=i)
    dbscan.fit(x)
    n_classes[i] = len(np.unique(dbscan.labels_))

In [None]:
# Number of distinct DBSCAN labels as a function of eps. Axis labels and a title
# are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(list(n_classes.keys()), list(n_classes.values()))
ax.set_xlabel('eps (cosine distance)')
ax.set_ylabel('number of distinct labels')
ax.set_title('DBSCAN label count vs. eps')
plt.show()

Tuning the eps value is one of the most delicate steps, because the outcome varies a lot depending on how similar you want the sentences in a cluster to be. The right value comes from experimentation: look for a value that preserves the similarities between sentences without splitting close sentences into different groups.

In general, since we want to end up with very similar sentences in the same cluster, the target should be a value that returns a higher number of classes.

In [None]:
# Final clustering with the chosen settings; the trailing expression shows the fitted estimator.
dbscan = DBSCAN(metric='cosine', eps=0.08, min_samples=2)
dbscan.fit(x)
dbscan

In [None]:
# Pair every sentence with its cluster label.
result = pd.DataFrame({'label': dbscan.labels_,
                       'sent': sentences})

# Example: the sentences that were assigned to cluster 1.
example_result = result.loc[result['label'] == 1, 'sent'].tolist()

# Build the event table as a new frame in one chain. Assigning a column on a
# chained-indexing slice of df would trigger pandas' SettingWithCopyWarning.
event_df = (
    df.loc[df['headline'].isin(example_result), ['pub_date', 'headline']]
    .assign(pub_date=lambda frame: pd.to_datetime(frame['pub_date']))
    .sort_values(by='pub_date')
    .dropna()
)

# 4. Transform to Events

The next step is to arrange those sentences in time and to filter them by relevance.

Since there are many titles about the same topic every day, we need a criterion to pick one among them. It should be the sentence that best represents the event, one that captures the core message those titles refer to.

In order to achieve that, we can group the daily sentences, and for each group (or cluster), choose the one closest to the cluster center.

In [None]:
def get_mean_vector(sents):
    """Return the element-wise mean of the spaCy document vectors of `sents`.

    Parameters
    ----------
    sents : iterable of str
        Sentences to embed with the module-level ``nlp`` pipeline.

    Returns
    -------
    numpy.ndarray
        float64 vector with the same length as the pipeline's document vectors.

    Raises
    ------
    ValueError
        If `sents` is empty (a mean over zero vectors is undefined).
    """
    vecs = [nlp(sent).vector for sent in sents]
    if not vecs:
        raise ValueError('get_mean_vector() needs at least one sentence')
    # The vector width comes from the data instead of a hard-coded 300, so any
    # pipeline works. Accumulating in float64 matches the previous running sum
    # that started from np.zeros.
    return np.mean(vecs, axis=0, dtype=np.float64)


def get_central_vector(sents):
    """Return the sentence whose vector lies closest to the mean vector of `sents`.

    Parameters
    ----------
    sents : iterable of str
        Candidate sentences, embedded with the module-level ``nlp`` pipeline.

    Returns
    -------
    str
        The element of `sents` nearest to the group's mean vector, as picked by
        ``pairwise_distances_argmin_min`` with its default metric.

    Raises
    ------
    ValueError
        If `sents` is empty.
    """
    # Materialise as a list so that the positional index returned below is valid
    # even when a pandas Series with a non-default index is passed in.
    sents = list(sents)
    if not sents:
        raise ValueError('get_central_vector() needs at least one sentence')
    # Embed each sentence exactly once (the loop variable, not an undefined
    # `title`), and reuse the vectors for the mean.
    vecs = np.array([nlp(sent).vector for sent in sents])
    mean_vec = vecs.mean(axis=0)
    index = pairwise_distances_argmin_min(np.array([mean_vec]),
                                          vecs)[0][0]
    return sents[index]