In [1]:
# enable automatic reloading of imported modules before each cell execution,
# so edits to the project's source files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# News Clustering

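This notebook clusters news articles into events: it loads the processed article dataset, wraps each record in a `NewsArticle`, and feeds the articles to a `NewsEventMonitor` that groups them into events.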

## Data Loading

In [2]:
# import the dataset loader
from src.data.dataset import load_dataset, DATA_PATHS

In [3]:
# load the processed articles (the "processed" entry of DATA_PATHS)
dataset = load_dataset(DATA_PATHS["processed"])

### Dataset Statistics

In [4]:
# sanity check: total number of articles in the loaded dataset
len(dataset)

1750325

In [5]:
# inspect the first record to see which fields each article provides
dataset[0]

{'uri': '1395062594',
 'lang': 'eng',
 'isDuplicate': False,
 'date': datetime.datetime(2020, 1, 1, 0, 0),
 'time': '00:01:00',
 'dateTime': datetime.datetime(2020, 1, 1, 0, 1),
 'dateTimePub': datetime.datetime(2020, 1, 1, 0, 1),
 'dataType': 'news',
 'sim': 0,
 'url': 'https://www.bostonglobe.com/sports/2019/12/31/yearinsports/6YOUwYiMDlIqNQSKl2V13M/story.html',
 'title': "A look back at Boston's year in sports - The Boston Globe",
 'body': "The Bruins exceeded expectations and came tantalizingly close to capturing a second title for the city, rolling all the way to the Stanley Cup Final before succumbing to the St. Louis Blues in seven games. Meanwhile, the Red Sox and Celtics underachieved, submitting disappointing performances in seasons in which both were expected to compete for a title.\n\nThere were notable figures who left town, whether it was a franchise tight end choosing to call it a career or a pair of All-Stars departing via free agency to compete for division rivals.\n\n

## Experiments

### Prepare News Articles

In [6]:
from src.utils.NewsArticle import NewsArticle

In [7]:
# wrap every dataset record in a NewsArticle object
articles = list(map(NewsArticle, dataset))

In [8]:
# preview the first NewsArticle to verify the conversion
articles[0]

NewsArticle(
  title=A look back at Boston's year in sports - The Boston Globe,
  lang=eng,
  source=The Boston Globe,
  time=1577833260.0,
  url=https://www.bostonglobe.com/sports/2019/12/31/yearinsports/6YOUwYiMDlIqNQSKl2V13M/story.html,
)

In [9]:
# preview another converted article (index 4)
articles[4]

NewsArticle(
  title=Hot Button: Dorsey out as Browns GM ... Rhule denies interest ... Browns to interview Roman ... Gase won't ring Bell,
  lang=eng,
  source=DKPittsburghSports.com,
  time=1577833320.0,
  url=https://www.dkpittsburghsports.com/2019/12/31/new-hot-button/,
)

### Prepare News Event Monitor

In [10]:
from src.utils.NewsEventMonitor import NewsEventMonitor

In [14]:
# Hyperparameters for the event monitor created in the next cell.
# NOTE(review): the meanings below are inferred from the names - confirm against NewsEventMonitor.
sim_threshold: float = 0.5  # presumably the minimum similarity for an article to join an event
time_threshold_in_days: int = 3  # presumably the maximum time gap, in days, between an article and an event
time_compare_stat: str = "min"  # presumably which statistic of an event's article times is compared

In [15]:
# instantiate the monitor with the thresholds configured in the previous cell
event_monitor = NewsEventMonitor(
    sim_threshold=sim_threshold,
    time_threshold_in_days=time_threshold_in_days,
    time_compare_stat=time_compare_stat,
)

### Cluster News Articles

In [16]:
from tqdm.notebook import tqdm

In [17]:
# Feed only a prefix of the articles so the experiment finishes quickly;
# raise `max_articles` (or set it to None to process every article).
# NOTE(review): this loop presumably mutates `event_monitor`, so re-running the
# cell would feed the same articles again - re-create the monitor first.
max_articles = 1000
for article in tqdm(articles[:max_articles], desc="Article Feed"):
    event_monitor.update(article)

Article Feed:   0%|          | 0/1000 [00:00<?, ?it/s]

In [18]:
# retrieve the events held by the monitor; each event exposes its clustered articles via `.articles`
events = event_monitor.get_events()

In [22]:
# inspect the articles assigned to the event at index 3
events[3].articles

[NewsArticle(
   title=LeBron James es elegido el mejor deportista de la década,
   lang=spa,
   source=Líbero Perú,
   time=1577833320.0,
   url=https://libero.pe/deportes/basquet/1528528-lebron-james-basquetbolista-nombrado-mejor-deportista-decada-ap-nba,
 ),
 NewsArticle(
   title=LeBron James' HS Rival Reveals Lakers' Anthony Davis Is Tim Duncan's Clone,
   lang=eng,
   source=Heavy,
   time=1577833980.0,
   url=https://heavy.com/sports/2019/12/tim-duncan-anthony-davis/,
 ),
 NewsArticle(
   title=LeBron disputes eighth grade yearbook's "Most Athletic" choice,
   lang=eng,
   source=ESPN.com,
   time=1577836380.0,
   url=https://www.espn.com/nba/story/_/id/28401837/despite-yearbook-claim-lebron-says-was-most-athletic-eighth-grader-class,
 ),
 NewsArticle(
   title=George just misses triple-double, Clippers beat Kings 105-87,
   lang=eng,
   source=ABS-CBN News,
   time=1577836740.0,
   url=http://sports.abs-cbn.com/nba/news/2020/01/01/george-just-misses-triple-double-clippers-beat-