In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell is executed, so edits to the local `src` package are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import ast
import torch
import pandas as pd

from src.utils.NewsEvent import NewsEvent
from src.utils.NewsArticle import NewsArticle

In [3]:
# Directory prefix that load_events() prepends to the CSV file name it is given.
DIR_NAME = "../data/final/multi/"

## Data loading methods


In [4]:
def literal_converter(val):
    """Parse ``val`` as a Python literal, falling back to ``val`` itself.

    Args:
        val: Value to hand to ``ast.literal_eval``.

    Returns:
        The evaluated literal, or ``val`` unchanged when ``ast.literal_eval``
        raises any ``Exception``.
    """
    try:
        parsed = ast.literal_eval(val)
    except Exception:
        # Not a valid literal: hand the input back untouched.
        return val
    else:
        return parsed

In [5]:
def load_events(input_file):
    """Load one article CSV and group its rows into ``NewsEvent`` objects.

    The file ``DIR_NAME + input_file`` is read with a fixed column list; its
    first row is skipped and malformed lines only produce a warning. Rows
    without a title are discarded, remaining missing values are replaced by
    ``None``, the rows are ordered by ``dateTime`` and the ``id`` column is
    overwritten with consecutive integers in that order. One ``NewsEvent`` is
    then built per distinct ``clusterId``.

    Rows whose ``clusterId`` is missing are not assigned to any event (and no
    empty event is created for them).

    Args:
        input_file: File name of the CSV, appended to ``DIR_NAME``.

    Returns:
        A ``(df, events)`` tuple: the cleaned, time-sorted DataFrame and the
        list of events sorted by ``time_interval["min"]``.
    """
    columns = [
        "id",
        "title",
        "body",
        "lang",
        "source",
        "dateTime",
        "url",
        "uri",
        "eventUri",
        "concepts",
        "clusterId",
        "namedEntities",
        "wikiConcepts",
    ]
    # Every column is read as "str" except the nullable-integer id and the
    # "string"-typed concepts column; "dateTime" is additionally listed in
    # parse_dates below.
    dtypes = {name: "str" for name in columns}
    dtypes["id"] = "Int64"
    dtypes["concepts"] = "string"

    df = pd.read_csv(
        DIR_NAME + input_file,
        names=columns,
        dtype=dtypes,
        parse_dates=["dateTime"],
        on_bad_lines="warn",
        engine="python",
        skiprows=1,
    )

    # dataframe cleanup (notnull is an alias of notna, so one mask suffices)
    df = df.drop(columns=["wikiConcepts", "namedEntities"])
    df = df[df["title"].notna()]
    df = df.where(df.notna(), None)

    # dataframe sorting and init
    df = df.sort_values(by="dateTime")
    df["id"] = range(len(df))

    # A single groupby pass replaces re-filtering the whole frame once per
    # cluster id. sort=False keeps clusters in order of first appearance and
    # rows inside each cluster keep their time-sorted order.
    events = [
        NewsEvent(
            articles=[NewsArticle(a) for a in cluster_df.to_dict("records")],
            use_ne=False,
        )
        for _, cluster_df in df.groupby("clusterId", sort=False)
    ]
    events = sorted(events, key=lambda e: e.time_interval["min"])

    return df, events

## Print methods


In [63]:
def print_events_min(event, article):
    """Print a one-line summary of ``article`` in the context of ``event``.

    The line contains the event's ``cluster_id``, then the article's
    ``event_id``, ``uri``, ``lang``, ``get_time()`` result and ``title``.

    Args:
        event: Object providing ``cluster_id``.
        article: Object providing ``event_id``, ``uri``, ``lang``, ``title``
            and a ``get_time()`` method.
    """
    summary = f"{event.cluster_id:<9}  {str(article.event_id):<12} - {article.uri}  {article.lang}  {article.get_time()} - {article.title}"
    print(summary)

In [7]:
def print_events_max(event, article):
    """Print a labelled, multi-line dump of ``article`` including its body.

    Args:
        event: Object providing ``cluster_id`` (printed as ``WN_ID``).
        article: Object providing ``uri``, ``lang``, ``event_id`` (printed as
            ``ER_ID``), ``title``, ``body`` and a ``get_time()`` method.
    """

    def _rows():
        # Lazily produced so each field is read right before it is printed.
        yield f"URI:     {article.uri}"
        yield f"LANG:    {article.lang}"
        yield f"WN_ID:   {event.cluster_id}"
        yield f"ER_ID:   {str(article.event_id)}"
        yield f"TIME:    {article.get_time()}"
        yield f"TITLE:   {article.title}"
        yield f"CONTENT: {article.body}\n"

    for row in _rows():
        print(row)

In [8]:
def print_events(events, func=print_events_min):
    """Print every article of every event, with a separator line per event.

    Args:
        events: Iterable of objects exposing an ``articles`` iterable.
        func: Callable invoked as ``func(event, article)`` for each article;
            defaults to ``print_events_min``.
    """
    divider = "-------------------------------------------------------------------"
    for current_event in events:
        print(divider)
        for current_article in current_event.articles:
            func(current_event, current_article)

## Cluster metrics


In [48]:
def get_iqr_outliers(event):
    """Return the positions of intra-event distances outside the IQR fences.

    A distance is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, with quartiles computed by linear interpolation.

    Args:
        event: Object exposing ``articles`` and ``get_intra_distances()``;
            the latter must return a mapping with a ``"distances"`` entry.
            Assumed to be a flat sequence of numbers - TODO confirm against
            ``NewsEvent``.

    Returns:
        A list of integer positions into the distances sequence, in ascending
        order. Empty when the event has fewer than two articles or there are
        no distances.
    """
    # Guard first so degenerate events never trigger the distance computation.
    if len(event.articles) < 2:
        return []

    distances = torch.tensor(event.get_intra_distances()["distances"])
    if distances.numel() == 0:
        return []
    # torch.quantile only accepts floating-point input.
    if not distances.is_floating_point():
        distances = distances.to(torch.get_default_dtype())

    # The q tensor has to share the input's dtype.
    probs = torch.tensor([0.25, 0.75], dtype=distances.dtype)
    q1, q3 = torch.quantile(distances, q=probs, interpolation="linear")
    iqr = q3 - q1
    upper_fence = q3 + 1.5 * iqr
    lower_fence = q1 - 1.5 * iqr

    # A value is an outlier if it is beyond EITHER fence. The mask is built on
    # the unsorted tensor so the reported positions match the input order.
    is_outlier = (distances < lower_fence) | (distances > upper_fence)
    return torch.where(is_outlier)[0].tolist()

In [40]:
def get_z_scores(event):
    """Compute the z-score of each intra-event distance.

    Each score is ``(distance - mean) / std`` using ``torch.std`` with its
    default settings. If all distances are equal the standard deviation is
    zero and the resulting scores are NaN.

    Args:
        event: Object exposing ``articles`` and ``get_intra_distances()``;
            the latter must return a mapping with a ``"distances"`` entry.

    Returns:
        A tensor of z-scores, one per distance. For events with fewer than
        two articles an empty list is returned instead.
    """
    if len(event.articles) < 2:
        return []

    distances = torch.tensor(event.get_intra_distances()["distances"])
    # torch.std / torch.mean reject integer tensors.
    if not distances.is_floating_point():
        distances = distances.to(torch.get_default_dtype())
    std = torch.std(distances)
    mean = torch.mean(distances)
    return (distances - mean) / std


def get_z_score_outliers(event, threshold=3.0):
    """Return the positions whose absolute z-score exceeds ``threshold``.

    Args:
        event: Event passed through to ``get_z_scores``.
        threshold: Cut-off applied to ``abs(z_score)``; defaults to 3.0.

    Returns:
        A list of integer positions in ascending order. Empty when the event
        is too small for z-scores to be computed.
    """
    z_scores = get_z_scores(event)
    # get_z_scores hands back a plain empty list for events with fewer than
    # two articles; torch.abs cannot operate on a list.
    if len(z_scores) == 0:
        return []
    return torch.where(torch.abs(z_scores) > threshold)[0].tolist()

# Kobe Bryant Helicopter Crash


In [11]:
# Read kobe_bryant__helicopter.csv (relative to DIR_NAME) and build the
# cleaned DataFrame together with the per-cluster events.
df, events = load_events("kobe_bryant__helicopter.csv")

In [12]:
# Display the loaded DataFrame as this cell's output.
df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
0,0,Chol Marial's Long Journey to College Basketball,The Long Journey of Maryland 7-Footer Chol Mar...,eng,Sports Illustrated,2020-01-03 17:21:00,https://www.si.com/college/2020/01/03/chol-mar...,1397607188,eng-5399258,"['kobe_bryant', 'helicopter']",wn-1
1,1,"Kobe Bryant Dead at 41, Killed in Helicopter C...",0comments Retired Los Angeles Lakers star Kobe...,eng,Celebrity,2020-01-26 21:38:00,https://popculture.com/sports/2020/01/26/kobe-...,1424279779,eng-5461522,"['kobe_bryant', 'helicopter']",wn-2
2,2,Kobe Bryant killed in helicopter crash in Cali...,NBA legend Kobe Bryant was killed in a helicop...,eng,FOX 5 Atlanta,2020-01-26 21:41:00,https://www.fox5atlanta.com/news/kobe-bryant-k...,1424281805,eng-5461522,"['kobe_bryant', 'helicopter']",wn-2
3,3,Kobe Bryant Dies In Helicopter Crash,TMZ has confirmed that basketball legend Kobe ...,eng,Zero Hedge,2020-01-26 21:42:00,https://www.zerohedge.com/economics/kobe-bryan...,1424282612,eng-5461522,"['kobe_bryant', 'helicopter']",wn-2
22,4,NBA Legend Kobe Bryant Killed in Helicopter Crash,Photo: Andrew D. Bernstein/NBAE via Getty Imag...,eng,Vulture,2020-01-26 21:43:00,https://www.vulture.com/2020/01/kobe-bryant-de...,1424283042,eng-5461522,"['kobe_bryant', 'helicopter']",wn-2
...,...,...,...,...,...,...,...,...,...,...,...
26476,26489,Vanessa Bryant Shares Photo Of Kobe Wearing T-...,Vanessa Bryant has shared a photograph of her ...,eng,UNILAD,2020-06-01 16:01:00,https://www.unilad.co.uk/news/vanessa-bryant-s...,6087869659,eng-5813746,"['kobe_bryant', 'helicopter']",wn-6208
26480,26490,Baloncesto: Vanessa Bryant se suma a las prote...,En medio de los disturbios en todo Estados Uni...,spa,Mundo Deportivo,2020-06-01 18:28:00,https://www.mundodeportivo.com/baloncesto/nba/...,6088030151,,"['kobe_bryant', 'helicopter']",wn-6208
26492,26491,The pandemic isn't the only thing that shook t...,(CNN) -- It's hard to remember a time when the...,eng,WHDH 7 Boston,2020-06-01 18:52:00,https://whdh.com/news/the-pandemic-isnt-the-on...,6088052507,,"['kobe_bryant', 'helicopter']",wn-6217
26493,26492,"#BlackLivesMatter: Michael Jordan, Liverpool y...",Fue a través de una foto en sus redes sociales...,spa,Sopitas.com,2020-06-01 20:26:00,https://www.sopitas.com/deportes/mensaje-liver...,6088150322,spa-1941208,"['kobe_bryant', 'helicopter']",wn-6218


## Find event outliers

In [None]:
# Pick the second event of the list returned by load_events(), which is
# sorted by each event's time_interval["min"].
event = events[1]

In [53]:
# Build a new event that contains only the articles tagged lang == "eng".
# NOTE(review): load_events() constructs events with use_ne=False, while this
# call relies on NewsEvent's default for use_ne - confirm that is intended.
eng_event = NewsEvent(articles=[article for article in event.articles if article.lang == "eng"])

In [57]:


# Positions flagged by the IQR fence test on the English-only event.
outlier_ids = get_iqr_outliers(eng_event)
print(outlier_ids)


# The flagged positions are used as indices into eng_event.articles.
# NOTE(review): `event` (not `eng_event`) is passed as the event argument
# here, unlike the z-score cell that follows - confirm this is intended.
for idx in outlier_ids:
    print_events_min(event, eng_event.articles[idx])

[]


In [64]:
# Positions whose absolute z-score exceeds 0.5 (the function's default
# threshold is 3.0).
outlier_ids = get_z_score_outliers(eng_event, threshold = 0.5)

# Print every article whose index was NOT flagged, i.e. the ones that stay
# within the threshold.
for idx, article in enumerate(eng_event.articles):
    if idx not in outlier_ids:
        print_events_min(eng_event, article)


wn-2       eng-5461522  - 1424282612  eng  2020-01-26 22:42:00 - Kobe Bryant Dies In Helicopter Crash
wn-2       eng-5461522  - 1424285954  eng  2020-01-26 22:48:00 - Breaking News! Kobe Bryant DEAD After Helicopter Crash in Calabasas
wn-2       eng-5461522  - 1424286745  eng  2020-01-26 22:50:00 - Kobe Bryant Killed In Helicopter Crash
wn-2       eng-5461522  - 1424288011  eng  2020-01-26 22:51:00 - Report: Kobe Bryant Has Died In A Helicopter Accident - ROCK 92.9 Rocks
wn-2       eng-5461522  - 1424287840  eng  2020-01-26 22:51:00 - Kobe Bryant Dies In California Helicopter Crash * The Hollywood Unlocked
wn-2       eng-5461522  - 1424288314  eng  2020-01-26 22:51:00 - NBA News: Kobe Bryant Dies In Helicopter Crash | Get More Sports
wn-2       eng-5461522  - 1424290491  eng  2020-01-26 22:54:00 - KOBE BRYANT KILLED IN HELICOPTER CRASH
wn-2       eng-5461522  - 1424291117  eng  2020-01-26 22:55:00 - Kobe Bryant Cause of Death: How Did NBA Superstar Die?
wn-2       eng-5461522  - 142429