In [1]:
# Enable IPython's autoreload extension so that imported project modules
# (e.g. src.utils.*) are re-imported automatically before a cell is executed;
# mode 2 reloads all modules, not only those explicitly marked for reloading.
%load_ext autoreload
%autoreload 2

In [2]:
import ast
import pandas as pd
from tqdm.notebook import tqdm

from src.utils.NewsEventBase import NewsEventBase
from src.utils.NewsArticle import NewsArticle

In [3]:
# Directories the notebook reads from and writes to; update the paths for your setup.
# Both values are concatenated directly with a file name (no path joining is done),
# so they must keep their trailing slash.
INPUT_DIR_NAME = "../data/processed/manual_join/"
OUTPUT_DIR_NAME = "../data/final/"

In [4]:
def literal_converter(val):
    """Parse ``val`` as a Python literal, falling back to ``val`` unchanged.

    Args:
        val: The value to parse, typically the textual form of a literal
            such as ``"['a', 'b']"``.

    Returns:
        The object produced by ``ast.literal_eval`` when parsing succeeds;
        otherwise ``val`` itself, untouched.
    """
    try:
        parsed = ast.literal_eval(val)
    except Exception:
        # Not a valid literal (or not parseable at all): hand the input back as-is.
        return val
    else:
        return parsed

In [5]:
def create_events(df):
    """Build one event per distinct ``clusterId`` and order the events by ``min_time``.

    Args:
        df: DataFrame with a ``clusterId`` column; each row is converted to a
            dict and handed to ``NewsArticle``.

    Returns:
        A list of ``NewsEventBase`` objects, one per distinct ``clusterId``
        value, sorted ascending by each event's ``min_time`` attribute
        (presumably the earliest article time - confirm in NewsEventBase).
    """
    events = []
    for cluster_id in tqdm(df["clusterId"].unique(), desc="Events"):
        # All rows that belong to the current cluster, as plain dict records.
        cluster_records = df[df["clusterId"] == cluster_id].to_dict("records")
        cluster_articles = [NewsArticle(record) for record in cluster_records]
        events.append(NewsEventBase(articles=cluster_articles))

    # list.sort is stable, so events with equal min_time keep their cluster order.
    events.sort(key=lambda event: event.min_time)
    return events

In [6]:
def load_events(filename):
    """Read an article CSV from ``INPUT_DIR_NAME`` and build its events.

    The first line of the file is skipped (presumably a header row - confirm
    against the input files) and the columns are named explicitly instead.

    Args:
        filename: File name relative to ``INPUT_DIR_NAME``.

    Returns:
        A ``(df, events)`` tuple: the article DataFrame sorted by ``dateTime``
        with an added ``id`` column, and the list returned by
        ``create_events(df)``.
    """
    columns = [
        "title",
        "body",
        "lang",
        "source",
        "dateTime",
        "url",
        "uri",
        "eventUri",
        "concepts",
        "clusterId",
    ]
    # Every column is read as "str"; only "concepts" uses the "string" dtype.
    column_types = {name: "str" for name in columns}
    column_types["concepts"] = "string"

    df = pd.read_csv(
        INPUT_DIR_NAME + filename,
        names=columns,
        dtype=column_types,
        parse_dates=["dateTime"],
        on_bad_lines="warn",
        engine="python",
        skiprows=1,
    )

    # dataframe cleanup: replace missing values with None
    df = df.where(df.notna(), None)
    # "id" records the row position in file order (assigned before sorting)
    df["id"] = list(range(len(df)))

    # dataframe sorting and init
    df = df.sort_values(by="dateTime")
    return df, create_events(df)

In [7]:
def print_events_min(event, article):
    """Print a compact one-line summary of ``article`` within ``event``.

    The line holds the event's ``cluster_id`` (left-aligned, width 9), the
    article's ``event_id`` (as text, left-aligned, width 12), then the
    article's ``uri``, ``lang``, ``get_time()`` result and ``title``.

    Args:
        event: Object exposing ``cluster_id``.
        article: Object exposing ``event_id``, ``uri``, ``lang``, ``title``
            and a ``get_time()`` method.
    """
    cluster_column = format(event.cluster_id, "<9")
    event_column = format(str(article.event_id), "<12")
    summary = "{}  {}: {}  {}  {} - {}".format(
        cluster_column,
        event_column,
        article.uri,
        article.lang,
        article.get_time(),
        article.title,
    )
    print(summary)

In [8]:
def print_events_max(event, article):
    """Print a detailed, multi-line view of ``article`` within ``event``.

    One labelled line is written per field (uri, language, cluster id,
    event id, time, title, body), followed by an empty line.

    Args:
        event: Object exposing ``cluster_id``.
        article: Object exposing ``uri``, ``lang``, ``event_id``, ``title``,
            ``body`` and a ``get_time()`` method.
    """

    def emit(label, value, tail=""):
        # One labelled line; ``tail`` lets the last field append its blank line.
        print("{}{}{}".format(label, value, tail))

    emit("URI:     ", article.uri)
    emit("LANG:    ", article.lang)
    emit("WN_ID:   ", event.cluster_id)
    emit("ER_ID:   ", str(article.event_id))
    emit("TIME:    ", article.get_time())
    emit("TITLE:   ", article.title)
    emit("CONTENT: ", article.body, "\n")

In [9]:
def print_events(events, func=print_events_min, min_articles=1, max_articles=None):
    """Print the articles of every event whose size lies within the given bounds.

    Each printed event starts with a separator line. Every article line is
    prefixed with a running article number (starting at 1 and counting only
    printed articles) and then rendered by ``func``.

    Args:
        events: Iterable of objects exposing an ``articles`` sequence.
        func: Callable ``func(event, article)`` that prints one article;
            defaults to ``print_events_min``.
        min_articles: Smallest number of articles an event must have to be
            printed (inclusive).
        max_articles: Largest number of articles an event may have to be
            printed (inclusive); ``None`` disables the upper bound.
    """
    article_number = 1
    for event in events:
        event_size = len(event.articles)
        # Skip events outside [min_articles, max_articles]. The previous
        # condition joined both checks with ``and``, so the upper bound never
        # filtered anything and the lower bound stopped working once
        # ``max_articles`` was set.
        too_small = event_size < min_articles
        too_large = max_articles is not None and event_size > max_articles
        if too_small or too_large:
            continue

        print('-------------------------------------------------------------------')
        for article in event.articles:
            print(f"{article_number:<6}", end=":  ")
            func(event, article)
            article_number += 1

In [10]:
# copy this for every dataset separately
# Template of the manual corrections consumed by do_manual_corrections, which
# applies the four sections in the order they are listed here.
manual_corr = {
    # {source clusterId: target clusterId} - rows of the source cluster are
    # re-labelled with the target clusterId (i.e. the clusters are merged)
    "cls_to_cls": {

    },
    # {article uri: clusterId} - the article with that uri is moved to the cluster
    "art_to_cls": {

    },
    # clusterIds whose rows are dropped entirely
    "remove_cls": [

    ],
    # uris of individual articles to drop
    "remove_art": [

    ]
}

In [11]:
def do_manual_corrections(df, manual_corr):
    """Apply manual cluster corrections and return the corrected frame.

    The corrections are applied in this order:

    1. ``cls_to_cls`` - ``{source clusterId: target clusterId}``; rows of the
       source cluster are re-labelled with the target id. Entries are applied
       one after another in dictionary order.
    2. ``art_to_cls`` - ``{article uri: clusterId}``; the matching article is
       moved to the given cluster.
    3. ``remove_cls`` - clusterIds whose rows are dropped.
    4. ``remove_art`` - article uris whose rows are dropped.

    Because removals run last, they also affect rows re-labelled in steps 1-2.

    Args:
        df: DataFrame with ``clusterId`` and ``uri`` columns. It is not
            modified; the corrections are applied to a copy.
        manual_corr: Dict with any subset of the four keys above; missing keys
            are treated as empty.

    Returns:
        A new DataFrame with the corrections applied.
    """
    # Work on a copy: the .loc assignments below would otherwise write into the
    # caller's frame (or into a slice of it, triggering SettingWithCopyWarning).
    df = df.copy()

    for cls_id_from, cls_id_to in manual_corr.get("cls_to_cls", {}).items():
        df.loc[df["clusterId"] == cls_id_from, "clusterId"] = cls_id_to

    for art_uri, cls_id in manual_corr.get("art_to_cls", {}).items():
        df.loc[df["uri"] == art_uri, "clusterId"] = cls_id

    # Drop all listed clusters / articles with one vectorised mask each.
    removed_clusters = list(manual_corr.get("remove_cls", []))
    if removed_clusters:
        df = df[~df["clusterId"].isin(removed_clusters)]

    removed_articles = list(manual_corr.get("remove_art", []))
    if removed_articles:
        df = df[~df["uri"].isin(removed_articles)]

    return df

In [12]:
def prepare_sample_df(df, reviewed_cls, manual_corr):
    """Select the reviewed clusters, apply manual corrections and renumber clusters.

    Args:
        df: DataFrame with a ``clusterId`` column (and whatever columns
            ``do_manual_corrections`` needs). It is not modified.
        reviewed_cls: Collection of clusterIds to keep.
        manual_corr: Corrections dict passed on to ``do_manual_corrections``.

    Returns:
        A new DataFrame containing only rows of the reviewed clusters, with the
        corrections applied and ``clusterId`` replaced by consecutive labels
        ``wn-1``, ``wn-2``, ... in order of first appearance.
    """
    # Explicit copy so later writes never target a slice of the caller's frame
    # (the original assigned a column on a filtered slice, which raises
    # SettingWithCopyWarning and makes the write's target ambiguous).
    sample_df = df[df["clusterId"].isin(reviewed_cls)].copy()
    sample_df = do_manual_corrections(sample_df, manual_corr)

    cluster_mapping = {
        key: f"wn-{idx+1}" for idx, key in enumerate(sample_df["clusterId"].unique())
    }
    # assign() returns a new frame instead of writing into sample_df in place.
    return sample_df.assign(clusterId=sample_df["clusterId"].map(cluster_mapping))

In [13]:
def save_df_to_csv(df, filename):
    """Renumber the ``id`` column and write ``df`` as CSV into ``OUTPUT_DIR_NAME``.

    Note that ``df`` itself is modified: its ``id`` column is (re)set to
    ``0 .. len(df) - 1`` before writing.

    Args:
        df: DataFrame to save.
        filename: File name appended to ``OUTPUT_DIR_NAME``.
    """
    df["id"] = list(range(len(df)))
    target_path = OUTPUT_DIR_NAME + filename
    df.to_csv(target_path, encoding="utf-8", index=False)

In [14]:
def save_df_to_json(df, filename):
    """Renumber the ``id`` column and write ``df`` as JSON Lines into ``OUTPUT_DIR_NAME``.

    Each row becomes one JSON object on its own line. Note that ``df`` itself
    is modified: its ``id`` column is (re)set to ``0 .. len(df) - 1`` before
    writing.

    Args:
        df: DataFrame to save.
        filename: File name appended to ``OUTPUT_DIR_NAME``.
    """
    df["id"] = list(range(len(df)))
    target_path = OUTPUT_DIR_NAME + filename
    df.to_json(target_path, orient="records", lines=True)

# Dataset Evaluation

In [15]:
# Dataset file to evaluate: it is read from INPUT_DIR_NAME by load_events and the
# corrected result is later saved under the same name into OUTPUT_DIR_NAME.
FILE_NAME = "og2021.csv" # TODO: change this

In [16]:
# Load the article table and build one event per cluster (events sorted by min_time).
df, events = load_events(FILE_NAME)

Events:   0%|          | 0/1456 [00:00<?, ?it/s]

## Manual Annotation

In [17]:
# List every event with its articles in the compact one-line format for manual review.
print_events(events)

-------------------------------------------------------------------
1     :  wn-1059    None        : 6628239427  eng  2021-07-01 08:01:00 - Iran's Basketball Team Held At Airport Over 'Unauthorized Food Items'
2     :  wn-1059    None        : 6630896381  eng  2021-07-03 00:43:00 - Iran Olympic basketball team detained at Tehran airport over 'unauthorised food items'
-------------------------------------------------------------------
3     :  wn-1060    eng-6899867 : 6628362266  eng  2021-07-01 10:51:00 - Olympics latest: Samoa participation in doubt over COVID, reports say
4     :  wn-1060    eng-6899867 : 6628527643  eng  2021-07-01 14:00:00 - Olympics latest: Samoa weightlifters to withdraw over COVID
-------------------------------------------------------------------
5     :  wn-1061    eng-6906275 : 6628364007  eng  2021-07-01 10:54:00 - What Do the Olympics Have Against Women?
6     :  wn-1061    None        : 6632726558  spa  2021-07-04 17:30:00 - Las atletas en Japón enfrentan

In [18]:
# Inspect the full record(s) of a single article, looked up by its uri.
df[df["uri"] == "6678899976"].to_dict("records")

[{'title': 'Barada: Dosegli smo bistveno več, kot bi glede na vse lahko pričakovali',
  'body': 'Ti dosežki, posebej v manj izpostavljenih športih, odmevajo tudi v Tokiu. Podpredsednik Olimpijskega komiteja Slovenije Tomaž Barada ocenjuje, da dosežki presegajo realne zmožnosti. "Glede na to koliko nas je v primerjavi z velesilami, posebej pa na vložek, je to bistveno več, kot lahko pričakujemo. Tudi država se mora vprašati kaj lahko dodamo, da obdržimo in morda še razvijemo ta naš šport," še pravi Barada. Ta je državne organe pohvalil za hitro ukrepanje v številnih primerih, tudi pri pridobivanju državljanstva košarkarja Mika Tobeyja, sam pa si želi še nekaj večjo finančno podporo, ki bi olajšala trdo delo atletom in trenerjem. Tudi država je v težkem položaju, saj je širina športa na južni strani Alp izjemna: "Tudi v velikih državah se velikokrat sprašujejo, kako je možno, da imamo toliko uspešnih športov. Raznolikost nas dela neznanko. Dejstvo pa je, da se nikoli nismo odločili za zi

In [19]:
# ended at row: 9632

In [20]:
# Clusters that go into the final sample; here every cluster id present in df.
reviewed_cls = df["clusterId"].unique().tolist()

# Scratch list for the annotator; not read anywhere in the visible notebook.
potential_clean_cls = []

# Scratch list for the annotator; not read anywhere in the visible notebook.
potential_irelevant_cls = []

In [21]:
# copy this for every dataset separately
# Manual corrections for this dataset, consumed by do_manual_corrections, which
# applies the sections in this order: cls_to_cls, art_to_cls, remove_cls, remove_art.
manual_corr = {
    # {source clusterId: target clusterId} - merge the source cluster into the target
    "cls_to_cls": {
        "wn-513" : "wn-1060",
        "wn-1"   : "wn-204",
        "wn-1102": "wn-1095",
        "wn-1104": "wn-1091",
        "wn-1109": "wn-1098",
        "wn-1101": "wn-1116",
        "wn-1115": "wn-1116",
        "wn-1119": "wn-1122",
        "wn-1123": "wn-1122",
        "wn-1126": "wn-1122",
        "wn-528" : "wn-530",
        "wn-1141": "wn-1098",
        "wn-441" : "wn-1157",
        "wn-545" : "wn-1157",
        "wn-1163": "wn-1157",
        "wn-1166": "wn-1157",
        "wn-1167": "wn-1063",
        "wn-268" : "wn-16",
        "wn-314" : "wn-16",
        "wn-1179": "wn-1177",
        "wn-19"  : "wn-18",
        "wn-1182": "wn-1180",
        "wn-1183": "wn-1180",
        "wn-215" : "wn-204",
        # NOTE(review): wn-578 is listed in remove_cls below and removals run after
        # merging, so the articles of wn-21 are removed as well - confirm this is intended.
        "wn-21"  : "wn-578",
        "wn-589" : "wn-587",
        "wn-595" : "wn-568",
        "wn-572" : "wn-217",
        "wn-209" : "wn-217",
        "wn-1210": "wn-568",
        "wn-25"  : "wn-475",
        "wn-1228": "wn-1227",
        "wn-1233": "wn-475",
        "wn-621" : "wn-1239",
        "wn-1241": "wn-1240",
        "wn-633" : "wn-1248",
        "wn-640" : "wn-639",
        # NOTE(review): self-mapping, has no effect - was a different target intended?
        "wn-641" : "wn-641",
        "wn-40"  : "wn-39",
        "wn-1258": "wn-1262",
        "wn-214" : "wn-1264",
        "wn-237" : "wn-1266",
        "wn-240" : "wn-1266",
        "wn-1270": "wn-1266",
        "wn-668" : "wn-1266",
        "wn-669" : "wn-1266",
        "wn-49"  : "wn-670",
        "wn-487" : "wn-1266",
        "wn-1272": "wn-1248",
        "wn-250" : "wn-249",
        "wn-52"  : "wn-1266",
        "wn-674" : "wn-1266",
        "wn-55"  : "wn-1266",
        "wn-490" : "wn-249",
        "wn-1278": "wn-1266",
        "wn-491" : "wn-1266",
        "wn-76"  : "wn-1266",
        "wn-695" : "wn-1283",
        "wn-1288": "wn-688",
        "wn-681" : "wn-688",
        "wn-82"  : "wn-711",
        "wn-745" : "wn-742",
        "wn-270" : "wn-767",
        "wn-271" : "wn-259",
        "wn-277" : "wn-1023",
        "wn-88"  : "wn-1024",
        "wn-772" : "wn-287",
        "wn-791" : "wn-63",

        "wn-301" : "wn-306",
        "wn-308" : "wn-802",
        "wn-826" : "wn-829",

        "wn-848" : "wn-1026",

        "wn-499" : "wn-819",
        "wn-117" : "wn-63",
        "wn-325" : "wn-249",
        "wn-328" : "wn-851",
        "wn-124" : "wn-858",
        "wn-860" : "wn-858",
        "wn-1033": "wn-858",
        "wn-864" : "wn-858",
        "wn-125" : "wn-858",
        "wn-127" : "wn-858",
        "wn-867" : "wn-858",
        "wn-869" : "wn-824",
        "wn-147" : "wn-868",
        "wn-339" : "wn-868",
        "wn-345" : "wn-868",

        "wn-1340": "wn-1339",
        "wn-1341": "wn-1339",
        "wn-1349": "wn-1346",

        "wn-349" : "wn-352",
        "wn-358" : "wn-352",
        "wn-359" : "wn-352",
        "wn-363" : "wn-364",
        "wn-367" : "wn-370",
        "wn-385" : "wn-368",
        "wn-372" : "wn-368",

        "wn-1422": "wn-1420",
        "wn-1432": "wn-1431",
        "wn-405" : "wn-404",
        # fixed typo: the target was "wm-1441", which created a separate bogus
        # cluster instead of merging into wn-1441 (like wn-1445 below)
        "wn-1443": "wn-1441",
        "wn-1445": "wn-1441",

        "wn-1451": "wn-1452",
        "wn-1056": "wn-428",
        "wn-458" : "wn-428",
        "wn-1455": "wn-428",
        "wn-1456": "wn-428",
        "wn-434" : "wn-428",
        "wn-461" : "wn-428",
        "wn-459" : "wn-431",

    },
    # {article uri: clusterId} - move a single article to another cluster.
    # Ids of the form "wn-XXX<n>" are hand-made labels for new clusters; all
    # cluster ids are renumbered later by prepare_sample_df.
    "art_to_cls": {
        "6644630144": "wn-1178",
        "6648723965": "wn-XXX1",
        "6650206349": "wn-XXX1",
        "6650686713": "wn-XXX1",
        "6651284082": "wn-475",
        "6651305240": "wn-475",
        "6651873796": "wn-475",
        "6661353207": "wn-XXX2",
        "6660463617": "wn-734",
        "6660684160": "wn-734",
        "6660699781": "wn-734",
        "6660705467": "wn-734",
        "6661033544": "wn-734",
        "6661044857": "wn-734",
        "6661046134": "wn-734",
        "6661210401": "wn-734",
        "6661291413": "wn-734",
        "6661300255": "wn-734",
        "6661308772": "wn-734",
        "6661678018": "wn-734",
        "6661794220": "wn-734",
        "6661816290": "wn-734",
        "6660435146": "wn-731",
        # (a second, identical "6660481602" entry was listed here; duplicate key removed)
        "6660481602": "wn-731",
        "6660459128": "wn-731",
        "6660510197": "wn-731",
        "6660547145": "wn-XXX3",
        "6660781643": "wn-XXX3",
        "6660832079": "wn-XXX3",
        "6664583590": "wn-XXX4",
        "6665155682": "wn-XXX4",

        "6660897083": "wn-742",
        "6661050555": "wn-742",
        "6661050539": "wn-742",
        "6661061749": "wn-742",
        # NOTE(review): this uri is also listed in remove_art, and removals run
        # last, so the article ends up removed - confirm which one is intended.
        "6661502769": "wn-750",

        # NOTE(review): this key was listed three times with the same uri, which
        # collapses to a single entry; two other article uris were probably
        # intended for wn-XXX5 - confirm against the data.
        "6660955098": "wn-XXX5",

        "6661051273": "wn-802",
        "6662261123": "wn-802",
        "6662281244": "wn-802",
        "6662369659": "wn-802",
        "6662376422": "wn-802",
        "6662448491": "wn-802",
        "6663978591": "wn-802",
        "6664695980": "wn-802",
        "6663268092": "wn-802",
        "6663415277": "wn-802",

        "6661192849": "wn-269",
        "6661293672": "wn-269",
        "6661300787": "wn-269",

        "6661163551": "wn-287",

        # Rayssa Leal
        "6661767931": "wn-306",
        "6662244597": "wn-306",
        "6662258597": "wn-306",
        "6662287005": "wn-306",
        "6662296431": "wn-306",
        "6662301490": "wn-306",
        "6662468791": "wn-306",
        "6662476519": "wn-306",
        "6662530775": "wn-306",
        "6662778668": "wn-306",

        # Nishiya
        "6662864431": "wn-305",
        "6662616058": "wn-305",
        "6663100693": "wn-305",
        "6662324287": "wn-305",
        "6662402645": "wn-305",
        "6662418193": "wn-305",
        "6662438255": "wn-305",
        "6662533411": "wn-305",
        "6662557660": "wn-305",

        "6664237400": "wn-305",

        "6680016038": "wn-431",
        "6680242045": "wn-431",
        "6680256800": "wn-431",
        "6680257569": "wn-431",

    },
    # clusterIds dropped entirely (comment = title of an article in the cluster)
    "remove_cls": [
        "wn-1062", # full competition schedule: "Tokio 2020: Conoce el calendario completo de competencias de los Juegos Olímpicos, disciplina por disciplina"
        "wn-1081", # How to Watch the Tokyo 2021 Olympics
        "wn-578",  # The delayed Olympic Games FINALLY get underway in less than a week

    ],
    # uris of individual articles to drop
    "remove_art": [
        "6654645749",
        "6662074692",
        "6661502769",
        "6663134088",
        "6661959618",
        "6664362646",
        "6664380015",
        "6664287622",
        "6664418096",

    ]
}

## True Labels (Manual) Preparation

In [22]:
# Build the manually corrected ("true label") sample: keep the reviewed clusters,
# apply manual_corr and renumber the cluster ids.
true_df = prepare_sample_df(df, reviewed_cls, manual_corr)

In [23]:
# Keep only the exported columns, in their final order, then preview the frame.
true_df = true_df[
    [
        "id",
        "title",
        "body",
        "lang",
        "source",
        "dateTime",
        "url",
        "uri",
        "eventUri",
        "concepts",
        "clusterId",
    ]
]
true_df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
0,0,Iran's Basketball Team Held At Airport Over 'U...,"Incident comes weeks ahead of Tokyo Olympics, ...",eng,Haberler.com,2021-07-01 06:01:00,https://en.haberler.com/iran-s-basketball-team...,6628239427,,"['olympic_games', 'japan', 'basketball']",wn-1
1,1,Olympics latest: Samoa participation in doubt ...,TOKYO -- The July 23 opening ceremony of the d...,eng,Nikkei Asia,2021-07-01 08:51:00,https://asia.nikkei.com/Spotlight/Tokyo-2020-O...,6628362266,eng-6899867,"['olympic_games', 'japan', 'basketball']",wn-2
2,2,What Do the Olympics Have Against Women?,"Yes, I am talking about how Laurel Hubbard, a ...",eng,Townhall,2021-07-01 08:54:00,https://townhall.com/tipsheet/rebeccadowns/202...,6628364007,eng-6906275,"['olympic_games', 'japan', 'basketball']",wn-3
4,4,Breastfeeding Mother Granted Permission To Bri...,Breastfeeding mothers competing at the Olympic...,eng,www.sportbible.com,2021-07-01 10:10:00,https://www.sportbible.com/australia/news-brea...,6628423216,eng-6906275,"['olympic_games', 'japan', 'basketball']",wn-4
5,5,Sports News | Breastfeeding Olympians Allowed ...,"Canadian basketball player Kim Gaucher, who is...",eng,LatestLY,2021-07-01 10:20:00,https://www.latestly.com/agency-news/sports-ne...,6628431262,eng-6906275,"['olympic_games', 'japan', 'basketball']",wn-4
...,...,...,...,...,...,...,...,...,...,...,...
10960,10960,Juegos Olímpicos ganaron aceptación entre much...,"Antes, muchos japoneses mostraron su reticenci...",spa,www.diariolibre.com,2021-08-13 18:36:00,https://www.diariolibre.com/deportes/olimpismo...,6686899243,spa-2454564,"['olympic_games', 'japan', 'judo']",wn-1347
10961,10961,"Japón, al final, se dejó 'seducir' por los Jue...","Por ahora, hay en muchos japoneses un sentimie...",spa,El Financiero,2021-08-13 21:09:00,https://www.elfinanciero.com.mx/tokio-2020/202...,6687117130,spa-2454564,"['olympic_games', 'japan', 'judo']",wn-1347
10962,10962,Películas esenciales por si te clavaste con el...,La brasileña Rayssa Leal se robó el corazón de...,spa,Chilango,2021-08-13 23:08:00,https://www.chilango.com/cine-y-tv/peliculas-e...,6687247369,,"['olympic_games', 'japan', 'skateboarding']",wn-1350
10963,10963,El alcalde japonés que le tocó ofrecer disculp...,"Por una escena transmitida por televisión, que...",spa,Récord,2021-08-14 02:47:00,https://record.acento.com.do/polideportivo/el-...,6687444123,spa-2455119,"['olympic_games', 'japan', 'judo']",wn-1349


## Save Evaluation Results

In [24]:
# Write the corrected sample to OUTPUT_DIR_NAME under the same file name as the input.
save_df_to_csv(true_df, FILE_NAME)