In [1]:
# enable automatic reloading of the notebook
%load_ext autoreload
%autoreload 2

In [2]:
import ast
import pandas as pd

from src.utils.NewsEventBase import NewsEventBase
from src.utils.NewsArticle import NewsArticle

In [3]:
INPUT_DIR_NAME = "../data/processed/multi_2days/"
OUTPUT_DIR_NAME = "../data/final/"

In [4]:
def literal_converter(val):
    try:
        return ast.literal_eval(val)
    except Exception:
        return val

In [5]:
def create_events(df):
    clusterIds = df["clusterId"].unique()

    events = [
        NewsEventBase(articles = [NewsArticle(a) for a in df[df["clusterId"] == clusterId].to_dict("records")])
        for clusterId in clusterIds
    ]
    events = sorted(events, key=lambda e: e.min_time)
    return events

In [6]:
def load_events(input_file, ignore_uris: list = None):
    df = pd.read_csv(
        INPUT_DIR_NAME + input_file,
        names=[
            "id",
            "title",
            "body",
            "lang",
            "source",
            "dateTime",
            "url",
            "uri",
            "eventUri",
            "concepts",
            "clusterId",
            "namedEntities",
            "wikiConcepts",
        ],
        dtype={
            "id": "Int64",
            "title": "str",
            "body": "str",
            "lang": "str",
            "source": "str",
            "dateTime": "str",
            "url": "str",
            "uri": "str",
            "eventUri": "str",
            "concepts": "string",
            "clusterId": "str",
            "namedEntities": "str",
            "wikiConcepts": "str",
        },
        parse_dates=["dateTime"],
        on_bad_lines="warn",
        engine="python",
        skiprows=1,
    )
    # dataframe cleanup
    df = df.drop(columns=["wikiConcepts", "namedEntities"])
    df = df[df["title"].notna() & df["title"].notnull()]
    df = df.where(df.notnull() & df.notna(), None)
    # dataframe sorting and init
    df = df.sort_values(by="dateTime")

    if ignore_uris and len(ignore_uris) > 0:
        df = df[~df["uri"].isin(ignore_uris)]

    df["id"] = [i for i in range(len(df))]

    events = create_events(df)

    return df, events

In [7]:
def print_events_min(event, article):
    print(f"{event.cluster_id:<9}  {str(article.event_id):<12}: {article.uri}  {article.lang}  {article.get_time()} - {article.title}")

In [8]:
def print_events_max(event, article):
    print(f"URI:     {article.uri}")
    print(f"LANG:    {article.lang}")
    print(f"WN_ID:   {event.cluster_id}")
    print(f"ER_ID:   {str(article.event_id)}")
    print(f"TIME:    {article.get_time()}")
    print(f"TITLE:   {article.title}")
    print(f"CONTENT: {article.body}\n")

In [9]:
def print_events(events, func=print_events_min, min_articles=1, max_articles=None):
    n_articles = 1
    for event in events:
        if min_articles > len(event.articles) and (max_articles == None or len(event.articles) >= max_articles):
            continue

        print('-------------------------------------------------------------------')
        for article in event.articles:
            print(f"{n_articles:<6}", end=":  ")
            func(event, article)
            n_articles += 1

In [10]:
# copy this for every dataset separately
manual_corr = {
    "cls_to_cls": {

    },
    "art_to_cls": {

    },
    "remove_cls": [

    ],
    "remove_art": [

    ]
}

In [11]:
def do_manual_corrections(df, manual_corr):
    if "cls_to_cls" in manual_corr:
        for cls_id_from, cls_id_to in manual_corr["cls_to_cls"].items():
            df.loc[df["clusterId"] == cls_id_from, "clusterId"] = cls_id_to
    if "art_to_cls" in manual_corr:
        for art_uri, cls_id in manual_corr["art_to_cls"].items():
            df.loc[df["uri"] == art_uri, "clusterId"] = cls_id
    if "remove_cls" in manual_corr:
        for cls_id in manual_corr["remove_cls"]:
            df = df[df["clusterId"] != cls_id]
    if "remove_art" in manual_corr:
        for art_uri in manual_corr["remove_art"]:
            df = df[df["uri"] != art_uri]

    return df

In [12]:
def prepare_predicts(true_df, pred_df, true_cls, pred_cls):
    true_cls_ids = { p["uri"]: p[true_cls] for p in true_df.to_dict("records") }
    pred_cls_ids = { p["uri"]: p[pred_cls] for p in pred_df.to_dict("records") }
    return [{ "true_id": true_cls_ids[key], "pred_id": pred_cls_ids[key] } for key in true_cls_ids.keys()]

In [13]:
def prepare_sample_df(df, selected_cls, manual_corr):
    sample_df = df[df["clusterId"].isin(selected_cls)]
    sample_df = do_manual_corrections(sample_df, manual_corr)
    cluster_mapping = {
        key: f"wn-{idx+1}" for idx, key in enumerate(sample_df["clusterId"].unique())
    }
    sample_df["clusterId"] = sample_df["clusterId"].map(cluster_mapping)
    return sample_df

In [14]:
def measure_performance(articles):
    """Measures the performance of the clustering algorithm"""

    # get the following statistics
    # tp - number of correctly clustered-together article pairs
    # fp - number of incorrectly clustered-together article pairs
    # fn - number of incorrectly not-clustered-together article pairs
    # tn - number of correctly not-clustered-together article pairs
    tp, fp, fn, tn = 0, 0, 0, 0
    for i, ai in enumerate(articles):
        for aj in articles[i + 1 :]:
            if ai["true_id"] == aj["true_id"] and ai["pred_id"] == aj["pred_id"]:
                tp += 1
            elif ai["true_id"] != aj["true_id"] and ai["pred_id"] == aj["pred_id"]:
                fp += 1
            elif ai["true_id"] == aj["true_id"] and ai["pred_id"] != aj["pred_id"]:
                fn += 1
            else:
                tn += 1

    # get the precision, recall and F1 scores
    P = tp / (tp + fp)
    R = tp / (tp + fn)
    F1 = 2 * (P * R) / (P + R)
    A = (tp + tn) / (tp + fp + fn + tn)

    # return the metrics
    return {"F1": F1, "P": P, "R": R, "A": A}

In [15]:
def save_df_to_csv(df, filename):
    tmp_df = df.rename(columns={"dateTime": "date_time", "clusterId": "cluster_id"})
    tmp_df["id"] = tmp_df["uri"]
    tmp_df= tmp_df.drop(["uri", "eventUri"], axis=1)
    tmp_df.to_csv(OUTPUT_DIR_NAME + filename, encoding="utf-8", index=False)

In [16]:
processed_uris = set()

# Olympic Games - Japan - Basketball

In [17]:
FILE_NAME = "olympic_games__japan__basketball.csv"

In [18]:
df, events = load_events(FILE_NAME, ignore_uris=processed_uris)

## Manual Annotation

In [19]:
print_events(events)

-------------------------------------------------------------------
1     :  wn-4       None        : 6628239427  eng  2021-07-01 06:01:00 - Iran's Basketball Team Held At Airport Over 'Unauthorized Food Items'
2     :  wn-4       None        : 6630896381  eng  2021-07-02 22:43:00 - Iran Olympic basketball team detained at Tehran airport over 'unauthorised food items'
-------------------------------------------------------------------
3     :  wn-3       eng-6899867 : 6628362266  eng  2021-07-01 08:51:00 - Olympics latest: Samoa participation in doubt over COVID, reports say
4     :  wn-3       eng-6899867 : 6628527643  eng  2021-07-01 12:00:00 - Olympics latest: Samoa weightlifters to withdraw over COVID
-------------------------------------------------------------------
5     :  wn-2       eng-6906275 : 6628364007  eng  2021-07-01 08:54:00 - What Do the Olympics Have Against Women?
-------------------------------------------------------------------
6     :  wn-1       None        : 6

In [20]:
df[df["uri"] == "6678889104"].to_dict("records")

[{'id': 8399,
  'title': 'Barada: Dosegli smo bistveno več, kot bi glede na vse lahko pričakovali (intervju)',
  'body': 'Tokio, 07. avgusta (STA) - Slovenija je vnovič hit olimpijskih iger. Še enkrat več so slovenski športniki potrdili neverjetno širino v številu disciplin, v katerih so nevarni. Ti dosežki, posebej v manj izpostavljenih športih, odmevajo tudi v Tokiu. Podpredsednik Olimpijskega komiteja ...',
  'lang': 'slv',
  'source': 'STA d.o.o.',
  'dateTime': Timestamp('2021-08-07 10:30:00'),
  'url': 'http://www.sta.si/http.php?id=2931082',
  'uri': '6678889104',
  'eventUri': None,
  'concepts': "['olympic_games', 'japan', 'basketball']",
  'clusterId': 'wn-3456'}]

In [21]:
# ended at row: completedd
selected_cls = [
    "wn-1",
    "wn-2",
    "wn-3",
    "wn-4",
    "wn-5",
    "wn-6",
    "wn-7",
    "wn-8",
    "wn-9",
    "wn-10",
    "wn-11",
    "wn-12",
    "wn-13",
    "wn-14",
    "wn-15",
    "wn-16",
    "wn-17",
    "wn-18",
    "wn-19",
    "wn-20",
    "wn-21",
    "wn-22",
    "wn-23",
    "wn-24",
    "wn-25",
    "wn-26",
    "wn-27",
    "wn-28",
    "wn-29",
    "wn-30",
    "wn-31",
    "wn-32",
    "wn-33",
    "wn-34",
    "wn-35",
    "wn-36",
    "wn-37",
    "wn-38",
    "wn-39",
    "wn-40",
    "wn-41",
    "wn-42",
    "wn-43",
    "wn-44",
    "wn-45",
    "wn-46",
    "wn-47",
    "wn-48",
    "wn-49",
    "wn-50",
    "wn-51",
    "wn-52",
    "wn-53",
    "wn-54",
    "wn-55",
    "wn-56",
    "wn-57",
    "wn-58",
    "wn-59",
    "wn-60",
    "wn-61",
    "wn-62",
    "wn-63",
    "wn-64",
    "wn-65",
    "wn-66",
    "wn-67",
    "wn-68",
    "wn-69",
    "wn-70",
    "wn-71",
    "wn-72",
    "wn-73",
    "wn-74",
    "wn-75",
    "wn-76",
    "wn-77",
    "wn-78",
    "wn-79",
    "wn-80",
    "wn-81",
    "wn-82",
    "wn-83",
    "wn-84",
    "wn-85",
    "wn-86",
    "wn-87",
    "wn-88",
    "wn-89",
    "wn-90",
    "wn-91",
    "wn-92",
    "wn-93",
    "wn-94",
    "wn-95",
    "wn-96",
    "wn-97",
    "wn-98",
    "wn-99",
    "wn-100",
    "wn-101",
    "wn-102",
    "wn-103",
    "wn-104",
    "wn-105",
    "wn-106",
    "wn-107",
    "wn-108",
    "wn-109",
    "wn-110",
    "wn-111",
    "wn-112",
    "wn-113",
    "wn-114",
    "wn-115",
    "wn-116",
    "wn-117",
    "wn-118",
    "wn-119",
    "wn-120",
    "wn-121",
    "wn-122",
    "wn-123",
    "wn-124",
    "wn-125",
    "wn-126",
    "wn-127",
    "wn-128",
    "wn-129",
    "wn-130",
    "wn-131",
    "wn-132",
    "wn-133",
    "wn-134",
    "wn-135",
    "wn-136",
    "wn-137",
    "wn-138",
    "wn-139",
    "wn-140",
    "wn-141",
    "wn-142",
    "wn-143",
    "wn-144",
    "wn-145",
    "wn-146",
    "wn-147",
    "wn-148",
    "wn-149",
    "wn-150",
    "wn-151",
    "wn-152",
    "wn-153",
    "wn-154",
    "wn-155",
    "wn-156",
    "wn-157",
    "wn-158",
    "wn-159",
    "wn-160",
    "wn-161",
    "wn-162",
    "wn-163",
    "wn-164",
    "wn-165",
    "wn-166",
    "wn-167",
    "wn-168",
    "wn-169",
    "wn-170",
    "wn-171",
    "wn-172",
    "wn-173",
    "wn-174",
    "wn-175",
    "wn-176",
    "wn-177",
    "wn-178",
    "wn-179",
    "wn-180",
    "wn-181",
    "wn-182",
    "wn-183",
    "wn-184",
    "wn-185",
    "wn-186",
    "wn-187",
    "wn-188",
    "wn-189",
    "wn-190",
    "wn-191",
    "wn-192",
    "wn-193",
    "wn-194",
    "wn-195",
    "wn-196",
    "wn-197",
    "wn-198",
    "wn-199",
    "wn-200",
    "wn-201",
    "wn-202",
    "wn-203",
    "wn-204",
    "wn-205",
    "wn-206",
    "wn-207",
    "wn-208",
    "wn-209",
    "wn-210",
    "wn-211",
    "wn-212",
    "wn-213",
    "wn-214",
    "wn-215",
    "wn-216",
    "wn-217",
    "wn-218",
    "wn-219",
    "wn-220",
    "wn-221",
    "wn-222",
    "wn-223",
    "wn-224",
    "wn-225",
    "wn-226",
    "wn-227",
    "wn-228",
    "wn-229",
    "wn-230",
    "wn-231",
    "wn-232",
    "wn-233",
    "wn-234",
    "wn-235",
    "wn-236",
    "wn-237",
    "wn-238",
    "wn-239",
    "wn-240",
    "wn-241",
    "wn-242",
    "wn-243",
    "wn-244",
    "wn-245",
    "wn-246",
    "wn-247",
    "wn-248",
    "wn-249",
    "wn-250",
    "wn-251",
    "wn-252",
    "wn-253",
    "wn-254",
    "wn-255",
    "wn-256",
    "wn-257",
    "wn-258",
    "wn-259",
    "wn-260",
    "wn-262",
    "wn-270",
    "wn-269",
    "wn-273",
    "wn-279",
    "wn-276",
    "wn-281",
    "wn-294",
    "wn-300",
    "wn-299",
    "wn-315",
    "wn-314",
    "wn-319",
    "wn-330",
    "wn-344",
    "wn-362",
    "wn-361",
    "wn-368",
    "wn-384",
    "wn-391",
    "wn-394",
    "wn-398",
    "wn-409",
    "wn-436",
    "wn-445",
    "wn-444",
    "wn-447",
    "wn-449",
    "wn-459",
    "wn-471",
    "wn-476",
    "wn-483",
    "wn-490",
    "wn-491",
    "wn-498",
    "wn-503",
    "wn-510",
    "wn-513",
    "wn-516",
    "wn-517",
    "wn-521",
    "wn-522",
    "wn-523",
    "wn-524",
    "wn-538",
    "wn-537",
    "wn-542",
    "wn-548",
    "wn-546",
    "wn-564",
    "wn-570",
    "wn-573",
    "wn-578",
    "wn-583",
    "wn-584",
    "wn-595",
    "wn-592",
    "wn-596",
    "wn-594",
    "wn-597",
    "wn-599",
    "wn-600",
    "wn-623",
    "wn-629",
    "wn-626",
    "wn-634",
    "wn-630",
    "wn-635",
    "wn-636",
    "wn-638",
    "wn-637",
    "wn-640",
    "wn-643",
    "wn-649",
    "wn-659",
    "wn-669",
    "wn-676",
    "wn-677",
    "wn-685",
    "wn-687",
    "wn-697",
    "wn-701",
    "wn-726",
    "wn-738",
    "wn-745",
    "wn-743",
    "wn-748",
    "wn-751",
    "wn-758",
    "wn-775",
    "wn-776",
    "wn-778",
    "wn-780",
    "wn-782",
    "wn-781",
    "wn-788",
    "wn-795",
    "wn-797",
    "wn-800",
    "wn-815",
    "wn-831",
    "wn-833",
    "wn-847",
    "wn-843",
    "wn-853",
    "wn-856",
    "wn-884",
    "wn-892",
    "wn-914",
    "wn-921",
    "wn-937",
    "wn-958",
    "wn-964",
    "wn-976",
    "wn-981",
    "wn-891",
    "wn-994",
    "wn-1001",
    "wn-1004",
    "wn-1006",
    "wn-1014",
    "wn-1018",
    "wn-1033",
    "wn-1035",
    "wn-1060",
    "wn-1083",
    "wn-1099",
    "wn-1101",
    "wn-1110",
    "wn-1122",
    "wn-1127",
    "wn-1146",
    "wn-1173",
    "wn-1172",
    "wn-1196",
    "wn-1199",
    "wn-1203",
    "wn-1191",
    "wn-1229",
    "wn-896",
    "wn-1252",
    "wn-1251",
    "wn-1262",
    "wn-1273",
    "wn-1298",
    "wn-1296",
    "wn-1306",
    "wn-1314",
    "wn-1312",
    "wn-1350",
    "wn-1349",
    "wn-1370",
    "wn-1394",
    "wn-1412",
    "wn-1435",
    "wn-1465",
    "wn-1492",
    "wn-1494",
    "wn-1510",
    "wn-1512",
    "wn-1517",
    "wn-1518",
    "wn-1523",
    "wn-1554",
    "wn-1555",
    "wn-1557",
    "wn-1563",
    "wn-1629",
    "wn-1656",
    "wn-1658",
    "wn-1669",
    "wn-1670",
    "wn-1682",
    "wn-1686",
    "wn-1685",
    "wn-1688",
    "wn-2167",
    "wn-1703",
    "wn-1706",
    "wn-1716",
    "wn-1740",
    "wn-1738",
    "wn-1825",
    "wn-1834",
    "wn-1841",
    "wn-1838",
    "wn-1847",
    "wn-1853",
    "wn-1851",
    "wn-1867",
    "wn-1879",
    "wn-1881",
    "wn-1886",
    "wn-1887",
    "wn-1892",
    "wn-1919",
    "wn-2030",
    "wn-2052",
    "wn-2053",
    "wn-2138",
    "wn-2139",
    "wn-2167",
    "wn-2174",
    "wn-2187",
    "wn-2190",
    "wn-2196",
    "wn-2200",
    "wn-2228",
    "wn-2290",
    "wn-2297",
    "wn-2341",
    "wn-2340",
    "wn-2345",
    "wn-2350",
    "wn-2353",
    "wn-2357",
    "wn-2356",
    "wn-2366",
    "wn-2377",
    "wn-2392",
    "wn-2396",
    "wn-2402",
    "wn-2403",
    "wn-2412",
    "wn-2404",
    "wn-2405",
    "wn-2407",
    "wn-2411",
    "wn-2418",
    "wn-2437",
    "wn-2439",
    "wn-2461",
    "wn-2464",
    "wn-2471",
    "wn-2470",
    "wn-2473",
    "wn-2481",
    "wn-2485",
    "wn-2509",
    "wn-2518",
    "wn-2530",
    "wn-2534",
    "wn-2544",
    "wn-2566",
    "wn-2576",
    "wn-2595",
    "wn-2599",
    "wn-2597",
    "wn-2610",
    "wn-2614",
    "wn-2616",
    "wn-2618",
    "wn-2626",
    "wn-2634",
    "wn-2638",
    "wn-2637",
    "wn-2646",
    "wn-2651",
    "wn-2629",
    "wn-2650",
    "wn-2676",
    "wn-2671",
    "wn-2666",
    "wn-2665",
    "wn-2685",
    "wn-2691",
    "wn-2696",
    "wn-2705",
    "wn-2715",
    "wn-2726",
    "wn-2728",
    "wn-2736",
    "wn-2738",
    "wn-2742",
    "wn-2741",
    "wn-2780",
    "wn-2800",
    "wn-2803",
    "wn-2811",
    "wn-2815",
    "wn-2826",
    "wn-2832",
    "wn-2842",
    "wn-2848",
    "wn-2851",
    "wn-2853",
    "wn-2912",
    "wn-2914",
    "wn-2913",
    "wn-2916",
    "wn-2929",
    "wn-2930",
    "wn-2939",
    "wn-2961",
    "wn-2956",
    "wn-2972",
    "wn-2924",
    "wn-2973",
    "wn-2980",
    "wn-3038",
    "wn-3043",
    "wn-3050",
    "wn-3051",
    "wn-3052",
    "wn-3067",
    "wn-3067",
    "wn-3066",
    "wn-3070",
    "wn-3077",
    "wn-3081",
    "wn-3087",
    "wn-3083",
    "wn-3098",
    "wn-3113",
    "wn-3119",
    "wn-3122",
    "wn-3132",
    "wn-3136",
    "wn-3171",
    "wn-3176",
    "wn-3181",
    "wn-3192",
    "wn-3211",
    "wn-3212",
    "wn-3253",
    "wn-3291",
    "wn-3315",
    "wn-3323",
    "wn-3327",
    "wn-3352",
    "wn-3399",
    "wn-3414",
    "wn-3433",
    "wn-3431",
    "wn-3429",
    "wn-3436", # izguba slovenije
    "wn-3435",
    "wn-3434",
    "wn-3441",
    "wn-3450",
    "wn-3464",
    "wn-3508",
    "wn-3516",
    "wn-3518",
    "wn-3522",
    "wn-3521",
    "wn-3532",
    "wn-3539",
    "wn-3551",
    "wn-3565",
    "wn-3579",
    "wn-3602",
    "wn-3604",
    "wn-3630",
    "wn-3628",
    "wn-3627",
    "wn-3691",
    "wn-3733",
]

In [22]:
# copy this for every dataset separately
manual_corr = {
    "cls_to_cls": {
        "wn-3733": "wn-3465",
        "wn-3691": "wn-3465",
        "wn-3627": "wn-3465",
        "wn-3628": "wn-3465",
        "wn-3630": "wn-3465",
        "wn-3604": "wn-3465",
        "wn-3579": "wn-3565",
        "wn-3532": "wn-3431",
        "wn-3464": "wn-3431",
        "wn-3450": "wn-3431",
        "wn-3441": "wn-3431",
        "wn-3434": "wn-3431",
        "wn-3435": "wn-3431",
        "wn-3429": "wn-3431",
        "wn-3176": "wn-3192",
        "wn-3083": "wn-3067",
        "wn-3087": "wn-3084",
        "wn-3066": "wn-3038",
        "wn-3043": "wn-3038",
        "wn-2980": "wn-2916",
        "wn-2924": "wn-2916",
        "wn-2973": "wn-2916",
        "wn-2931": "wn-2930",
        "wn-2914": "wn-2912",
        "wn-2685": "wn-2665",
        "wn-2715": "wn-2650",
        "wn-2666": "wn-2650",
        "wn-10": "wn-5",
        "wn-13": "wn-5",
        "wn-17": "wn-15",
        "wn-23": "wn-22",
        "wn-37": "vn-1",
        "wn-56": "wn-30",
        "wn-84": "wn-5",
        "wn-85": "wn-86",
        "wn-95": "wn-91",
        "wn-96": "wn-2",
        "wn-93": "wn-86",
        "wn-103": "wn-87",
        "wn-106": "wn-87",
        "wn-104": "wn-105",
        "wn-109": "wn-105",
        "wn-108": "wn-87",
        "wn-116": "wn-105",
        "wn-120": "wn-115",
        "wn-374": "wn-118",
        "wn-127": "wn-122",
        "wn-125": "wn-87",
        "wn-137": "wn-87",
        "wn-152": "wn-50",
        "wn-159": "wn-86",
        "wn-163": "wn-91",
        "wn-164": "wn-50",
        "wn-220": "wn-213",
        "wn-226": "wn-228",
        "wn-254": "wn-228",
        "wn-276": "wn-5",
        "wn-281": "wn-228",
        "wn-299": "wn-300",
        "wn-314": "wn-315",
        "wn-394": "wn-391",
        "wn-444": "wn-408",
        "wn-455": "wn-408",
        "wn-491": "wn-228",
        "wn-498": "vn-228",
        "wn-517": "wn-516",
        "wn-522": "wn-521",
        "wn-523": "wn-521",
        "wn-524": "wn-521",
        "wn-542": "wn-521",
        "wn-548": "wn-219",
        "wn-573": "wn-546",
        "wn-578": "wn-546",
        "wn-583": "wn-270",
        "wn-586": "wn-546",
        "wn-594": "wn-595",
        "wn-600": "wn-599",
        "wn-598": "wn-521",
        "wn-619": "wn-521",
        "wn-629": "wn-623",
        "wn-634": "wn-626",
        "wn-637": "wn-623",
        "wn-643": "wn-640",
        "wn-677": "wn-676",
        "wn-743": "wn-738",
        "wn-776": "wn-629",
        "wn-702": "wn-629",
        "wn-815": "vn-788",
        "wn-847": "wn-831",
        "wn-856": "wn-831",
        "wn-884": "wn-843",
        "wn-1101": "wn-1018",
        "wn-1122": "wn-1078",
        "wn-1126": "wn-701",
        "wn-1191": "wn-896",
        "wn-1229": "wn-896",
        "wn-1273": "wn-1033",
        "wn-1291": "wn-1033",
        "wn-1306": "wn-896",
        "wn-1314": "wn-1033",
        "wn-1312": "wn-1033",
        "wn-1327": "wn-1296",
        "wn-1337": "wn-1033",
        "wn-1349": "wn-1146",
        "wn-1518": "wn-1512",
        "wn-1682": "wn-896",
        "wn-1724": "wn-1703",
        "wn-1716": "wn-1703",
        "wn-1740": "wn-1703",
        "wn-1834": "wn-1825",
        "wn-1833": "wn-1825",
        "wn-1841": "wn-1825",
        "wn-1838": "wn-1825",
        "wn-1847": "wn-1825",
        "wn-1853": "wn-1825",
        "wn-1871": "wn-1825",
        "wn-2139": "wn-2030",
        "wn-2174": "wn-2167",
        "wn-2190": "wn-2187",
        "wn-2200": "wn-2187",
        "wn-2297": "wn-2290",
        "wn-2340": "wn-2340",
        "wn-2345": "wn-2340",
        "wn-2353": "wn-2030",
        "wn-2411": "wn-2404",
        "wn-2437": "wn-2030",
        "wn-2437": "wn-XXXX31",
        "wn-2473": "wn-2470",
        "wn-2597": "wn-2595",
        "wn-2616": "wn-2599",
        "wn-2618": "wn-2614",
        "wn-2626": "wn-2614",
        "wn-2637": "wn-XXXX34",
        "wn-2651": "wn-2599",
        "wn-2629": "wn-2599",

    },
    "art_to_cls": {
        "6629128181": "wn-XXXXX1",
        "6631695662": "wn-XXXXX2",
        "6632104619": "wn-XXXXX3",
        "6632181777": "wn-XXXXX3",
        "6632232024": "wn-XXXXX3",
        "6633224354": "wn-XXXXX4",
        "6633248392": "wn-XXXXX4",
        "6632024196": "wn-XXXXX5",
        "6632312917": "wn-XXXXX6",
        "6632312917": "wn-XXXXX7",
        "6632129963": "wn-XXXXX6",
        "6632564411": "wn-83",
        "6632490295": "wn-XXXXX5",
        "6632564411": "wn-XXXXX5",
        "6632998148": "wn-XXXXX5",
        "6632824020": "wn-83",
        "6633504076": "wn-XXXXX9",
        "6633861957": "wn-87",
        "6633878361": "wn-87",
        "6633691802": "wn-XXXX10",
        "6633308395": "wn-XXXX10",
        "6633665977": "wn-XXXX11",
        "6634148954": "wn-XXXX11",
        "6634771085": "wn-XXXX11",
        "6634157754": "wn-114",
        "6638878121": "wn-228",
        "6646275307": "wn-XXXX12",
        "6646275599": "wn-XXXX12",
        "6646289653": "wn-XXXX12",
        "6646422814": "wn-XXXX12",
        "6651228796": "wn-XXXX13",
        "6651301100": "wn-XXXX13",
        "6651324383": "wn-XXXX13",
        "6656626869": "wn-XXXX14",
        "6656631217": "wn-XXXX14",
        "6656691528": "wn-XXXX14",
        "6656698098": "wn-XXXX14",
        "6657277206": "wn-XXXX14",
        "6657689885": "wn-XXXX14",
        "6657724691": "wn-XXXX14",
        "6657754352": "wn-XXXX14",
        "6657865089": "wn-XXXX14",
        "6657972472": "wn-XXXX14",
        "6657972472": "wn-XXXX14",
        "6660420741": "wn-XXXX15",
        "6661679820": "wn-XXXX16",
        "6662446200": "wn-XXXX17",
        "6660604159": "wn-XXXX18",
        "6660606967": "wn-XXXX18",
        "6660622292": "wn-XXXX18",
        "6661104484": "wn-XXXX19",
        "6661109402": "wn-XXXX19",
        "6661115499": "wn-XXXX20",
        "6662125287": "wn-XXXX21",
        "6662309918": "wn-XXXX21",
        "6662317981": "wn-XXXX21",
        "6663367518": "wn-XXXX22",
        "6661522799": "wn-XXXX23",
        "6663877699": "wn-XXXX24",
        "6662921540": "wn-1825",
        "6662822249": "wn-XXXX25",
        "6663063721": "wn-XXXX25",
        "6663176137": "wn-XXXX25",
        "6662845904": "wn-XXXX26",
        "6663149818": "wn-XXXX27",
        "6663199265": "wn-XXXX27",
        "6664082250": "wn-XXXX28",
        "6664295651": "wn-XXXX28",
        "6664385662": "wn-XXXX28",
        "6664473904": "wn-XXXX28",
        "6663869562": "wn-1825",
        "6663021022": "wn-1825",
        "6665387446": "wn-XXXX29",
        "6665655329": "wn-XXXX29",
        "6666071609": "wn-XXXX29",
        "6666361424": "wn-XXXX29",
        "6666838379": "wn-XXXX29",
        "6667212380": "wn-XXXX29",
        "6667261759": "wn-XXXX29",
        "6667265238": "wn-XXXX29",
        "6667568419": "wn-XXXX30",
        "6667629286": "wn-XXXX30",
        "6668621009": "wn-XXXX32",
        "6667340434": "wn-XXXX31",
        "6670202212": "wn-XXXX33",
        "6670205804": "wn-XXXX33",
        "6670420510": "wn-XXXX33",
        "6670454979": "wn-XXXX33",
        "6672403048": "wn-XXXX34",
        "6670232495": "wn-XXXX33",
        "6676113837": "wn-XXXX35",
        "6676281230": "wn-XXXX35",


    },
    "remove_cls": [
        "wn-9",
        "wn-14",
        "wn-26",
        "wn-29",
        "wn-31",
        "wn-36",
        "wn-38",
        "wn-40",
        "wn-41", # Sha'Carri Richardson
        "wn-46", # Sha'Carri Richardson
        "wn-44",
        "wn-49",
        "wn-48",
        "wn-47",
        "wn-51",
        "wn-53",
        "wn-57",
        "wn-58",
        "wn-59",
        "wn-61",
        "wn-60",
        "wn-62",
        "wn-63",
        "wn-66",
        "wn-65",
        "wn-68",
        "wn-71",
        "wn-73",
        "wn-77",
        "wn-78",
        "wn-75",
        "wn-80",
        "wn-81",
        "wn-88",
        "wn-90",
        "wn-89",
        "wn-92",
        "wn-99",
        "wn-100",
        "wn-102",
        "wn-107",
        "wn-110",
        "wn-113",
        "wn-117",
        "wn-121",
        "wn-123",
        "wn-124",
        "wn-131",
        "wn-130",
        "wn-129",
        "wn-142",
        "wn-140",
        "wn-144",
        "wn-145",
        "wn-146",
        "wn-150",
        "wn-149",
        "wn-147",
        "wn-153",
        "wn-154",
        "wn-158",
        "wn-157",
        "wn-156",
        "wn-155",
        "wn-166",
        "wn-165",
        "wn-169",
        "wn-168",
        "wn-172",
        "wn-170",
        "wn-173",
        "wn-176",
        "wn-175",
        "wn-174",
        "wn-177",
        "wn-178",
        "wn-179",
        "wn-181",
        "wn-180",
        "wn-184",
        "wn-183",
        "wn-182",
        "wn-186",
        "wn-188",
        "wn-189",
        "wn-191",
        "wn-190",
        "wn-192",
        "wn-193",
        "wn-195",
        "wn-196",
        "wn-202",
        "wn-200",
        "wn-199",
        "wn-198",
        "wn-204",
        "wn-205",
        "wn-207",
        "wn-206",
        "wn-214",
        "wn-212",
        "wn-210",
        "wn-216",
        "wn-218",
        "wn-221",
        "wn-222",
        "wn-223",
        "wn-225",
        "wn-224",
        "wn-227",
        "wn-238",
        "wn-237",
        "wn-236",
        "wn-235",
        "wn-233",
        "wn-230",
        "wn-229",
        "wn-240",
        "wn-242",
        "wn-241",
        "wn-243",
        "wn-247",
        "wn-246",
        "wn-245",
        "wn-257",
        "wn-256",
        "wn-255",
        "wn-253",
        "wn-252",
        "wn-251",
        "wn-250",
        "wn-249",
        "wn-248",
        "wn-259",
        "wn-261",
        "wn-260",
        "wn-263",

    ],
    "remove_art": [
        "6634153911",
        "6634471361",
        "6634690682",
        "6636867775",
        "6634368130",
        "6637904089",
        "6640085504",
        "6644309925",
        "6644480615",
        "6652471493",
        "6652356530",
        "6654521077",
        "6655289374",
        "6656221822",
        "6657973865",
        "6660149450",
        "6658594726",
        "6658820755",
        "6658770973",
        "6660412432",
        "6662409179",
        "6663988334",
        "6666198893",
        "6665985080",
        "6666303262",
        "6666659924",
        "6666678798",
        "6667265238",
        "6670651270",
        "6671100967",
        "6671119731",
        "6676134571",
        "6676559756",
        "6677307739",
        "6677038610",
        "6677402437",
        "6678836565",
        "6679237939",
    ]
}

## True Labels (Manual) Preparation

In [23]:
true_df = prepare_sample_df(df, selected_cls, manual_corr)

In [24]:
true_df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
6,0,Iran's Basketball Team Held At Airport Over 'U...,"Incident comes weeks ahead of Tokyo Olympics, ...",eng,Haberler.com,2021-07-01 04:01:00,https://en.haberler.com/iran-s-basketball-team...,6628239427,,"['olympic_games', 'japan', 'basketball']",wn-1
4,1,Olympics latest: Samoa participation in doubt ...,TOKYO -- The July 23 opening ceremony of the d...,eng,Nikkei Asia,2021-07-01 06:51:00,https://asia.nikkei.com/Spotlight/Tokyo-2020-O...,6628362266,eng-6899867,"['olympic_games', 'japan', 'basketball']",wn-2
3,2,What Do the Olympics Have Against Women?,"Yes, I am talking about how Laurel Hubbard, a ...",eng,Townhall,2021-07-01 06:54:00,https://townhall.com/tipsheet/rebeccadowns/202...,6628364007,eng-6906275,"['olympic_games', 'japan', 'basketball']",wn-3
0,3,Tokio 2020: Conoce el calendario completo de c...,A pesar de la situación sanitaria y del aplaza...,spa,RPP noticias,2021-07-01 07:15:00,https://rpp.pe/multideportes/juegos-olimpicos/...,6628379911,,"['olympic_games', 'japan', 'basketball']",wn-4
8,4,Breastfeeding Mother Granted Permission To Bri...,Breastfeeding mothers competing at the Olympic...,eng,www.sportbible.com,2021-07-01 08:10:00,https://www.sportbible.com/australia/news-brea...,6628423216,eng-6906275,"['olympic_games', 'japan', 'basketball']",wn-5
...,...,...,...,...,...,...,...,...,...,...,...
8925,9703,Olympics over. USA wins most medals,"(Tokyo, Japan) -- International Olympic Commit...",eng,KABC-AM,2021-08-09 16:14:00,https://www.kabc.com/2021/08/09/olympics-over-...,6681285982,eng-7004958,"['olympic_games', 'japan', 'basketball']",wn-390
9559,9712,'Most challenging' Tokyo Olympics close declar...,Fireworks light up the sky over the Olympic St...,eng,Phnom Penh Post,2021-08-09 16:55:00,https://www.phnompenhpost.com/sport/most-chall...,6681335872,eng-7006449,"['olympic_games', 'japan', 'basketball']",wn-397
9136,9717,JO 2020: Les coups de cœur de nos envoyés spéc...,"Après quinze jours intenses de compétition, le...",fra,RMC SPORT,2021-08-09 17:08:00,https://rmcsport.bfmtv.com/jeux-olympiques/jo-...,6681351181,,"['olympic_games', 'japan', 'basketball']",wn-396
9562,9719,IOC President Declares Tokyo Olympics Closed :...,The Tokyo 2020 Games were declared closed by I...,eng,TV360 Nigeria,2021-08-09 17:41:00,https://www.tv360nigeria.com/ioc-president-dec...,6681393542,eng-7006449,"['olympic_games', 'japan', 'basketball']",wn-397


## Subset Evaluation

### Clustering Evaluation

In [25]:
pred_df = df[df["uri"].isin(true_df["uri"].to_list())]

#### Evaluation using the new methodology

In [26]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "clusterId")
measure_performance(articles)

{'F1': 0.7427464521834336,
 'P': 0.95110903404393,
 'R': 0.6092713706705969,
 'A': 0.9956491217407881}

#### Evaluation using Event Registry

In [27]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "eventUri")
measure_performance(articles)

{'F1': 0.04314721363540583,
 'P': 0.02382066024317174,
 'R': 0.22869841562269713,
 'A': 0.8954314947369021}

### Statistics

In [28]:
true_df["lang"].value_counts()

lang
eng    965
spa    776
por    294
fra    279
slv    216
rus    168
deu    164
ara     39
zho      2
Name: count, dtype: int64

In [29]:
true_df["clusterId"].value_counts()

clusterId
wn-208    126
wn-397    118
wn-380     82
wn-256     72
wn-386     67
         ... 
wn-108      1
wn-109      1
wn-111      1
wn-309      1
wn-200      1
Name: count, Length: 398, dtype: int64

## Save Evaluation Results

In [30]:
save_df_to_csv(true_df, FILE_NAME)

In [31]:
processed_uris = processed_uris | set(true_df["uri"].to_list())

# Olympic Games - Japan - Judo

In [32]:
FILE_NAME = "olympic_games__japan__judo.csv"

In [33]:
df, events = load_events(FILE_NAME, ignore_uris=processed_uris)

In [34]:
df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
0,0,Владимир Путин встретился с российскими олимпи...,"Сегодня, 30 июня, Президент Российской Федерац...",rus,nkeu.foreignaffairs.co.nz,2021-07-01 05:53:00,https://nkeu.foreignaffairs.co.nz/2021/07/01/%...,6628320832,,"['olympic_games', 'japan', 'judo']",wn-1
10,1,Сборная Самоа будет представлена на Играх спор...,Ранее министр связи Афамасагу Рико Тупаи заяви...,rus,ТАСС,2021-07-01 08:47:00,https://tass.ru/sport/11794269,6628456790,,"['olympic_games', 'japan', 'judo']",wn-5
9,2,Los 'CONs' tenían hasta el 5 de julio para Ins...,"Panamá. 2 de Judo. 1Ciclista, 1 Boxeadora, 3 d...",spa,La Estrella de Panamá,2021-07-01 09:05:00,https://www.laestrella.com.pa/deportes/cocteld...,6628473299,,"['olympic_games', 'japan', 'judo']",wn-4
1,3,Дзюдоистов из Челябинской области включили в о...,Олимпийский комитет России презентовал сборную...,rus,- :,2021-07-01 09:43:00,https://argumenti.ru/sport/2021/07/728509,6628510371,,"['olympic_games', 'japan', 'judo']",wn-1
31,4,"Conoce más sobre el Karate, nueva disciplina p...",El Karate debutará como disciplina olímpica en...,spa,TV Azteca,2021-07-01 09:45:00,https://www.tvazteca.com/aztecadeportes/juegos...,6628512059,,"['olympic_games', 'japan', 'judo']",wn-10
...,...,...,...,...,...,...,...,...,...,...,...
10168,9933,Paratleta georgiano es expulsado de Tokyo 2020...,"El paratleta Zviad Gogotchuri, que fue medalla...",spa,Dia a Dia,2021-08-20 20:35:00,http://www.diaadia.com.pa/deportes/paratleta-g...,6696089826,,"['olympic_games', 'japan', 'judo']",wn-3567
10188,9934,Delegación cubana para Juegos Paralímpicos ya ...,"La Habana, 20 ago (RHC) La delegación atlética...",spa,Radio Habana Cuba,2021-08-20 20:37:00,https://www.radiohc.cu/noticias/deportes/26758...,6696092371,,"['olympic_games', 'japan', 'judo']",wn-3574
10169,9935,Paratleta georgiano es expulsado de Tokio 2020...,"El yudoca de 34 años, que debía competir en lo...",spa,www.diariolibre.com,2021-08-20 21:16:00,https://www.diariolibre.com/deportes/olimpismo...,6696134197,,"['olympic_games', 'japan', 'judo']",wn-3567
10183,9936,RTVE anuncia su programación y comentaristas p...,"-- Paloma del Río, homenajeada en 'Días de ver...",spa,vertele,2021-08-20 21:18:00,https://vertele.eldiario.es/noticias/jjoo-para...,6696135968,spa-2463699,"['olympic_games', 'japan', 'judo']",wn-3571


In [35]:
events = sorted(events, key=lambda e: e.avg_time)

In [36]:
print_events(events, min_articles=2)

-------------------------------------------------------------------
1     :  wn-6       None        : 6628699172  eng  2021-07-01 14:26:00 - International Olympic Committee is coming under pressure over the alleged torture and arrest of Iranian athletes
2     :  wn-6       None        : 6628732642  eng  2021-07-01 14:52:00 - Tokyo 2020: International Olympic Committee is coming under pressure over the alleged torture and arrest of Iranian athletes
-------------------------------------------------------------------
3     :  wn-17      rus-1053245 : 6629280217  rus  2021-07-01 21:11:00 - В Москве официально открылся экипировочный центр для олимпийцев
4     :  wn-17      None        : 6629370725  rus  2021-07-01 22:21:00 - В Москве открылся главный экипировочный центр для олимпийцев
-------------------------------------------------------------------
5     :  wn-2       None        : 6628729889  eng  2021-07-01 14:48:00 - Tokyo 2020: Osaka, Matsuyama and the Japan gold medal hopefuls
6    

Check the values of the specific record

In [37]:
df[df["uri"] == "6661291413"].to_dict("records")

[{'id': 3954,
  'title': 'Jubel in Japan über erstes Olympia-Gold',
  'body': 'Nach all den Problemen, Skandalen und Widerständen im Vorfeld der Olympischen Spiele in Tokio hat Gastgeber Japan nun auch Grund zur Freude. Die Titelseiten japanischer Sportzeitungen waren komplett gefüllt mit der Nachricht von der ersten Goldmedaille für Japan durch Judoka Naohisa Takato. Nach "Tränen von 5 Jahren" endlich die "Revanche für Bronze in Rio" titelte die Zeitung "Nikkan Sports" neben einem formatfüllenden Takato, wie er lächelnd die Faust zum Sieg ballt. "Ich kann im Moment wirklich nichts denken, aber ich bin sehr dankbar, dass die Olympischen Spiele in Tokio stattfinden konnten", sagte Japans Held mit Blick auf die gewaltigen Herausforderungen durch die andauernde Corona-Pandemie. Anruf von Suga Japans Regierungschef Yoshihide Suga rief Takato am Sonntag von seinem Amtssitz aus an und gratulierte ihm. Er habe gespürt, mit welcher Zähigkeit der Judoka zum Sieg strebte. "Ich denke, viele waren

In [38]:
# ended at row: completed
selected_cls = [
    "wn-3",
    "wn-5",
    "wn-6",
    "wn-17",
    "wn-28",
    "wn-37",
    "wn-16",
    "wn-39",
    "wn-66",
    "wn-65",
    "wn-72",
    "wn-73",
    "wn-78",
    "wn-94",
    "wn-84",
    "wn-97",
    "wn-100",
    "wn-121",
    "wn-134",
    "wn-131",
    "wn-138",
    "wn-139",
    "wn-127",
    "wn-137",
    "wn-152",
    "wn-162",
    "wn-161",
    "wn-177",
    "wn-178",
    "wn-185",
    "wn-173",
    "wn-200",
    "wn-198",
    "wn-209",
    "wn-199",
    "wn-216",
    "wn-213",
    "wn-219",
    "wn-218",
    "wn-221",
    "wn-226",
    "wn-235",
    "wn-233",
    "wn-258",
    "wn-269",
    "wn-275",
    "wn-279",
    "wn-288",
    "wn-266",
    "wn-287",
    "wn-298",
    "wn-308",
    "wn-316",
    "wn-325",
    "wn-301",
    "wn-334",
    "wn-322",
    "wn-351",
    "wn-361",
    "wn-350",
    "wn-367",
    "wn-315",
    "wn-372",
    "wn-375",
    "wn-384",
    "wn-344",
    "wn-323",
    "wn-352",
    "wn-355",
    "wn-310",
    "wn-398",
    "wn-396",
    "wn-366",
    "wn-403",
    "wn-347",
    "wn-408",
    "wn-410",
    "wn-409",
    "wn-388",
    "wn-412",
    "wn-418",
    "wn-433",
    "wn-430",
    "wn-455",
    "wn-411",
    "wn-457",
    "wn-469",
    "wn-449",
    "wn-464",
    "wn-481",
    "wn-479",
    "wn-431",
    "wn-476",
    "wn-478",
    "wn-497",
    "wn-495",
    "wn-505",
    "wn-480",
    "wn-436",
    "wn-512",
    "wn-506",
    "wn-499",
    "wn-510",
    "wn-462",
    "wn-530",
    "wn-548",
    "wn-563",
    "wn-551",
    "wn-528",
    "wn-576",
    "wn-625",
    "wn-649",
    "wn-602",
    "wn-663",
    "wn-691",
    "wn-629",
    "wn-643",
    "wn-666",
    "wn-714",
    "wn-703",
    "wn-730",
    "wn-660",
    "wn-697",
    "wn-681",
    "wn-735",
    "wn-696",
    "wn-783",
    "wn-782",
    "wn-808",
    "wn-798",
    "wn-770",
    "wn-802",
    "wn-746",
    "wn-787",
    "wn-758",
    "wn-803",
    "wn-780",
    "wn-829",
    "wn-857",
    "wn-858",
    "wn-839",
    "wn-850",
    "wn-711",
    "wn-883",
    "wn-844",
    "wn-779",
    "wn-893",
    "wn-836",
    "wn-929",
    "wn-908",
    "wn-843",
    "wn-927",
    "wn-897",
    "wn-918",
    "wn-953",
    "wn-875",
    "wn-863",
    "wn-967",
    "wn-869",
    "wn-862",
    "wn-866",
    "wn-954",
    "wn-904",
    "wn-916",
    "wn-775",
    "wn-894",
    "wn-812",
    "wn-965",
    "wn-872",
    "wn-837",
    "wn-1020",
    "wn-1068",
    "wn-919",
    "wn-909",
    "wn-1091",
    "wn-1039",
    "wn-969",
    "wn-1018",
    "wn-1099",
    "wn-1045",
    "wn-1116",
    "wn-1042",
    "wn-962",
    "wn-1097",
    "wn-972",
    "wn-1168",
    "wn-1111",
    "wn-1153",
    "wn-1193",
    "wn-1115",
    "wn-1066",
    "wn-1194",
    "wn-1181",
    "wn-1110",
    "wn-936",
    "wn-1169",
    "wn-1246",
    "wn-1211",
    "wn-1276",
    "wn-1256",
    "wn-1264",
    "wn-1285",
    "wn-1213",
    "wn-1291",
    "wn-907",
    "wn-1278",
    "wn-1136",
    "wn-1269",
    "wn-1309",
    "wn-1224",
    "wn-1182",
    "wn-1137",
    "wn-1325",
    "wn-1245",
    "wn-1262",
    "wn-1335",
    "wn-1352",
    "wn-1344",
    "wn-1308",
    "wn-1356",
    "wn-1243",
    "wn-1044",
    "wn-1363",
    "wn-1241",
    "wn-1358",
    "wn-1321",
    "wn-1368",
    "wn-1327",
    "wn-1318",
    "wn-1324",
    "wn-1382",
    "wn-1293",
    "wn-1315",
    "wn-1355",
    "wn-1280",
    "wn-1376",
    "wn-1282",
    "wn-1351",
    "wn-1405",
    "wn-1409",
    "wn-1412",
    "wn-1407",
    "wn-1306",
    "wn-1279",
    "wn-1239",
    "wn-1208",
    "wn-1448",
    "wn-1453",
    "wn-1186",
    "wn-1472",
    "wn-1466",
    "wn-1236",
    "wn-1437",
    "wn-1437",
    "wn-1497",
    "wn-1496",
    "wn-1518",
    "wn-1517",
    "wn-1470",
    "wn-1510",
    "wn-1512",
    "wn-1392",
    "wn-1531",
    "wn-1532",
    "wn-1498",
    "wn-1474",
    "wn-1524",
    "wn-1449",
    "wn-1545",
    "wn-1547",
    "wn-1371",
    "wn-1500",
    "wn-1556",
    "wn-1455",
    "wn-1459",
    "wn-1473",
    "wn-1484",
    "wn-1505",
    "wn-1592",
    "wn-1529",
    "wn-1590",
    "wn-1600",
    "wn-1563",
    "wn-1488",
    "wn-1577",
    "wn-1540",
    "wn-1575",
    "wn-1548",
    "wn-1610",
    "wn-1544",
    "wn-1372",
    "wn-1465",
    "wn-1603",
    "wn-1672",
    "wn-1651",
    "wn-1655",
    "wn-1536",
    "wn-1682",
    "wn-1690",
    "wn-1668",
    "wn-1683",
    "wn-1686",
    "wn-1705",
    "wn-1664",
    "wn-1676",
    "wn-1486",
    "wn-1691",
    "wn-1701",
    "wn-1721",
    "wn-1422",
    "wn-1588",
    "wn-1684",
    "wn-1734",
    "wn-1750",
    "wn-1768",
    "wn-1757",
    "wn-1703",
    "wn-1770",
    "wn-1753",
    "wn-1735",
    "wn-1733",
    "wn-1694",
    "wn-1707",
    "wn-1778",
    "wn-1771",
    "wn-1847",
    "wn-1748",
    "wn-1581",
    "wn-1745",
    "wn-1716",
    "wn-1775",
    "wn-1492",
    "wn-1700",
    "wn-1756",
    "wn-1808",
    "wn-1859",
    "wn-1855",
    "wn-1850",
    "wn-1720",
    "wn-1787",
    "wn-1895",
    "wn-1893",
    "wn-1938",
    "wn-1939",
    "wn-1669",
    "wn-1749",
    "wn-1814",
    "wn-1914",
    "wn-1942",
    "wn-1949",
    "wn-1965",
    "wn-1899",
    "wn-1908",
    "wn-2026",
    "wn-2033",
    "wn-1925",
    "wn-1953",
    "wn-1955",
    "wn-1959",
    "wn-2008",
    "wn-1945",
    "wn-1994",
    "wn-2017",
    "wn-2077",
    "wn-2115",
    "wn-2110",
    "wn-2116",
    "wn-2123",
    "wn-2084",
    "wn-2124",
    "wn-2091",
    "wn-2081",
    "wn-2152",
    "wn-2164",
    "wn-2159",
    "wn-1986",
    "wn-2118",
    "wn-2146",
    "wn-2070",
    "wn-2003",
    "wn-2244",
    "wn-2200",
    "wn-2168",
    "wn-2230",
    "wn-2048",
    "wn-2254",
    "wn-2257",
    "wn-2270",
    "wn-2273",
    "wn-2166",
    "wn-2292",
    "wn-2304",
    "wn-2272",
    "wn-2246",
    "wn-2322",
    "wn-2282",
    "wn-2299",
    "wn-2279",
    "wn-2266",
    "wn-2362",
    "wn-2104",
    "wn-2301",
    "wn-2395",
    "wn-2258",
    "wn-2425",
    "wn-2202",
    "wn-2396",
    "wn-2437",
    "wn-2434",
    "wn-2447",
    "wn-2409",
    "wn-2285",
    "wn-2445",
    "wn-2428",
    "wn-2412",
    "wn-2488",
    "wn-2486",
    "wn-2309",
    "wn-2423",
    "wn-2431",
    "wn-2519",
    "wn-2491",
    "wn-2416",
    "wn-2467",
    "wn-2314",
    "wn-2487",
    "wn-2439",
    "wn-2527",
    "wn-2553",
    "wn-2470",
    "wn-2455",
    "wn-2565",
    "wn-2573",
    "wn-2582",
    "wn-2584",
    "wn-2458",
    "wn-2601",
    "wn-2568",
    "wn-2611",
    "wn-2609",
    "wn-2463",
    "wn-2620",
    "wn-2586",
    "wn-2640",
    "wn-2456",
    "wn-2523",
    "wn-2642",
    "wn-2644",
    "wn-2657",
    "wn-2645",
    "wn-2641",
    "wn-2658",
    "wn-2578",
    "wn-2705",
    "wn-2736",
    "wn-2741",
    "wn-2738",
    "wn-2743",
    "wn-2733",
    "wn-2749",
    "wn-2750",
    "wn-2729",
    "wn-2755",
    "wn-2748",
    "wn-2727",
    "wn-2868",
    "wn-2931",
    "wn-2949",
    "wn-2968",
    "wn-2992",
    "wn-2972",
    "wn-3008",
    "wn-3018",
    "wn-3012",
    "wn-3041",
    "wn-3038",
    "wn-3011",
    "wn-3087",
    "wn-3090",
    "wn-3093",
    "wn-3074",
    "wn-3097",
    "wn-3076",
    "wn-3065",
    "wn-3061",
    "wn-2978",
    "wn-3123",
    "wn-3098",
    "wn-3131",
    "wn-3130",
    "wn-3145",
    "wn-3121",
    "wn-3157",
    "wn-3155",
    "wn-3154",
    "wn-3188",
    "wn-3199",
    "wn-3213",
    "wn-3430",
    "wn-3446",
]


In [39]:
# copy this for every dataset separately
manual_corr = {
    "cls_to_cls": {
        "wn-2273": "wm-2270",
        "wn-2048": "wn-1955",
        "wn-2230": "wn-1955",
        "wn-1707": "wn-1588",
        "wn-1684": "wn-1588",
        "wn-1700": "wn-XXXX19",
        "wn-1672": "wn-XXXX19",
        "wn-1437": "wn-1308",
        "wn-1295": "wn-1066",
        "wn-1111": "wn-919",
        "wn-1045": "wn-919",
        "wn-969": "wn-919",
        "wn-894": "wn-711",
        "wn-775": "wn-904",
        "wn-916": "wn-904",
        "wn-862": "wn-869",
        "wn-863": "wn-850",
        "wn-836": "wn-850",
        "wn-512": "wn-344",
        "wn-436": "wn-344",
        "wn-411": "wn-344",
        "wn-409": "wn-408",
        "wn-403": "wn-344",
        "wn-310": "wn-344",
        "wn-200": "wn-173",
        "wn-5": "wn-3",

    },
    "art_to_cls": {
        "6650974827": "wn-408",
        "6648443402": "wn-408",
        "6648475661": "wn-408",
        "6629152785": "wn-XXXXX1",
        "6634418743": "wn-XXXXX2",
        "6642836339": "wn-XXXXX3",
        "6643745064": "wn-XXXXX4",
        "6645017071": "wn-XXXXX5",
        "6646908558": "wn-XXXXX6",
        "6647107164": "wn-XXXXX7",
        "6656804472": "wn-XXXXX8",
        "6659533487": "wn-XXXXX9",
        "6659564172": "wn-XXXXX9",
        "6659732108": "wn-XXXXX9",
        "6659064267": "wn-XXXX10",
        "6660689118": "wn-XXXX11",
        "6660630392": "wn-XXXX12",
        "6659273461": "wn-XXXX13",
        "6660411762": "wn-XXXX14",
        "6659766529": "wn-XXXX15",
        "6659425406": "wn-XXXX15",
        "6660088910": "wn-XXXX16",
        "6660117310": "wn-XXXX16",
        "6660133830": "wn-XXXX16",
        "6660139081": "wn-XXXX16",
        "6660150866": "wn-XXXX16",
        "6660250610": "wn-XXXX16",
        "6660309649": "wn-XXXX17",
        "6660319291": "wn-XXXX17",
        "6660556082": "wn-XXXX18",
        "6660577967": "wn-XXXX18",
        "6660556082": "wn-XXXX18",
        "6660577967": "wn-XXXX18",
        "6660581446": "wn-XXXX18",
        "6660597114": "wn-XXXX18",
        "6660612547": "wn-XXXX18",
        "6660631749": "wn-XXXX18",
        "6660674788": "wn-XXXX18",
        "6660699090": "wn-XXXX18",
        "6660712508": "wn-XXXX18",
        "6660822248": "wn-XXXX18",
        "6660868076": "wn-XXXX18",
        "6660896962": "wn-XXXX18",
        "6660899803": "wn-XXXX18",
        "6660955098": "wn-XXXX18",
        "6660994355": "wn-XXXX18",
        "6661065148": "wn-XXXX18",
        "6661065207": "wn-XXXX18",
        "6660781643": "wn-XXXX19",
        "6660832079": "wn-XXXX19",
        "6660547145": "wn-XXXX19",
        "6660662178": "wn-XXXX18",
        "6660687098": "wn-XXXX18",
        "6660746981": "wn-XXXX18",
        "6660897083": "wn-XXXX18",
        "6661050539": "wn-XXXX18",
        "6661050555": "wn-XXXX18",
        "6661061749": "wn-XXXX18",
        "6662284743": "wn-XXXX19",
        "6660289452": "wn-XXXX20",
        "6660304867": "wn-XXXX20",
        "6660323903": "wn-XXXX20",
        "6660523825": "wn-XXXX20",
        "6660604493": "wn-XXXX20",

        "6660917780": "wn-XXXX21",
        "6661339164": "wn-XXXX21",
        "6661463756": "wn-XXXX21",
        "6661474269": "wn-XXXX21",
        "6661660289": "wn-XXXX21",
        "6661746991": "wn-XXXX21",
        "6661996822": "wn-XXXX21",

        "6661592137": "wn-XXXX22",

        "6661491976": "wn-XXXX23",
        "6661332643": "wn-XXXX23",
        "6661251256": "wn-XXXX23",

        "6662788044": "wn-XXXX24",
        "6662809594": "wn-XXXX24",
        "6662815500": "wn-XXXX24",

        "6662825637": "wn-XXXX25",
        "6662867764": "wn-XXXX25",

        "6665785401": "wn-XXXX26",
        "6665826114": "wn-XXXX26",

        "6667112229": "wn-XXXX27",
        "6667131015": "wn-XXXX27",
    },
    "remove_cls": [

    ],
    "remove_art": [
        "6637053687",
        "6647822275",
        "6646831433",
        "6655155662",
        "6659549144",
        "6657935078",
        "6657522940",
        "6657516964",
        "6655966035",
        "6659435420",
        "6659578059",
        "6659798600",
        "6659822210",
        "6659064267",
        "6660689118",
        "6661486928",
        "6658787367",
        "6661482049",
        "6660870642",
        "6660855527",
        "6662251146",
        "6662316606",
        "6661712092",
        "6662647910",
        "6663130390",
        "6663963629",
        "6666207469",
        "6666545856",
        "6666556001",
        "6666597584",
        "6680913430",

    ]
}

## True Labels (Manual) Preparation

In [40]:
true_df = prepare_sample_df(df, selected_cls, manual_corr)

In [41]:
true_df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
10,1,Сборная Самоа будет представлена на Играх спор...,Ранее министр связи Афамасагу Рико Тупаи заяви...,rus,ТАСС,2021-07-01 08:47:00,https://tass.ru/sport/11794269,6628456790,,"['olympic_games', 'japan', 'judo']",wn-1
12,6,Samoa to send only overseas-based athletes to ...,"Apia, Samoa: Samoa's Olympic committee withdre...",eng,Firstpost,2021-07-01 12:04:00,https://www.firstpost.com/sports/tokyo-olympic...,6628672841,eng-6907868,"['olympic_games', 'japan', 'judo']",wn-1
26,7,International Olympic Committee is coming unde...,Grenfell Athletic: The club providing solace t...,eng,CNN International,2021-07-01 12:26:00,https://edition.cnn.com/2021/07/01/sport/irani...,6628699172,,"['olympic_games', 'japan', 'judo']",wn-2
27,9,Tokyo 2020: International Olympic Committee is...,Athletes are trained to be tough; they've lear...,eng,MSN International Edition,2021-07-01 12:52:00,https://www.msn.com/en-us/news/us/tokyo-2020-i...,6628732642,,"['olympic_games', 'japan', 'judo']",wn-2
11,10,Сборная Самоа будет представлена на Играх спор...,Ранее министр связи страны Афамасагу Рико Тупа...,rus,Спорт Mail.ru,2021-07-01 13:05:00,https://sportmail.ru/news/olympics/46943326/,6628751546,,"['olympic_games', 'japan', 'judo']",wn-1
...,...,...,...,...,...,...,...,...,...,...,...
9960,9720,Olympics to replace Japanese athlete's gold me...,The medal-bite is a familiar photo opportunity...,eng,AM 740 KVOR | KVOR-AM,2021-08-13 14:03:00,https://www.kvor.com/news/olympics-to-replace-...,6686699576,eng-7017270,"['olympic_games', 'japan', 'judo']",wn-494
9927,9726,Juegos Olímpicos ganaron aceptación entre much...,"Antes, muchos japoneses mostraron su reticenci...",spa,www.diariolibre.com,2021-08-13 16:36:00,https://www.diariolibre.com/deportes/olimpismo...,6686899243,spa-2454564,"['olympic_games', 'japan', 'judo']",wn-493
9928,9727,"Japón, al final, se dejó 'seducir' por los Jue...","Por ahora, hay en muchos japoneses un sentimie...",spa,El Financiero,2021-08-13 19:09:00,https://www.elfinanciero.com.mx/tokio-2020/202...,6687117130,spa-2454564,"['olympic_games', 'japan', 'judo']",wn-493
9961,9738,El alcalde japonés que le tocó ofrecer disculp...,"Por una escena transmitida por televisión, que...",spa,Récord,2021-08-14 00:47:00,https://record.acento.com.do/polideportivo/el-...,6687444123,spa-2455119,"['olympic_games', 'japan', 'judo']",wn-494


## Subset Evaluation

### Clustering Evaluation

In [42]:
pred_df = df[df["uri"].isin(true_df["uri"].to_list())]

#### Evaluation using the new methodology

In [43]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "clusterId")
measure_performance(articles)

{'F1': 0.8289588591499285,
 'P': 0.9814570740353589,
 'R': 0.7174776151635464,
 'A': 0.9976340169501717}

#### Evaluation using Event Registry

In [44]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "eventUri")
measure_performance(articles)

{'F1': 0.07880207619974212,
 'P': 0.04593015093498207,
 'R': 0.277173281060518,
 'A': 0.9482151981256922}

### Statistics

In [45]:
true_df["lang"].value_counts()

lang
eng    1104
por     985
spa     634
fra     499
rus     247
deu     202
ara     151
slv      56
zho       4
Name: count, dtype: int64

In [46]:
true_df["clusterId"].value_counts()

clusterId
wn-157    166
wn-56     137
wn-138    125
wn-365     67
wn-222     64
         ... 
wn-16       1
wn-178      1
wn-139      1
wn-45       1
wn-221      1
Name: count, Length: 494, dtype: int64

## Save Evaluation Results

In [47]:
save_df_to_csv(true_df, FILE_NAME)

In [48]:
processed_uris = processed_uris | set(true_df["uri"].to_list())

# Olympic Games - Japan - Rowing

In [49]:
FILE_NAME = "olympic_games__japan__rowing.csv"

In [50]:
df, events = load_events(FILE_NAME, processed_uris)

In [51]:
df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
4,0,Marksman Hoang Xuan Vinh to compete at upcomin...,VOV.VN - Marksman Hoang Xuan Vinh is set to re...,eng,VOV - VOV Online Newspaper,2021-07-01 07:56:00,https://english.vov.vn/en/sports/marksman-hoan...,6628411298,,"['olympic_games', 'japan', 'rowing']",wn-3
3,1,"Aina Cid: ""Volveré satisfecha si doy el 100%""","""La experiencia de Río 2016 la recuerdo como u...",spa,Diario Sport,2021-07-01 08:50:00,https://www.sport.es/es/noticias/juegos-olimpi...,6628459127,,"['olympic_games', 'japan', 'rowing']",wn-2
7,2,Sky Brown to be Britain's youngest summer Olym...,Skateboarder Sky Brown will be Britain's young...,eng,RTL Today,2021-07-01 17:13:00,https://today.rtl.lu/sport/international/a/174...,6629110643,eng-6909027,"['olympic_games', 'japan', 'rowing']",wn-6
9,3,Más de uno Cantabria. Deporte 01/07/2021,Entrevista al atleta Carlos Tobalina y al pali...,spa,OndaCero,2021-07-01 17:56:00,https://www.ondacero.es/emisoras/cantabria/aud...,6629174951,,"['olympic_games', 'japan', 'rowing']",wn-7
8,4,Skateboarder Sky Brown to become Britain's you...,READ | CASTER SEMENYA RUNS OUT OF TIME TO QUAL...,eng,The South African,2021-07-01 19:14:00,https://www.thesouthafrican.com/sport/tokyo-ol...,6629282976,eng-6909027,"['olympic_games', 'japan', 'rowing']",wn-6
...,...,...,...,...,...,...,...,...,...,...,...
4997,4351,Paralympics in Tokio: Die wichtigsten Daten un...,Die XVI. Paralympischen Sommerspiele werden in...,deu,tokio.sportschau.de,2021-08-20 16:37:00,https://tokio.sportschau.de/tokio2020/paralymp...,6695809892,,"['olympic_games', 'japan', 'rowing']",wn-2201
4994,4352,Isaquias Queiroz inspira indígenas da Amazônia...,Ouro conquistado em Tóquio pelo brasileiro ali...,por,Terra,2021-08-20 17:20:00,https://www.terra.com.br/esportes/jogos-olimpi...,6695861385,por-671052,"['olympic_games', 'japan', 'rowing']",wn-2199
5009,4353,Guía para seguir los Juegos Paralímpicos de To...,La competición tendrá lugar del 24 de agosto a...,spa,elEconomista.es,2021-08-20 19:10:00,https://ecoteuve.eleconomista.es/ecoteuve/depo...,6695994196,spa-2463699,"['olympic_games', 'japan', 'rowing']",wn-2208
5010,4354,RTVE anuncia su programación y comentaristas p...,"-- Paloma del Río, homenajeada en 'Días de ver...",spa,vertele,2021-08-20 21:18:00,https://vertele.eldiario.es/noticias/jjoo-para...,6696135968,spa-2463699,"['olympic_games', 'japan', 'rowing']",wn-2208


In [52]:
df["lang"].value_counts()

lang
eng    1901
spa     942
deu     388
rus     326
por     325
ara     226
fra     214
slv      17
zho      17
Name: count, dtype: int64

In [53]:
print_events(events, min_articles=2)

-------------------------------------------------------------------
1     :  wn-6       eng-6909027 : 6629110643  eng  2021-07-01 19:13:00 - Sky Brown to be Britain's youngest summer Olympian
2     :  wn-6       eng-6909027 : 6629282976  eng  2021-07-01 21:14:00 - Skateboarder Sky Brown to become Britain's youngest summer Olympian - at 13!
-------------------------------------------------------------------
3     :  wn-20      fra-752445  : 6630890157  fra  2021-07-02 22:39:00 - La liste du Togo pour les Jeux Olympiques de Tokyo
4     :  wn-20      fra-752445  : 6632913417  fra  2021-07-04 18:59:00 - Jeux Olympiques: 5 athlètes pour représenter le Togo à Tokyo
-------------------------------------------------------------------
5     :  wn-21      None        : 6631165749  eng  2021-07-03 03:16:00 - Tokyo Olympics: All the Jewish athletes to watch - Jewish Telegraphic Agency
6     :  wn-21      None        : 6631273565  eng  2021-07-03 05:43:00 - Tokyo Olympics: All the Jewish athletes t

Check the values of the specific record

In [54]:
df[df["uri"] == "6662421822"].to_dict("records")

[{'id': 2200,
  'title': 'Teens fill skateboard podium in Tokyo',
  'body': 'Generation next has arrived in Tokyo, and the Olympics as we know them will never be the same again. A pair of 13-year-olds and another aged just 16 completed an all-teen podium in the women\'s street skateboarding - pint-sized Momiji Nishiya taking gold to make it a Japanese double after Yuto Horigome\'s triumph in the men\'s event on Sunday. At 13 and 330 days, Nishiya is the third-youngest gold medalist in Summer Olympic history, bettered only by German rower Klaus Zerta in 1960 and US diver Marjorie Gestring ion 1936. Brazilian 13-year-old Rayssa Leal claimed silver and Funa Nakayama, 16, the bronze in an epic final - with just 0.77 points separating the three competitors in the event\'s Olympic debut. "I welled up in tears because I was beyond happy," Nishiya said of the moment she realised she had won. "I\'m so happy to win the Olympics in Japan, and I\'m so happy to win my first Olympics as one of the y

In [55]:
# ended at row: completed
selected_cls = [
    "wn-6",
    "wn-20",
    "wn-21",
    "wn-37",
    "wn-36",
    "wn-41",
    "wn-56",
    "wn-58",
    "wn-64",
    "wn-90",
    "wn-95",
    "wn-94",
    "wn-122",
    "wn-125",
    "wn-130",
    "wn-150",
    "wn-177",
    "wn-192",
    "wn-209",
    "wn-216",
    "wn-226",
    "wn-237",
    "wn-244",
    "wn-265",
    "wn-291",
    "wn-304",
    "wn-329",
    "wn-373",
    "wn-379",
    "wn-408",
    "wn-407",
    "wn-412",
    "wn-414",
    "wn-419",
    "wn-439",
    "wn-444",
    "wn-457",
    "wn-464",
    "wn-479",
    "wn-539",
    "wn-541",
    "wn-546",
    "wn-549",
    "wn-590",
    "wn-610",
    "wn-614",
    "wn-616",
    "wn-621",
    "wn-620",
    "wn-623",
    "wn-625",
    "wn-632",
    "wn-640",
    "wn-654",
    "wn-663",
    "wn-666",
    "wn-674",
    "wn-681",
    "wn-680",
    "wn-685",
    "wn-700",
    "wn-703",
    "wn-717",
    "wn-742",
    "wn-755",
    "wn-766",
    "wn-767",
    "wn-770",
    "wn-769",
    "wn-778",
    "wn-783",
    "wn-786",
    "wn-791",
    "wn-774",
    "wn-795",
    "wn-796",
    "wn-805",
    "wn-804",
    "wn-668",
    "wn-808",
    "wn-815",
    "wn-919",
    "wn-928",
    "wn-934",
    "wn-950",
    "wn-949",
    "wn-953",
    "wn-933",
    "wn-957",
    "wn-961",
    "wn-970",
    "wn-1014",
    "wn-1023",
    "wn-1034",
    "wn-1049",
    "wn-1063",
    "wn-1070",
    "wn-1069",
    "wn-948",
    "wn-1073",
    "wn-1077",
    "wn-1076",
    "wn-1078",
    "wn-1080",
    "wn-1081",
    "wn-1085",
    "wn-1086",
    "wn-1090",
    "wn-1092",
    "wn-1098",
    "wn-1099",
    "wn-1152",
    "wn-1155",
    "wn-1179",
    "wn-1185",
    "wn-1190",
    "wn-1196",
    "wn-1198",
    "wn-1203",
    "wn-1206",
    "wn-1211",
    "wn-1252",
    "wn-1271",
    "wn-1270",
    "wn-1272",
    "wn-1275",
    "wn-1279",
    "wn-1280",
    "wn-1290",
    "wn-1294",
    "wn-1295",
    "wn-1301",
    "wn-1302",
    "wn-1332",
    "wn-1337",
    "wn-1340",
    "wn-1348",
    "wn-1377",
    "wn-1375",
    "wn-1373",
    "wn-1379",
    "wn-1378",
    "wn-1382",
    "wn-1381",
    "wn-1389",
    "wn-1403",
    "wn-1412",
    "wn-1415",
    "wn-1417",
    "wn-1425",
    "wn-1428",
    "wn-1432",
    "wn-1436",
    "wn-1437",
    "wn-1441",
    "wn-1485",
    "wn-1497",
    "wn-1518",
    "wn-1535",
    "wn-1369",
    "wn-1547",
    "wn-1555",
    "wn-1609",
    "wn-1625",
    "wn-1634",
    "wn-1633",
    "wn-1629",
    "wn-1637",
    "wn-1656",
    "wn-1670",
    "wn-1687",
    "wn-1686",
    "wn-1694",
    "wn-1743",
    "wn-1803",
    "wn-1813",
    "wn-1825",
    "wn-1830",
    "wn-1839",
    "wn-1859",
    "wn-1856",
    "wn-1861",
    "wn-1863",
    "wn-1865",
    "wn-1868",
    "wn-1884",
    "wn-1911",
    "wn-1909",
    "wn-1928",
    "wn-1954",
    "wn-1968",
    "wn-1969",
    "wn-1983",
    "wn-2010",
    "wn-2013",
    "wn-2014",
    "wn-2027",
    "wn-2042",
    "wn-2043",
    "wn-2049",
    "wn-2052",
    "wn-2059",
    "wn-2111",
    "wn-2114",
    "wn-2115",
    "wn-2135",
]

In [56]:
# copy this for every dataset separately
manual_corr = {
    "cls_to_cls": {
        "wn-2114": "wn-2111",
        "wn-1270": "wn-1271",
        "wn-1206": "wn-1034",
        "wn-1203": "wn-1034",
        "wn-1196": "wn-1034",
        "wn-1190": "wn-755",
        "wn-1185": "wn-1034",
        "wn-1179": "wn-755",
        "wn-1014": "wn-755",
        "wn-953": "wn-755",
        "wn-822": "wn-674",
        "wn-791": "wn-674",
        "wn-778": "wn-674",
        "wn-766": "wn-755",
        "wn-90": "wn-58",
        "wn-209": "wn-90",
        "wn-36": "wn-37",
    },
    "art_to_cls": {
        "6661431396": "wn-XXXXX1",
        "6661459847": "wn-XXXXX1",
        "6661965539": "wn-XXXXX1",
        "6662141211": "wn-XXXXX1",

        "6661332367": "wn-XXXXX2",
        "6661346283": "wn-XXXXX2",
        "6661391659": "wn-XXXXX2",

        "6661319845": "wn-XXXXX3",
        "6661346284": "wn-XXXXX3",

        "6664239377": "wn-XXXXX4",
        "6664302215": "wn-XXXXX4",

        "6664344804": "wn-XXXXX5",
        "6664369831": "wn-XXXXX5",

        "6663606036": "wn-XXXXX6",
        "6663619174": "wn-XXXXX6",

        "6663645889": "wn-XXXXX7",

        "6665152337": "wn-XXXXX8",
        "6666552523": "wn-XXXXX8",
        "6666759393": "wn-XXXXX8",
        "6666953322": "wn-XXXXX8",
        "6667247305": "wn-XXXXX8",
        "6667680652": "wn-XXXXX8",
        "6668041814": "wn-XXXXX8",

        "6667358492": "wn-XXXXX9",
        "6667363263": "wn-XXXXX9",
        "6667443167": "wn-XXXXX9",
        "6667696686": "wn-XXXXX9",

        "6665384229": "wn-XXXX10",
        "6665390974": "wn-XXXX10",

        "6665878297": "wn-XXXX11",
        "6665896039": "wn-XXXX11",
        "6665917569": "wn-XXXX11",
        "6666044044": "wn-XXXX11",

        "6667133104": "wn-XXXX12",
        "6667199335": "wn-XXXX12",
        "6667484152": "wn-XXXX12",
        "6667543056": "wn-XXXX12",
    },
    "remove_cls": [

    ],
    "remove_art": [
        "6658387546",
        "6658608846",
        "6659064011",
        "6659072720",
        "6659679009",
        "6659788981",
        "6659776412",
        "6658251860",
        "6659589831",
        "6663944897",
        "6667680652",
        "6667732800",
        "6665634104",
        "6666126385",
        "6681856513",
        "6680418983",
        "6679951025",
    ]
}

## True Labels (Manual) Preparation

In [57]:
true_df = prepare_sample_df(df, selected_cls, manual_corr)

In [58]:
true_df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
7,2,Sky Brown to be Britain's youngest summer Olym...,Skateboarder Sky Brown will be Britain's young...,eng,RTL Today,2021-07-01 17:13:00,https://today.rtl.lu/sport/international/a/174...,6629110643,eng-6909027,"['olympic_games', 'japan', 'rowing']",wn-1
8,4,Skateboarder Sky Brown to become Britain's you...,READ | CASTER SEMENYA RUNS OUT OF TIME TO QUAL...,eng,The South African,2021-07-01 19:14:00,https://www.thesouthafrican.com/sport/tokyo-ol...,6629282976,eng-6909027,"['olympic_games', 'japan', 'rowing']",wn-1
25,10,La liste du Togo pour les Jeux Olympiques de T...,Le Comité national olympique du Togo (CNOT) a ...,fra,Bénin Web TV,2021-07-02 20:39:00,https://beninwebtv.bj/2021/07/la-liste-du-togo...,6630890157,fra-752445,"['olympic_games', 'japan', 'rowing']",wn-2
27,11,Tokyo Olympics: All the Jewish athletes to wat...,(JTA) -- The 2020 Tokyo Olympics are finally h...,eng,Jewish Telegraphic Agency,2021-07-03 01:16:00,https://www.jta.org/2021/07/02/sports/tokyo-ol...,6631165749,,"['olympic_games', 'japan', 'rowing']",wn-3
28,12,Tokyo Olympics: All the Jewish athletes to watch,"The 2020 Tokyo Olympics are finally happening,...",eng,The Jerusalem Post,2021-07-03 03:43:00,https://www.jpost.com/international/tokyo-olym...,6631273565,,"['olympic_games', 'japan', 'rowing']",wn-3
...,...,...,...,...,...,...,...,...,...,...,...
4898,4240,"""Embarrassing"" move: No backdown on four-week ...",Australian state officials defended their deci...,eng,RTL Today,2021-08-12 08:40:00,https://today.rtl.lu/news/world/a/1769405.html,6684957998,eng-7014065,"['olympic_games', 'japan', 'rowing']",wn-203
4899,4242,South Australian premier refuses to budge on q...,By Kylie Stevens For Daily Mail Australia and ...,eng,Daily Mail Online,2021-08-12 10:52:00,https://www.dailymail.co.uk/news/article-98861...,6685078260,eng-7014065,"['olympic_games', 'japan', 'rowing']",wn-203
4900,4243,Uproar over order to put Aussie Olympians in f...,The South Australian government is also insist...,eng,Nation,2021-08-12 11:09:00,https://nation.africa/kenya/sports/uproar-over...,6685097491,eng-7017964,"['olympic_games', 'japan', 'rowing']",wn-203
4901,4244,No backdown on four-week quarantine for Aussie...,"High-profile sports star branded the move ""dis...",eng,The Express Tribune,2021-08-12 11:45:00,http://tribune.com.pk/story/2315218/no-backdow...,6685136635,eng-7017964,"['olympic_games', 'japan', 'rowing']",wn-203


## Subset Evaluation

### Clustering Evaluation

In [59]:
pred_df = df[df["uri"].isin(true_df["uri"].to_list())]

#### Evaluation using the new methodology

In [60]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "clusterId")
measure_performance(articles)

{'F1': 0.5934480320527928,
 'P': 0.974332516445247,
 'R': 0.4266591358373341,
 'A': 0.986103838562855}

#### Evaluation using Event Registry

In [61]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "eventUri")
measure_performance(articles)

{'F1': 0.15464438220303964,
 'P': 0.13027501643909797,
 'R': 0.19022874894097713,
 'A': 0.9505618882668063}

### Statistics

In [62]:
true_df["lang"].value_counts()

lang
eng    671
spa    252
por    108
deu     66
rus     54
fra     36
ara     28
zho      3
slv      3
Name: count, dtype: int64

In [63]:
true_df["clusterId"].value_counts()

clusterId
wn-63     147
wn-55      69
wn-87      54
wn-189     25
wn-4       22
         ... 
wn-164      2
wn-124      2
wn-22       2
wn-1        2
wn-118      1
Name: count, Length: 203, dtype: int64

## Save Evaluation Results

In [64]:
save_df_to_csv(true_df, FILE_NAME)

In [65]:
processed_uris = processed_uris | set(true_df["uri"].to_list())

# Olympic Games - Japan - Skateboarding

In [66]:
FILE_NAME = "olympic_games__japan__skateboarding.csv"

In [67]:
df, events = load_events(FILE_NAME, processed_uris)

In [68]:
df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
0,0,Sky Brown to make British Olympic history as s...,Brown will be 13 years and 11 days old when sh...,eng,The Argus,2021-07-01 15:44:00,https://www.theargus.co.uk/sport/national/1941...,6628981713,eng-6909027,"['olympic_games', 'japan', 'skateboarding']",wn-1
1,1,Sky Brown: Skateboarder to break Team GB's you...,Sky Brown: Skateboarder to break Team GB's you...,eng,SkySports,2021-07-01 16:14:00,https://www.skysports.com/olympics/news/15234/...,6629024638,eng-6909027,"['olympic_games', 'japan', 'skateboarding']",wn-1
2,2,Sky Brown to make British Olympic history as s...,Brown will be 13 years and 11 days old when sh...,eng,Shropshire Star,2021-07-01 16:19:00,https://www.shropshirestar.com/sport/uk-sports...,6629032734,eng-6909027,"['olympic_games', 'japan', 'skateboarding']",wn-1
3,3,Sky Brown to make British Olympic history as s...,Skateboarder Sky Brown will become Great Brita...,eng,Evening Express,2021-07-01 16:21:00,https://www.eveningexpress.co.uk/sport/sky-bro...,6629034816,eng-6909027,"['olympic_games', 'japan', 'skateboarding']",wn-1
4,4,Skateboarder to be Britain's youngest Summer O...,LONDON (AP) -- Skateboarder Sky Brown will bec...,eng,WTOP,2021-07-01 17:09:00,https://wtop.com/asia/2021/07/skateboarder-to-...,6629106052,eng-6909027,"['olympic_games', 'japan', 'skateboarding']",wn-1
...,...,...,...,...,...,...,...,...,...,...,...
6521,5496,Fadinha tira foto com a camisa do Corinthians ...,"Rayssa Leal, a Fadinha do Skate, voltou a demo...",por,Meu Tim�o,2021-08-20 18:38:00,https://www.meutimao.com.br/noticias-do-corint...,6695957031,,"['olympic_games', 'japan', 'skateboarding']",wn-2342
6522,5497,AP Sportlight,{{featured_button_text}} Aug. 21 1901 -- Willi...,eng,Mooresville Tribune,2021-08-20 19:00:00,https://mooresvilletribune.com/sports/ap-sport...,6695981706,eng-7042293,"['olympic_games', 'japan', 'skateboarding']",wn-2343
6523,5498,"Брейк-данс, серфинг, скалолазание: насколько г...",Международный олимпийский комитет (МОК) предст...,rus,Слово и Дело новости,2021-08-20 21:16:00,https://slovodel.com/616714-breik-dans-serfing...,6696133301,,"['olympic_games', 'japan', 'skateboarding']",wn-2344
6524,5499,Sound for the Tokyo Olympics both 'a challenge...,Immersive audio debuts while some augmented cr...,eng,SVG Europe,2021-08-20 23:44:00,https://www.svgeurope.org/blog/headlines/sound...,6696268111,,"['olympic_games', 'japan', 'skateboarding']",wn-2345


In [69]:
print_events(events, min_articles=2)

-------------------------------------------------------------------
1     :  wn-1       eng-6909027 : 6628981713  eng  2021-07-01 17:44:00 - Sky Brown to make British Olympic history as skateboarder is confirmed for Tokyo
2     :  wn-1       eng-6909027 : 6629024638  eng  2021-07-01 18:14:00 - Sky Brown: Skateboarder to break Team GB's youngest summer Olympian record at Tokyo 2020
3     :  wn-1       eng-6909027 : 6629032734  eng  2021-07-01 18:19:00 - Sky Brown to make British Olympic history as skateboarder is confirmed for Tokyo
4     :  wn-1       eng-6909027 : 6629034816  eng  2021-07-01 18:21:00 - Sky Brown to make British Olympic history as skateboarder is confirmed for Tokyo - Evening Express
5     :  wn-1       eng-6909027 : 6629106052  eng  2021-07-01 19:09:00 - Skateboarder to be Britain's youngest Summer Olympian | WTOP
6     :  wn-1       eng-6909027 : 6629126198  eng  2021-07-01 19:23:00 - Skateboarding: Brown to be Britain's youngest summer Olympian at 13
7     :  wn-1  

Check the values of the specific record

In [70]:
df[df["uri"] == "6683343692"].to_dict("records")

[{'id': 5318,
  'title': "Mattel admits Tokyo Olympics Barbies 'fell short' of representing Asian community",
  'body': 'Mattel is owning up to its mistakes after Twitter users accused the toymaker of excluding Barbies of Asian descent from its new Tokyo Olympics play set. In a statement provided Tuesday to the Los Angeles Times, the company clarified that a doll resembling an Olympic skateboarder was intended to "represent the Asian community" -- but acknowledged that the collection ultimately failed to meet its goal. "Fostering a more inclusive world is at the heart of our brand and we strive to reflect that in our Barbie product line," the statement read. "With our Barbie Olympic Games Tokyo 2020 dolls, we celebrate a range of athletes to inspire kids to find their athlete within. "However, our intention to represent the Asian community with the Skateboarder doll fell short and we fully receive and recognize the feedback. Moving forward, we will work to find more ways to champion al

In [71]:
# ended at row: completed
selected_cls = [
    "wn-1",
    "wn-10",
    "wn-35",
    "wn-39",
    "wn-43",
    "wn-56",
    "wn-65",
    "wn-68",
    "wn-73",
    "wn-95",
    "wn-116",
    "wn-123",
    "wn-138",
    "wn-150",
    "wn-172",
    "wn-176",
    "wn-213",
    "wn-216",
    "wn-223",
    "wn-224",
    "wn-244",
    "wn-245",
    "wn-100",
    "wn-252",
    "wn-260",
    "wn-284",
    "wn-289",
    "wn-294",
    "wn-302",
    "wn-333",
    "wn-338",
    "wn-335",
    "wn-340",
    "wn-344",
    "wn-351",
    "wn-354",
    "wn-358",
    "wn-363",
    "wn-366",
    "wn-371",
    "wn-381",
    "wn-383",
    "wn-386",
    "wn-408",
    "wn-416",
    "wn-415",
    "wn-420",
    "wn-422",
    "wn-435",
    "wn-436",
    "wn-442",
    "wn-445",
    "wn-448",
    "wn-452",
    "wn-459",
    "wn-456",
    "wn-476",
    "wn-488",
    "wn-489",
    "wn-492",
    "wn-491",
    "wn-490",
    "wn-506",
    "wn-516",
    "wn-393",
    "wn-533",
    "wn-540",
    "wn-603",
    "wn-618",
    "wn-621",
    "wn-624",
    "wn-638",
    "wn-637",
    "wn-648",
    "wn-658",
    "wn-662",
    "wn-667",
    "wn-675",
    "wn-684",
    "wn-678",
    "wn-684",
    "wn-687",
    "wn-691",
    "wn-708",
    "wn-715",
    "wn-723",
    "wn-725",
    "wn-730",
    "wn-741",
    "wn-745",
    "wn-744",
    "wn-746",
    "wn-755",
    "wn-758",
    "wn-756",
    "wn-760",
    "wn-759",
    "wn-763",
    "wn-764",
    "wn-767",
    "wn-770",
    "wn-773",
    "wn-772",
    "wn-774",
    "wn-785",
    "wn-793",
    "wn-792",
    "wn-807",
    "wn-812",
    "wn-809",
    "wn-819",
    "wn-823",
    "wn-826",
    "wn-845",
    "wn-854",
    "wn-856",
    "wn-882",
    "wn-895",
    "wn-896",
    "wn-907",
    "wn-919",
    "wn-920",
    "wn-916",
    "wn-925",
    "wn-922",
    "wn-927",
    "wn-932",
    "wn-934",
    "wn-936",
    "wn-940",
    "wn-945",
    "wn-965",
    "wn-983",
    "wn-982",
    "wn-993",
    "wn-1001",
    "wn-1012",
    "wn-1022",
    "wn-1027",
    "wn-1039",
    "wn-1036",
    "wn-1065",
    "wn-1073",
    "wn-1117",
    "wn-1120",
    "wn-1122",
    "wn-1124",
    "wn-1130",
    "wn-1166",
    "wn-1169",
    "wn-1180",
    "wn-1208",
    "wn-1203",
    "wn-1223",
    "wn-1215",
    "wn-1230",
    "wn-1259",
    "wn-1263",
    "wn-1293",
    "wn-1296",
    "wn-1349",
    "wn-1362",
    "wn-1367",
    "wn-1470",
    "wn-1472",
    "wn-1496",
    "wn-1507",
    "wn-1523",
    "wn-1527",
    "wn-1530",
    "wn-1533",
    "wn-1540",
    "wn-1542",
    "wn-1546",
    "wn-1550",
    "wn-1557",
    "wn-1559",
    "wn-1567",
    "wn-1570",
    "wn-1595",
    "wn-1604",
    "wn-1609",
    "wn-1616",
    "wn-1617",
    "wn-1626",
    "wn-1629",
    "wn-1561",
    "wn-1628",
    "wn-1636",
    "wn-1633",
    "wn-1639",
    "wn-1638",
    "wn-1656",
    "wn-1648",
    "wn-1651",
    "wn-1646",
    "wn-1660",
    "wn-1658",
    "wn-1662",
    "wn-1663",
    "wn-1668",
    "wn-1664",
    "wn-1689",
    "wn-1688",
    "wn-1685",
    "wn-1684",
    "wn-1692",
    "wn-1697",
    "wn-1705",
    "wn-1722",
    "wn-1730",
    "wn-1731",
    "wn-1752",
    "wn-1743",
    "wn-1747",
    "wn-1750",
    "wn-1792",
    "wn-1788",
    "wn-1782",
    "wn-1799",
    "wn-1798",
    "wn-1795",
    "wn-1794",
    "wn-1806",
    "wn-1804",
    "wn-1812",
    "wn-1813",
    "wn-1826",
    "wn-1824",
    "wn-1841",
    "wn-1845",
    "wn-1849",
    "wn-1846",
    "wn-1863",
    "wn-1861",
    "wn-1869",
    "wn-1865",
    "wn-1876",
    "wn-1874",
    "wn-1882",
    "wn-1927",
    "wn-1929",
    "wn-1944",
    "wn-1954",
    "wn-1958",
    "wn-2005",
    "wn-2020",
    "wn-2031",
    "wn-2013",
    "wn-2036",
    "wn-2052",
    "wn-2051",
    "wn-2067",
    "wn-2079",
    "wn-2121",
    "wn-2133",
    "wn-2142",
    "wn-2158",
    "wn-2217",
    "wn-2274",
]

In [72]:
# copy this for every dataset separately
manual_corr = {
    "cls_to_cls": {
        "wn-1743": "wn-1688",
        "wn-1530": "wn-1550",
        "wn-1065": "wn-920",
        "wn-965": "wn-920",
        "wn-945": "wn-925",
        "wn-940": "wn-925",
        "wn-934": "wn-925",
        "wn-809": "wn-773",
        "wn-723": "wn-488", # doodle
        "wn-662": "wn-658",
        "wn-540": "wn-516",
        "wn-506": "wn-488",
        "wn-491": "wn-252",
        "wn-383": "wn-351",
        "wn-363": "wn-354",
        "wn-284": "wn-176",
        "wn-100": "wn-245",
        "wn-244": "wn-176",
        "wn-224": "wn-176",
        "wn-223": "wn-1",
        "wn-216": "wn-176",
        "wn-172": "wn-1",
        "wn-95": "wn-56",
        "wn-68": "wn-56",
    },
    "art_to_cls": {
        "6663973498": "wn-XXXXX3",
        "6663701341": "wn-XXXXX2",
        "6664581188": "wn-XXXXX1",
        "6661118752": "wn-744",
        "6663134088": "wn-684",
        "6657445673": "wn-448",
        "6657378902": "wn-442",
    },
    "remove_cls": [

    ],
    "remove_art": [
        "6674345134",
        "6659692164",
        "6659596778",
        "6653569454",
        "6658553202",
        "6657445673",
        "6661029532",
        "6663469335",
        "6663435355",
    ]
}

## True Labels (Manual) Preparation

In [73]:
true_df = prepare_sample_df(df, selected_cls, manual_corr)

In [74]:
true_df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
0,0,Sky Brown to make British Olympic history as s...,Brown will be 13 years and 11 days old when sh...,eng,The Argus,2021-07-01 15:44:00,https://www.theargus.co.uk/sport/national/1941...,6628981713,eng-6909027,"['olympic_games', 'japan', 'skateboarding']",wn-1
1,1,Sky Brown: Skateboarder to break Team GB's you...,Sky Brown: Skateboarder to break Team GB's you...,eng,SkySports,2021-07-01 16:14:00,https://www.skysports.com/olympics/news/15234/...,6629024638,eng-6909027,"['olympic_games', 'japan', 'skateboarding']",wn-1
2,2,Sky Brown to make British Olympic history as s...,Brown will be 13 years and 11 days old when sh...,eng,Shropshire Star,2021-07-01 16:19:00,https://www.shropshirestar.com/sport/uk-sports...,6629032734,eng-6909027,"['olympic_games', 'japan', 'skateboarding']",wn-1
3,3,Sky Brown to make British Olympic history as s...,Skateboarder Sky Brown will become Great Brita...,eng,Evening Express,2021-07-01 16:21:00,https://www.eveningexpress.co.uk/sport/sky-bro...,6629034816,eng-6909027,"['olympic_games', 'japan', 'skateboarding']",wn-1
4,4,Skateboarder to be Britain's youngest Summer O...,LONDON (AP) -- Skateboarder Sky Brown will bec...,eng,WTOP,2021-07-01 17:09:00,https://wtop.com/asia/2021/07/skateboarder-to-...,6629106052,eng-6909027,"['olympic_games', 'japan', 'skateboarding']",wn-1
...,...,...,...,...,...,...,...,...,...,...,...
6418,5390,"Em votação popular, Rayssa Leal ganha prêmio d...",Honraria é entregue pelo Comitê Olímpico Inter...,por,SuperesportesMG,2021-08-13 00:23:00,https://www.mg.superesportes.com.br/app/notici...,6686007661,por-667934,"['olympic_games', 'japan', 'skateboarding']",wn-237
6415,5391,Rayssa Leal conquista prêmio do COI por espíri...,"A medalhista de prata no skate street, e uma d...",por,O TEMPO,2021-08-13 01:07:00,https://www.otempo.com.br/superfc/rayssa-leal-...,6686044287,por-667934,"['olympic_games', 'japan', 'skateboarding']",wn-237
6419,5392,Rayssa Leal vence prêmio do COI por melhor rep...,"Prata em Tóquio, skatista Rayssa Leal vai rece...",por,Nominuto.com,2021-08-13 01:26:00,https://nominuto.com/noticias/esporte/rayssa-l...,6686061052,por-667934,"['olympic_games', 'japan', 'skateboarding']",wn-237
6420,5398,"Rayssa Leal, a 'fadinha' do skate, ganha prêmi...",A atleta foi um dos destaques na disputa do sk...,por,R7 Notícias,2021-08-13 06:03:00,http://noticias.r7.com/jr-na-tv/videos/rayssa-...,6686252492,por-667934,"['olympic_games', 'japan', 'skateboarding']",wn-237


## Subset Evaluation

### Clustering Evaluation

In [75]:
pred_df = df[df["uri"].isin(true_df["uri"].to_list())]

#### Evaluation using the new methodology

In [76]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "clusterId")
measure_performance(articles)

{'F1': 0.9150216630067067,
 'P': 0.9991186458252327,
 'R': 0.8439826574406587,
 'A': 0.9970538003465149}

#### Evaluation using Event Registry

In [77]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "eventUri")
measure_performance(articles)

{'F1': 0.20925035258380648,
 'P': 0.1562253636726892,
 'R': 0.31676447402995533,
 'A': 0.9550053294154928}

### Statistics

In [78]:
true_df["lang"].value_counts()

lang
eng    1049
por     822
spa     264
rus      65
slv       4
ara       1
Name: count, dtype: int64

In [79]:
true_df["clusterId"].value_counts()

clusterId
wn-105    171
wn-103    144
wn-66      74
wn-149     73
wn-84      63
         ... 
wn-184      2
wn-190      2
wn-126      1
wn-128      1
wn-120      1
Name: count, Length: 237, dtype: int64

## Save Evaluation Results

In [80]:
save_df_to_csv(true_df, FILE_NAME)

In [81]:
processed_uris = processed_uris | set(true_df["uri"].to_list())

# Olympic Games - Japan - Sport Climbing

In [82]:
FILE_NAME = "olympic_games__japan__sport_climbing.csv"

In [83]:
df, events = load_events(FILE_NAME, processed_uris)

In [84]:
df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
7,0,"""Ich muss mir selbst nichts mehr beweisen"" - J...",Mit den diesjährigen Olympischen Spielen schre...,deu,fm4.ORF.at,2021-07-03 08:28:00,https://fm4.orf.at/stories/3016198/,6631440377,,"['olympic_games', 'japan', 'sport_climbing']",wn-3
9,1,"More than 50 years later, a much different Oly...",Yet the most obvious -- and troubling -- diffe...,eng,The Boston Globe,2021-07-03 20:24:00,https://www.bostonglobe.com/2021/07/03/sports/...,6632060791,eng-6912945,"['olympic_games', 'japan', 'sport_climbing']",wn-5
8,2,Megos: Mit Platz zwei nach Tokio - Lead-Weltcu...,Im zweiten Lead-Weltcup des Jahres (Klettern a...,deu,Kicker online,2021-07-03 23:46:00,https://www.kicker.de/megos-mit-platz-zwei-nac...,6632214888,,"['olympic_games', 'japan', 'sport_climbing']",wn-4
11,3,Olympic Games Tokyo 2020 - The Official Video ...,Following cues from the Mario and Sonic at the...,eng,TheXboxHub,2021-07-05 05:49:00,https://www.thexboxhub.com/olympic-games-tokyo...,6633461242,,"['olympic_games', 'japan', 'sport_climbing']",wn-7
10,4,Record-breaking Aussie Olympic team locked in,The Australian Olympic Committee has locked-in...,eng,wwos.nine.com.au,2021-07-05 07:46:00,https://wwos.nine.com.au/olympics/tokyo-olympi...,6633535660,eng-6916693,"['olympic_games', 'japan', 'sport_climbing']",wn-6
...,...,...,...,...,...,...,...,...,...,...,...
1115,750,Corporate sponsorships for future Olympics exp...,"""One of the changes likely to occur after the ...",eng,Japan Today,2021-08-16 02:09:00,https://japantoday.com/category/features/kuchi...,6689473129,,"['olympic_games', 'japan', 'sport_climbing']",wn-561
1114,751,Inspiring hope,As the Tokyo 2020 Olympic Games drew to a clos...,eng,China Daily Asia,2021-08-16 07:58:00,https://www.chinadailyasia.com/article/233660,6689665521,eng-7023704,"['olympic_games', 'japan', 'sport_climbing']",wn-560
1119,752,Olympics latest: Paralympics to be held withou...,"TOKYO -- After two weeks, the sporting events ...",eng,Nikkei Asia,2021-08-16 15:42:00,https://asia.nikkei.com/Spotlight/Tokyo-2020-O...,6690147271,eng-7026329,"['olympic_games', 'japan', 'sport_climbing']",wn-565
1118,753,Roland Hönig: Olympia war ein riesengroßes Abe...,Viktoria Wolffhardt war nicht der einzige Tull...,deu,m.noen.at,2021-08-18 08:11:00,https://m.noen.at/tulln/fuer-orf-in-tokio-rola...,6692440152,,"['olympic_games', 'japan', 'sport_climbing']",wn-564


In [85]:
print_events(events, min_articles=2)

-------------------------------------------------------------------
1     :  wn-10      None        : 6634010041  deu  2021-07-05 16:59:00 - Olympia 2021: Diese Sportarten sind in Tokio dabei
2     :  wn-10      None        : 6634187876  deu  2021-07-05 19:14:00 - Olympia 2021 in Tokio: Diese Sportarten sind neu dabei
3     :  wn-10      None        : 6635213080  deu  2021-07-06 13:57:00 - Olympia 2021: Infos zu Tokio 2021 - Zeitplan und Disziplinen - WELT
-------------------------------------------------------------------
4     :  wn-18      eng-6919358 : 6638375053  eng  2021-07-08 14:52:00 - Spectators at Tokyo Olympics May Be Barred Amid New Covid Emergency
5     :  wn-18      eng-6919358 : 6638448752  eng  2021-07-08 15:47:00 - Tokyo Olympics will be held under a state of emergency as Japan mulls opening ceremony fan ban
-------------------------------------------------------------------
6     :  wn-20      eng-6928784 : 6642275749  eng  2021-07-11 14:14:00 - Tokyo Olympics 2020: 

Check the values of the specific record

In [86]:
df[df["uri"] == "6628981713"].to_dict("records")

[]

In [87]:
# ended at row: completed
selected_cls = [
    "wn-18",
    "wn-22",
    "wn-18",
    "wn-317",
    "wn-328",
    "wn-327",
    "wn-306",
    "wn-345",
    "wn-346",
    "wn-371",
    "wn-372",
    "wn-377",
    "wn-373",
    "wn-391",
    "wn-394",
    "wn-399",
    "wn-407",
    "wn-437",
    "wn-465",
    "wn-474",
    "wn-497",
    "wn-501",
    "wn-504",
    "wn-515",
    "wn-492",
    "wn-518",
    "wn-538",
]

In [88]:
# copy this for every dataset separately
manual_corr = {
    "cls_to_cls": {
        "wn-492": "wn-497",
        "wn-515": "wn-497",
        "wn-474": "wn-407",
        "wn-465": "wn-407",
    },
    "art_to_cls": {

    },
    "remove_cls": [

    ],
    "remove_art": [
        "6674541384",
        "6677797559",
        "6678297272",
        "6680048938",
    ]
}

## True Labels (Manual) Preparation

In [89]:
true_df = prepare_sample_df(df, selected_cls, manual_corr)

In [90]:
true_df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
26,12,Spectators at Tokyo Olympics May Be Barred Ami...,TOKYO -- The Japanese government declared a ne...,eng,The New York Times,2021-07-08 12:52:00,https://www.nytimes.com/2021/07/08/world/asia/...,6638375053,eng-6919358,"['olympic_games', 'japan', 'sport_climbing']",wn-1
27,13,Tokyo Olympics will be held under a state of e...,The pandemic-delayed Tokyo 2020 Olympics will ...,eng,MSN International Edition,2021-07-08 13:47:00,https://www.msn.com/en-us/news/world/tokyo-oly...,6638448752,eng-6919358,"['olympic_games', 'japan', 'sport_climbing']",wn-1
35,21,"Rock by rock, climbing set to scale Olympic wa...",Sport climbing summited the Olympic wall for T...,eng,CBC News,2021-07-12 12:38:00,https://www.cbc.ca/sports/olympics/summer/spor...,6643491279,eng-6935011,"['olympic_games', 'japan', 'sport_climbing']",wn-2
36,30,Sport climbing at Tokyo 2021: the new Olympic ...,We take an in-depth look at one of the five ne...,eng,The Telegraph,2021-07-13 11:50:00,https://www.telegraph.co.uk/olympics/0/sport-c...,6644928333,eng-6935011,"['olympic_games', 'japan', 'sport_climbing']",wn-2
552,356,Olympics Latest: Slovakia wins gold in women's...,Olympics Latest: Slovakia wins gold in women's...,eng,"The China Post, Taiwan",2021-07-29 10:54:00,https://chinapost.nownews.com/20210729-2676635,6666840728,eng-6977955,"['olympic_games', 'japan', 'sport_climbing']",wn-3
...,...,...,...,...,...,...,...,...,...,...,...
1077,714,Ob 21.00 na Kongresnem trgu sprejem za slovens...,Še pred tem bosta olimpijsko reprezentanco spr...,slv,MMC RTV Slovenija,2021-08-10 09:17:00,https://www.rtvslo.si/sport/oi-2020/ob-21-00-n...,6682186259,slv-79784,"['olympic_games', 'japan', 'sport_climbing']",wn-22
1078,719,"Sprejem za olimpijce, na Kongresnem trgu testi...",Še pred tem bosta olimpijsko reprezentanco spr...,slv,MMC RTV Slovenija,2021-08-10 12:58:00,https://www.rtvslo.si/sport/oi-2020/sprejem-za...,6682421394,slv-79784,"['olympic_games', 'japan', 'sport_climbing']",wn-22
1079,725,Sprejem za olimpijce na Kongresnem trgu ob 21.00,"Na Japonsko je odpotovalo 54 športnikov, olimp...",slv,MMC RTV Slovenija,2021-08-10 22:04:00,https://www.rtvslo.si/sport/oi-2020/sprejem-za...,6683085438,slv-79801,"['olympic_games', 'japan', 'sport_climbing']",wn-22
1080,727,Sprejem olimpijcev na Kongresnem trgu,"Na Japonsko je odpotovalo 54 športnikov, olimp...",slv,MMC RTV Slovenija,2021-08-10 23:11:00,https://www.rtvslo.si/sport/oi-2020/sprejem-ol...,6683150348,slv-79801,"['olympic_games', 'japan', 'sport_climbing']",wn-22


## Subset Evaluation

### Clustering Evaluation

In [91]:
pred_df = df[df["uri"].isin(true_df["uri"].to_list())]

#### Evaluation using the new methodology

In [92]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "clusterId")
measure_performance(articles)

{'F1': 0.7411944869831546,
 'P': 1.0,
 'R': 0.5888077858880778,
 'A': 0.9665346534653465}

#### Evaluation using Event Registry

In [93]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "eventUri")
measure_performance(articles)

{'F1': 0.5102319236016372,
 'P': 0.5807453416149069,
 'R': 0.45498783454987834,
 'A': 0.928910891089109}

### Statistics

In [94]:
true_df["lang"].value_counts()

lang
eng    63
slv    27
deu    11
Name: count, dtype: int64

In [95]:
true_df["clusterId"].value_counts()

clusterId
wn-16    19
wn-18    17
wn-14    10
wn-22     5
wn-13     4
wn-19     4
wn-12     4
wn-8      3
wn-9      3
wn-10     3
wn-11     3
wn-7      3
wn-17     3
wn-20     3
wn-21     3
wn-2      2
wn-6      2
wn-15     2
wn-5      2
wn-4      2
wn-3      2
wn-1      2
Name: count, dtype: int64

## Save Evaluation Results

In [96]:
save_df_to_csv(true_df, FILE_NAME)

In [97]:
processed_uris = processed_uris | set(true_df["uri"].to_list())

# Olympic Games - Japan - Swimming

In [98]:
FILE_NAME = "olympic_games__japan__swimming.csv"

In [100]:
df, events = load_events(FILE_NAME, processed_uris)

In [101]:
df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
11,0,Los 'CONs' tenían hasta el 5 de julio para Ins...,"Panamá. 2 de Judo. 1Ciclista, 1 Boxeadora, 3 d...",spa,La Estrella de Panamá,2021-07-01 09:05:00,https://www.laestrella.com.pa/deportes/cocteld...,6628473299,,"['olympic_games', 'japan', 'swimming']",wn-7
7,1,Olimpíadas: nadadora Viviane Jungblut supera c...,"No final de abril, quando a Confederação Brasi...",por,ISTOÉ Independente,2021-07-01 15:16:00,https://istoe.com.br/olimpiadas-nadadora-vivia...,6628941788,,"['olympic_games', 'japan', 'swimming']",wn-6
8,2,Olimpíadas: nadadora Viviane Jungblut supera c...,"No final de abril, quando a Confederação Brasi...",por,News Rondonia,2021-07-01 16:13:00,https://www.newsrondonia.com.br/noticia/176678...,6629020742,,"['olympic_games', 'japan', 'swimming']",wn-6
9,3,Olimpíadas: nadadora Viviane Jungblut supera c...,"No final de abril, quando a Confederação Brasi...",por,Gazeta Digital,2021-07-01 18:08:00,https://www.gazetadigital.com.br/editorias/esp...,6629191697,,"['olympic_games', 'japan', 'swimming']",wn-6
5,4,'Gracias por meterme en la pile': el tierno me...,Christian Eriksen: cómo sigue la recuperación ...,spa,Clarin,2021-07-01 22:54:00,https://www.clarin.com/deportes/-gracias-meter...,6629554655,,"['olympic_games', 'japan', 'swimming']",wn-4
...,...,...,...,...,...,...,...,...,...,...,...
1408,1199,Alejandra Aybar y un sueño por cumplir: conoce...,"""No tengo palabras para explicar lo importante...",spa,Récord,2021-08-18 01:40:00,https://record.acento.com.do/polideportivo/ale...,6692175621,,"['olympic_games', 'japan', 'swimming']",wn-767
1407,1200,Rommel Pacheco: Su nueva vida entre la curul y...,Leer más: Rommel Pacheco se despide de los Jue...,spa,Life and Style,2021-08-18 03:32:00,https://lifeandstyle.expansion.mx/entretenimie...,6692256275,spa-2462754,"['olympic_games', 'japan', 'swimming']",wn-766
1414,1201,"Le sexe est vital pour la puissance explosive,...",La triple médaillée d'or olympique cérébrale A...,fra,News 24,2021-08-18 20:24:00,https://news-24.fr/le-sexe-est-vital-pour-la-p...,6693282687,,"['olympic_games', 'japan', 'swimming']",wn-770
1415,1202,"Бубка, Кличко и все сильнейшие, уникальные рек...",Выдающийся украинский рекордсмен - Сергей Бубк...,rus,24 Канал,2021-08-19 13:31:00,https://sport.24tv.ua/ru/bubka-klichko-vse-sil...,6694182507,,"['olympic_games', 'japan', 'swimming']",wn-771


In [102]:
print_events(events, min_articles=2)

-------------------------------------------------------------------
1     :  wn-6       None        : 6628941788  por  2021-07-01 17:16:00 - Olimpíadas: nadadora Viviane Jungblut supera covid-19 e vai aos Jogos - ISTOÉ Independente
2     :  wn-6       None        : 6629020742  por  2021-07-01 18:13:00 - Olimpíadas: nadadora Viviane Jungblut supera covid-19 e vai aos Jogos - News Rondonia Portal de Noticias
3     :  wn-6       None        : 6629191697  por  2021-07-01 20:08:00 - Olimpíadas: nadadora Viviane Jungblut supera covid-19 e vai aos Jogos | Gazeta Digital
4     :  wn-6       None        : 6629555820  por  2021-07-02 00:55:00 - Nadadora Viviane Jungblut supera Covid-19 e vai aos Jogos Olímpicos
-------------------------------------------------------------------
5     :  wn-22      None        : 6634671737  spa  2021-07-06 02:34:00 - Estos son los y las atletas que representarán a México en Tokio 2020
6     :  wn-22      None        : 6634760730  spa  2021-07-06 04:31:00 - Conoce

Check the values of the specific record

In [103]:
df[df["uri"] == "6670609453"].to_dict("records")

[{'id': 852,
  'title': 'Dressel brilha no revezamento e leva 5º ouro em Tóquio; australiana faz história',
  'body': 'A- A+ Forte candidato a estrela dos Jogos Olímpicos, o americano Caeleb Dressel não decepcionou na piscina do Centro Aquático de Tóquio. No último dia da natação no Japão, na noite deste sábado (pelo horário brasileiro), ele conquistou mais dois ouros, totalizando cinco nesta Olimpíada. Também brilhou a australiana Emma McKeon, agora dona de sete medalhas nestes Jogos. Com cinco ouros em apenas uma edição da Olimpíada, Dressel entrou para a restrita lista de americanos com cinco títulos olímpicos numa mesma edição dos Jogos. Somente outros quatro atletas obtiveram tal feito, como os também nadadores Michael Phelps (fez isso três vezes), Mark Spitz e Matt Biondi e o patinador de velocidade Eric Heiden. Somando as conquistas do Rio-2016, em sua estreia em Olimpíadas, Dressel soma agora sete ouros. Curiosamente, toda vez que subiu ao pódio foi para figurar na posição mais

In [104]:
# ended at row: completed
selected_cls = [
    "wn-28",
    "wn-30",
    "wn-54",
    "wn-55",
    "wn-90",
    "wn-94",
    "wn-128",
    "wn-129",
    "wn-144",
    "wn-171",
    "wn-174",
    "wn-230",
    "wn-235",
    "wn-232",
    "wn-267",
    "wn-270",
    "wn-309",
    "wn-308",
    "wn-331",
    "wn-333",
    "wn-359",
    "wn-360",
    "wn-366",
    "wn-365",
    "wn-391",
    "wn-401",
    "wn-410",
    "wn-412",
    "wn-413",
    "wn-434",
    "wn-467",
    "wn-476",
    "wn-558",
    "wn-485",
    "wn-495",
    "wn-505",
    "wn-513",
    "wn-544",
    "wn-546",
    "wn-548",
    "wn-568",
    "wn-567",
    "wn-562",
    "wn-575",
    "wn-582",
    "wn-592",
    "wn-591",
    "wn-610",
    "wn-613",
    "wn-691",
    "wn-697",
    "wn-703",
    "wn-699",
]

In [105]:
# copy this for every dataset separately
manual_corr = {
    "cls_to_cls": {
        "wn-558": "wn-476",
        "wn-331": "wn-308",
    },
    "art_to_cls": {
        "6667315882": "wn-XXXXX1",
    },
    "remove_cls": [

    ],
    "remove_art": [

    ]
}

## True Labels (Manual) Preparation

In [106]:
true_df = prepare_sample_df(df, selected_cls, manual_corr)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df["clusterId"] = sample_df["clusterId"].map(cluster_mapping)


In [107]:
true_df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
37,27,On target: Driving force behind S.Korea's Olym...,Arrows away: South Korean archer Kim Je-deok (...,eng,Daily Mail Online,2021-07-07 10:26:00,https://www.dailymail.co.uk/wires/afp/article-...,6636645175,eng-6924250,"['olympic_games', 'japan', 'swimming']",wn-1
38,28,Tokyo Olympics 2020: What drives South Korea's...,"Goesan, South Korea: Bungee-jumping, baseball ...",eng,Firstpost,2021-07-07 11:13:00,https://www.firstpost.com/sports/tokyo-olympic...,6636697737,eng-6924250,"['olympic_games', 'japan', 'swimming']",wn-1
42,29,"Lana, 15 ans, espoir de médaille pour la Bosnie",La nageuse rêve de remporter la première médai...,fra,L'essentiel,2021-07-07 13:02:00,http://www.lessentiel.lu/fr/sports/autres_spor...,6636835429,,"['olympic_games', 'japan', 'swimming']",wn-2
43,30,"Bosnie: Lana Pudar, 15 ans, n'a pas de piscine...","Dans sa ville de Mostar, Lana Pudar, 15 ans, n...",fra,TV5MONDE,2021-07-07 13:37:00,https://information.tv5monde.com/info/bosnie-l...,6636883322,,"['olympic_games', 'japan', 'swimming']",wn-2
39,31,What is the driving force behind South Korea's...,South Korean archers have ruled the sport for ...,eng,Scroll.in,2021-07-07 14:05:00,https://scroll.in/field/999546/what-is-the-dri...,6636922379,eng-6924250,"['olympic_games', 'japan', 'swimming']",wn-1
...,...,...,...,...,...,...,...,...,...,...,...
1314,1131,Medallero de los Juegos Olímpicos de Tokio: as...,Estados Unidos se llevó finalmente el pulso co...,spa,El Confidencial,2021-08-09 07:15:00,https://www.elconfidencial.com/deportes/juegos...,6680744884,spa-2450343,"['olympic_games', 'japan', 'swimming']",wn-52
1324,1132,Los 10 impactos de Tokio,La edición 32 de la cita olímpica llegó a su f...,spa,Diario La Gaceta,2021-08-09 07:25:00,https://www.lagaceta.com.ar/nota/905801/deport...,6680750719,,"['olympic_games', 'japan', 'swimming']",wn-51
1312,1136,"Mismas medallas, pero menos oros",La delegación española se marcha de Tokio con ...,spa,Diari de Tarragona,2021-08-09 09:53:00,https://www.diaridetarragona.com/cat-es-mon/Mi...,6680859656,spa-2448489,"['olympic_games', 'japan', 'swimming']",wn-52
1300,1143,A medida que se cierran los Juegos Olímpicos d...,El caldero se apagó el domingo en los agotador...,spa,Nuevo Periodico,2021-08-09 15:31:00,https://nuevoperiodico.com/a-medida-que-se-cie...,6681231886,spa-2450270,"['olympic_games', 'japan', 'swimming']",wn-50


## Subset Evaluation

### Clustering Evaluation

In [108]:
pred_df = df[df["uri"].isin(true_df["uri"].to_list())]

#### Evaluation using the new methodology

In [109]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "clusterId")
measure_performance(articles)

{'F1': 0.9548802946593002,
 'P': 0.9980750721847931,
 'R': 0.9152691968225949,
 'A': 0.9973988055739881}

#### Evaluation using Event Registry

In [110]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "eventUri")
measure_performance(articles)

{'F1': 0.4860088365243004,
 'P': 0.4169298799747315,
 'R': 0.5825242718446602,
 'A': 0.9629462508294625}

### Statistics

In [111]:
true_df["lang"].value_counts()

lang
spa    129
por    106
eng     31
fra      9
Name: count, dtype: int64

In [112]:
true_df["clusterId"].value_counts()

clusterId
wn-31    23
wn-24    19
wn-20    17
wn-14    14
wn-48    13
wn-52    11
wn-36     9
wn-34     8
wn-50     8
wn-45     7
wn-41     7
wn-12     7
wn-15     7
wn-51     7
wn-47     6
wn-49     5
wn-30     5
wn-39     5
wn-27     5
wn-18     5
wn-1      4
wn-6      4
wn-46     4
wn-13     4
wn-33     4
wn-16     4
wn-25     4
wn-22     4
wn-2      3
wn-28     3
wn-4      3
wn-5      3
wn-26     3
wn-35     3
wn-21     3
wn-10     3
wn-43     2
wn-3      2
wn-7      2
wn-8      2
wn-44     2
wn-19     2
wn-42     2
wn-9      2
wn-40     2
wn-38     2
wn-37     2
wn-17     2
wn-29     2
wn-23     2
wn-11     2
wn-32     1
Name: count, dtype: int64

## Save Evaluation Results

In [113]:
save_df_to_csv(true_df, FILE_NAME)

In [114]:
processed_uris = processed_uris | set(true_df["uri"].to_list())

# Olympic Games - Japan - Table Tennis

In [115]:
FILE_NAME = "olympic_games__japan__table_tennis.csv"

In [116]:
df, events = load_events(FILE_NAME, processed_uris)

In [117]:
df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
2,0,Sathiyan Gnanasekaran's 17-year quest for Olym...,Sathiyan Gnanasekaran was first drawn to the O...,eng,Firstpost,2021-07-01 08:22:00,https://www.firstpost.com/sports/sathiyan-gnan...,6628434003,,"['olympic_games', 'japan', 'table_tennis']",wn-2
3,1,Cameroon-Info.Net,L'équipe du Cameroun sera représentée par 12 a...,fra,Cameroon-Info.Net,2021-07-01 14:56:00,http://www.cameroon-info.net/article/cameroun-...,6628911930,,"['olympic_games', 'japan', 'table_tennis']",wn-3
5,2,USTA Names Tokyo U.S. Olympic & Paralympic Ten...,"WHITE PLAINS, N.Y., July 1, 2021 -- The USTA, ...",eng,Tennis Industry Magazine,2021-07-01 18:40:00,http://www.tennisindustrymag.com/news/2021/07/...,6629237155,eng-6907570,"['olympic_games', 'japan', 'table_tennis']",wn-4
9,3,IOA likely to declare Tokyo Olympics flag bear...,"New Delhi [India], July 1 (ANI): With the Toky...",eng,Asian News International (ANI),2021-07-01 21:39:00,https://www.aninews.in/news/sports/others/ioa-...,6629469036,eng-6911655,"['olympic_games', 'japan', 'table_tennis']",wn-6
10,4,Tokyo Games: IOA Likely To Declare Name Of Ind...,"With the Tokyo Games round the corner, the Ind...",eng,NDTVSports.com,2021-07-01 23:03:00,https://sports.ndtv.com/olympics/tokyo-games-i...,6629562962,eng-6911655,"['olympic_games', 'japan', 'table_tennis']",wn-6
...,...,...,...,...,...,...,...,...,...,...,...
1771,1510,Slovenci na paraolimpijske igre v Tokiu z meša...,"Tokio, 20. avgusta (STA) - Le še nekaj dni loč...",slv,STA d.o.o.,2021-08-20 12:03:00,http://www.sta.si/http.php?id=2934265,6695479651,,"['olympic_games', 'japan', 'table_tennis']",wn-845
1772,1511,Slovenci na paralimpijske igre z mešanico izku...,"Tekmovanje, ki bo dobra dva tedna po olimpijsk...",slv,Delo,2021-08-20 12:18:00,https://www.delo.si/sport/tokio-2020/slovenci-...,6695496860,,"['olympic_games', 'japan', 'table_tennis']",wn-845
1780,1512,Шеф миссии команды ПКР Рожков: верим в достойн...,Шеф миссии сборной команды России на Паралимпи...,rus,Спорт РИА Новости,2021-08-20 12:28:00,https://rsport.ria.ru/20210820/intervyu-174650...,6695509235,,"['olympic_games', 'japan', 'table_tennis']",wn-848
1773,1513,Slovenski paraolimpijci z izkušnjami in mlados...,"Tekmovanje, ki bo dobra dva tedna po olimpijsk...",slv,MMC RTV Slovenija,2021-08-20 12:30:00,https://www.rtvslo.si/sport/paraolimpijske-igr...,6695511518,,"['olympic_games', 'japan', 'table_tennis']",wn-845


In [118]:
print_events(events, min_articles=2)

-------------------------------------------------------------------
1     :  wn-3       None        : 6628911930  fra  2021-07-01 16:56:00 - Cameroon-Info.Net
2     :  wn-3       None        : 6630067138  fra  2021-07-02 11:37:00 - Tokyo 2020 : Le Pm remet le drapeau à la Cameroon olympic team
-------------------------------------------------------------------
3     :  wn-6       eng-6911655 : 6629469036  eng  2021-07-01 23:39:00 - IOA likely to declare Tokyo Olympics flag bearers name by July 5
4     :  wn-6       eng-6911655 : 6629562962  eng  2021-07-02 01:03:00 - Tokyo Games: IOA Likely To Declare Name Of India's Flag Bearers For Olympics By July 5, Says Sources | Olympics News
5     :  wn-6       eng-6911655 : 6629760568  eng  2021-07-02 04:37:00 - Tokyo Games: IOA likely to announce name of Indian Olympic Games flag bearers by July 5, sources say | ExBulletin
-------------------------------------------------------------------
6     :  wn-8       rus-1053745 : 6629927617  rus  202

Check the values of the specific record

In [119]:
df[df["uri"] == "1425194654"].to_dict("records")

[]

In [120]:
# ended at row: completed
selected_cls = [
    "wn-6",
    "wn-8",
    "wn-13",
    "wn-18",
    "wn-53",
    "wn-108",
    "wn-111",
    "wn-124",
    "wn-137",
    "wn-132",
    "wn-150",
    "wn-168",
    "wn-179",
    "wn-178",
    "wn-176",
    "wn-184",
    "wn-188",
    "wn-192",
    "wn-195",
    "wn-181",
    "wn-199",
    "wn-205",
    "wn-241",
    "wn-231",
    "wn-253",
    "wn-276",
    "wn-279",
    "wn-291",
    "wn-294",
    "wn-306",
    "wn-311",
    "wn-314",
    "wn-315",
    "wn-324",
    "wn-364",
    "wn-378",
    "wn-376",
    "wn-420",
    "wn-428",
    "wn-451",
    "wn-458",
    "wn-470",
    "wn-511",
    "wn-517",
    "wn-571",
    "wn-603",
    "wn-612",
    "wn-620",
    "wn-643",
    "wn-645",
    "wn-682",
    "wn-747",
    "wn-758",
    "wn-795",

]

In [121]:
# copy this for every dataset separately
manual_corr = {
    "cls_to_cls": {
        "wn-758": "wn-747",
        "wn-276": "wn-179",
        "wn-199": "wn-179",
        "wn-176": "wn-179",
        "wn-184": "wn-179",
        "wn-178": "wn-179",
    },
    "art_to_cls": {
        "6658024584": "wn-XXXXX1",

        "6671187828": "wn-XXXXX2",
        "6671274940": "wn-XXXXX2",
        "6671317065": "wn-XXXXX2",

    },
    "remove_cls": [

    ],
    "remove_art": [
        "6658152000",
        "6663130390",
        "6663963629",
        "6664096693",
        "6664720185",
    ]
}

## True Labels (Manual) Preparation

In [122]:
true_df = prepare_sample_df(df, selected_cls, manual_corr)

In [123]:
true_df

Unnamed: 0,id,title,body,lang,source,dateTime,url,uri,eventUri,concepts,clusterId
9,3,IOA likely to declare Tokyo Olympics flag bear...,"New Delhi [India], July 1 (ANI): With the Toky...",eng,Asian News International (ANI),2021-07-01 21:39:00,https://www.aninews.in/news/sports/others/ioa-...,6629469036,eng-6911655,"['olympic_games', 'japan', 'table_tennis']",wn-1
10,4,Tokyo Games: IOA Likely To Declare Name Of Ind...,"With the Tokyo Games round the corner, the Ind...",eng,NDTVSports.com,2021-07-01 23:03:00,https://sports.ndtv.com/olympics/tokyo-games-i...,6629562962,eng-6911655,"['olympic_games', 'japan', 'table_tennis']",wn-1
11,5,Tokyo Games: IOA likely to announce name of In...,"With the Tokyo Games just around the corner, t...",eng,ExBulletin,2021-07-02 02:37:00,https://exbulletin.com/sports/1032380/,6629760568,eng-6911655,"['olympic_games', 'japan', 'table_tennis']",wn-1
13,7,95 спортсменов представят Казахстан на Олимпиа...,НУР-СУЛТАН. КАЗИНФОРМ - Казахстанские спортсме...,rus,kazinform,2021-07-02 06:42:00,https://www.inform.kz/ru/article/3807662,6629927617,rus-1053745,"['olympic_games', 'japan', 'table_tennis']",wn-2
14,8,Сколько спортсменов представят Казахстан на Ол...,Казахстанские спортсмены завоевали 96 олимпийс...,rus,Zakon.kz,2021-07-02 07:03:00,https://www.zakon.kz/5074473-skolko-sportsmeno...,6629941702,rus-1053745,"['olympic_games', 'japan', 'table_tennis']",wn-2
...,...,...,...,...,...,...,...,...,...,...,...
1618,1388,Chancenlos gegen China,Silber hatten die Tischtennis-Männer aus Deuts...,deu,www.prosieben.de,2021-08-06 20:00:00,https://www.prosieben.de/tv/newstime/sport/cha...,6678248145,deu-1391590,"['olympic_games', 'japan', 'table_tennis']",wn-49
1604,1400,Tischtennis-Mannschaftsfinale: Ihr letzter Ver...,Das deutsche Tischtennistrio fordert im Finale...,deu,Yahoo!,2021-08-07 03:22:00,https://de.nachrichten.yahoo.com/tischtennis-m...,6678631343,deu-1391590,"['olympic_games', 'japan', 'table_tennis']",wn-49
1619,1401,"Silber hatten die Tischtennis-Herren sicher, a...",Tokio - Die erhoffte Goldmedaille durften sich...,deu,Ad Hoc News,2021-08-07 05:06:00,https://www.ad-hoc-news.de/sport/tennis/silber...,6678690352,,"['olympic_games', 'japan', 'table_tennis']",wn-49
1698,1430,"""Pričakujem, da bom resnično užival v namiznem...",Slabe tri tedne pred začetkom paralimpijskih i...,slv,Svet24.si - Vsa resnica na enem mestu,2021-08-08 20:54:00,https://novice.svet24.si/clanek/novice/sloveni...,6680386538,,"['olympic_games', 'japan', 'table_tennis']",wn-50


## Subset Evaluation

### Clustering Evaluation

In [124]:
pred_df = df[df["uri"].isin(true_df["uri"].to_list())]

#### Evaluation using the new methodology

In [125]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "clusterId")
measure_performance(articles)

{'F1': 0.8558376998522101,
 'P': 0.9943802684982829,
 'R': 0.7511792452830188,
 'A': 0.9849409849409849}

#### Evaluation using Event Registry

In [126]:
articles = prepare_predicts(true_df, pred_df, "clusterId", "eventUri")
measure_performance(articles)

{'F1': 0.4588546679499519,
 'P': 0.4683202357563851,
 'R': 0.4497641509433962,
 'A': 0.9368728334245575}

### Statistics

In [127]:
true_df["lang"].value_counts()

lang
eng    138
por    100
deu     73
slv     25
fra     23
rus     19
Name: count, dtype: int64

In [128]:
true_df["clusterId"].value_counts()

clusterId
wn-13    75
wn-48    23
wn-49    21
wn-40    18
wn-27    17
wn-34    17
wn-36    11
wn-38    11
wn-46     9
wn-14     9
wn-6      9
wn-18     9
wn-3      9
wn-29     9
wn-45     8
wn-28     8
wn-12     8
wn-10     7
wn-16     7
wn-8      6
wn-7      6
wn-2      5
wn-35     5
wn-44     5
wn-23     5
wn-17     5
wn-15     5
wn-25     3
wn-21     3
wn-30     3
wn-31     3
wn-33     3
wn-47     3
wn-1      3
wn-42     2
wn-43     2
wn-41     2
wn-39     2
wn-50     2
wn-32     2
wn-24     2
wn-22     2
wn-20     2
wn-19     2
wn-11     2
wn-9      2
wn-5      2
wn-4      2
wn-37     1
wn-26     1
Name: count, dtype: int64

## Save Evaluation Results

In [129]:
save_df_to_csv(true_df, FILE_NAME)

In [130]:
processed_uris = processed_uris | set(true_df["uri"].to_list())