## The Purpose of This Notebook

Please make sure you've already run the **get_candidate_labels** notebook to extract candidate labels using scispacy and our custom algorithm before running this one.

This notebook creates the training/validation data used to train the model. We divide the full content of each paper into multiple chunks with a fixed number of words. If a chunk contains any dataset title from the given training set or from our extra labels (from the **get_candidate_labels** notebook), we consider that chunk a **positive** sample; otherwise, it is a **negative** sample.

The labels can be divided into three categories (recognized based on the given dataset labels):

    1. LONG FORM
    2. LONG FORM (SHORT FORM)
    3. SHORT FORM
    
So, in the training set, if we have a dataset label in the form **"LONG FORM (SHORT FORM)"**, we should add **"LONG FORM"** and **"SHORT FORM"** into the training labels also. If the label only matches the form **"LONG FORM"**, we try to find its short form and add it into training labels.

To improve the precision of the dataset-finding process, we search for each training label in a cleaned version of the paper text, take the resulting (start, end) index, and use it to extract the original dataset label from the raw paper. For example, we can get the label "National Study-of Youth" from the raw paper if our training labels contain "national study of youth". 

After the dataset-finding process completes, we remove a sample in the following cases:

    1. Its found dataset label is lowercase
    2. Its found dataset labels come from both the train and valid labels. Note that a single sample can contain multiple dataset labels.

In [None]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from transformers import *
import pickle
from pqdm.processes import pqdm
import re
from collections import Counter
from pandarallel import pandarallel
pandarallel.initialize()

In [None]:
# Load the project settings. The file is opened in a `with` block so the
# handle is closed as soon as the JSON has been parsed.
with open("../settings.json", "rb") as settings_file:
    settings = json.load(settings_file)

# Prefix every value with "." (presumably turning "./x" style paths, written
# relative to the parent directory, into "../x" — TODO confirm against
# settings.json).
for k, v in settings.items():
    settings[k] = "." + v

In [None]:
# Read the training metadata and keep only the first row seen for each "Id".
df = pd.read_csv(f"{settings['RAW_DATA_DIR']}/train.csv").drop_duplicates(
    subset="Id", keep="first"
)

In [None]:
def generate_s_e_window_sliding(sample_len, win_size, step_size):
    """Return [start, end] index pairs of sliding windows over a sequence.

    Windows are ``win_size`` long and start ``step_size`` apart. Windows are
    generated until one reaches or passes ``sample_len``; that last window is
    then shifted back so it ends exactly at ``sample_len`` (its start is
    clamped at 0).

    Args:
        sample_len: Length of the sequence being windowed.
        win_size: Number of items per window.
        step_size: Distance between consecutive window starts.

    Returns:
        A list of ``[start, end]`` lists (end exclusive).

    Raises:
        ValueError: If ``step_size`` is not positive while more than one
            window is needed; the loop could never terminate in that case.
    """
    if step_size <= 0 and win_size < sample_len:
        raise ValueError(
            "step_size must be positive when win_size is smaller than sample_len"
        )

    start = 0
    end = win_size
    s_e = [[start, end]]
    while end < sample_len:
        start += step_size
        end = start + win_size
        s_e.append([start, end])

    # Pull the final window back by however far it overshoots the sequence,
    # never letting its start go below zero.
    last = s_e[-1]
    last[0] = max(last[0] - (last[1] - sample_len), 0)
    last[1] = sample_len
    return s_e

In [None]:
def custom_clean_text(txt, is_lower=True):
    """Lightly normalise ``txt`` for label matching.

    With ``is_lower=True`` the text is lower-cased and every occurrence of a
    non-alphanumeric character immediately followed by the literal ``()`` is
    replaced by one space; all other punctuation is kept. With
    ``is_lower=False`` the case is kept and every single non-alphanumeric
    character is replaced by one space.

    Args:
        txt: Value to normalise; it is converted with ``str()`` first.
        is_lower: Selects which of the two behaviours above is applied.

    Returns:
        The normalised string.
    """
    # Raw strings are used so that ``\(`` is a regex escape rather than an
    # invalid Python string escape (SyntaxWarning on recent Python versions).
    if is_lower:
        # NOTE(review): the pattern looks like it may have been meant as a
        # character class, but find_all_pred_in_text matches punctuation such
        # as "." and "," in this function's output, so the behaviour is kept.
        return re.sub(r'[^A-Za-z0-9]\(\)', ' ', str(txt).lower())
    else:
        return re.sub(r'[^A-Za-z0-9]', ' ', str(txt))
    

def clean_text(txt, is_lower=True):
    """Collapse every run of non-alphanumeric characters in ``txt`` to one space.

    ``txt`` is converted with ``str()`` and, when ``is_lower`` is true,
    lower-cased before the substitution. Leading/trailing spaces produced by
    the substitution are not stripped.
    """
    text = str(txt)
    if is_lower:
        text = text.lower()
    return re.sub('[^A-Za-z0-9]+', ' ', text)

In [None]:
def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the lower-cased word sets of two strings.

    Words are obtained with ``str.split()`` (whitespace separated).

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float. When neither string contains a word
        the union is empty and 0.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return len(a & b) / union_size

In [None]:
# Acronym mapping built for the training dataset: short form -> long form.
# The acronyms (short forms) are added to the training labels.

ACRONYM_MAPPING = {'adni': "alzheimer's disease neuroimaging initiative",
 'blsa': 'baltimore longitudinal study of aging',
 'cord-19': 'covid-19 open research dataset',
 'coa': 'census of agriculture',
 'charybdis': 'characterizing health associated risks and your baseline disease in sars cov 2',
 'ccd': 'nces common core of data',
 'cccsl': 'complexity science hub covid 19 control strategies list',
 'c-cap': 'coastal change analysis program',
 'nwlon': 'noaa national water level observation network',
 'slosh': 'noaa sea lake and overland surges from hurricanes',
 'ibtracs': 'international best-track archive for climate stewardship',
 'oisst': 'optimum interpolation sea surface temperature',
 'ruccs': 'rural-urban continuum codes',
 'bbs': 'north american breeding bird survey',
 'agid': 'aging integrated database',
 'niagads': 'the national institute on aging genetics of alzheimer s disease data storage site',
 'arm': 'agricultural resources management survey',
 'b&b': 'baccalaureate and beyond longitudinal study',
 'ecls': 'early childhood longitudinal study',
 'ecls-b': 'early childhood longitudinal study-birth',
 'peels': 'pre-elementary education longitudinal study',
 'nlts': 'national longitudinal transition study',
 'nlts2': 'national longitudinal transition study-2',
 'nels': 'national education longitudinal studies',
 'hsls': 'high school longitudinal study',
 'naep': 'national assessment of education progress',
 'wod': 'noaa world ocean database',
 'sdr': 'survey of doctorate recipients',
 'isdr': 'international survey of doctoral recipients',
 'sed': 'survey of earned doctorates',
 'sird': 'survey of industrial research and development',
 # 'ntps' used to be listed twice in this literal; the later value silently
 # overrode the earlier one, so that effective value is the one kept here.
 # NOTE(review): the overridden entry read
 # 'national teacher and principal survey' - confirm which long form is wanted.
 'ntps': 'national teacher principal survey',
 'piaac': 'program for the international assessment of adult competencies',
 'ricord': 'rsna international covid 19 open radiology database',
 'ssocs': 'school survey on crime and safety',
 'timss': 'trends in international mathematics and science study',
 'cels': 'citizenship education longitudinal study',
 'kels': 'korea education longitudinal study',
 'gels': 'gerontology education longitudinal study',
 'nlms': 'national longitudinal mortality study',
 'nshd': 'national survey of health and development',
 'rhsa': 'rural high school aspirations study',
 'sodb': 'southern ocean database',
 'clsa': "canadian longitudinal study of aging",
 'tlsa': "taiwan longitudinal study of aging",
 'plsa': 'polish longitudinal study of aging',
 'brdis': 'business research development and innovation survey',
 'woa/wod': 'world ocean atlas and world ocean database',
 'hses': 'high school effectiveness study',
 'nsdr': 'national survey of doctorate recipients',
 'seels': 'special education elementary longitudinal study',
 'decls': 'delaware early childhood longitudinal study',
 'ecls-k': 'early childhood longitudinal study-kindergarten',
 'wls': 'wisconsin longitudinal study',
 'arms': 'agricultural resource management survey'}

In [None]:
def remove_acronym(label):
    """Drop a trailing parenthesised token such as ``(adni)`` from ``label``.

    The label is split on whitespace; if its last token contains ``(`` the
    remaining tokens are re-joined with single spaces. Otherwise, or when the
    label holds no tokens at all (empty / whitespace only), it is returned
    unchanged.

    Args:
        label: Dataset label, e.g. ``"long form (short form)"``.

    Returns:
        The label without its final token, or ``label`` itself.
    """
    tokens = label.split()
    if tokens and "(" in tokens[-1]:
        return " ".join(tokens[:-1])
    return label

In [None]:
# Every spelling of the given training labels we want to search for:
# lower-cased, fully cleaned, without a trailing "(acronym)" token, plus the
# hand-written acronyms themselves.
train_dataset_labels = [
    *(l.lower().strip() for l in df.dataset_label.unique()),
    *(clean_text(l).strip() for l in df.dataset_label.unique()),
    *(remove_acronym(l.lower().strip()) for l in df.dataset_label.unique()),
    *ACRONYM_MAPPING,
]

In [None]:
# Extra training labels, read from the "label" column of extra_train_labels.csv.
train_extra_labels = pd.read_csv(
    f"{settings['RAW_DATA_DIR']}/extra_train_labels.csv"
).label.tolist()

In [None]:
# De-duplicated union of the given and the extra training labels.
all_train_labels = [*set(train_dataset_labels + train_extra_labels)]

In [None]:
# Extra validation labels, in both lower-cased and fully cleaned spelling.
valid_extra_labels = pd.read_csv(f"{settings['RAW_DATA_DIR']}/extra_valid_labels.csv")["label"].tolist()
valid_extra_labels = [l.lower() for l in valid_extra_labels] + [clean_text(l).strip() for l in valid_extra_labels]
# just to be sure non-overlap; a set makes each membership test O(1) instead
# of scanning the whole training-label list for every validation label.
_known_train_labels = set(all_train_labels)
valid_extra_labels = [l for l in valid_extra_labels if l not in _known_train_labels]

In [None]:
# Every label (train, extra train, extra valid) that will be searched for in
# the paper chunks, without duplicates.
all_posible_labels = [
    *set(train_dataset_labels + train_extra_labels + valid_extra_labels)
]

In [None]:
# Display how many distinct labels will be searched for.
len(all_posible_labels)

In [None]:
def find_all_pred_in_text(normed_text, all_unique_preds):
    """Find which candidate labels occur in a text chunk, in their raw spelling.

    The chunk is normalised with ``custom_clean_text`` and each candidate is
    searched in that normalised text. For every hit, the same character
    offsets are used to slice the original ``normed_text`` so the label is
    returned exactly as written in the chunk.

    Args:
        normed_text: The text chunk to search.
        all_unique_preds: Iterable of candidate label strings.

    Returns:
        ``np.unique`` of the raw substrings that were accepted (a NumPy
        array; empty when nothing was found).
    """
    normed_text_lower = custom_clean_text(normed_text)
    # NOTE(review): clean_normed_text_lower is computed but never read below.
    clean_normed_text_lower = clean_text(normed_text_lower).strip()
    # Length difference between the normalised and the original text; used
    # further down as the search radius when offsets have to be realigned.
    diff_len = abs(len(normed_text_lower) - len(normed_text))
    preds = []
    raw_preds = []
    preds_indexs = []
    # Pass 1: collect the candidates that appear in the normalised text.
    for pred in all_unique_preds:
        if len(pred.split(" ")) <= 2:
            # Candidates of at most two space-separated tokens must be
            # preceded by a space and followed by a space, ".", "," or ";".
            if (
                " " + pred + " " in normed_text_lower
                or " " + pred + "." in normed_text_lower
                or " " + pred + "," in normed_text_lower
                or " " + pred + ";" in normed_text_lower
            ) and pred != "":
                preds.append(pred)
        else:
            # Longer candidates only need to occur as a plain substring.
            if pred in normed_text_lower:
                preds.append(pred)

    # Pass 2: map every hit back to the original text.
    for pred in preds:
        if len(pred.split(" ")) <= 2:
            # NOTE(review): index() returns the FIRST occurrence of
            # " " + pred, which is not necessarily the occurrence that
            # satisfied the trailing-delimiter test in pass 1.
            start_index = normed_text_lower.index(" " + pred)
            # Skip the leading space that was part of the search string.
            start_index += 1
        else:
            start_index = normed_text_lower.index(pred)

        raw_pred = normed_text[start_index : start_index + len(pred)]
        clean_raw_pred = clean_text(raw_pred).strip()
        clean_pred = clean_text(pred).strip()

        if clean_raw_pred != clean_pred:
            # caused by lower()
            # The offsets of the two texts disagree; try nearby shifts until
            # the raw slice cleans to the same string as the candidate.
            # found_true_label is assigned but never read afterwards.
            found_true_label = False
            # NOTE(review): a negative start_index + shift_index makes the
            # slice count from the end of the string — confirm this is benign.
            for shift_index in range(-diff_len - 1, diff_len + 1):
                raw_pred_candidate = normed_text[
                    start_index + shift_index : start_index + shift_index + len(pred)
                ]
                clean_raw_pred_candidate = clean_text(raw_pred_candidate).strip()
                if clean_raw_pred_candidate == clean_pred:
                    if len(raw_pred_candidate.split(" ")) <= 2:
                        # Short matches are kept only when the raw spelling
                        # is not entirely lower-case.
                        if raw_pred_candidate.islower() is False:
                            preds_indexs.append(
                                [
                                    raw_pred_candidate,
                                    [
                                        start_index + shift_index,
                                        start_index + shift_index + len(pred),
                                    ],
                                ]
                            )
                            raw_preds.append(raw_pred_candidate)
                    else:
                        preds_indexs.append(
                            [
                                raw_pred_candidate,
                                [
                                    start_index + shift_index,
                                    start_index + shift_index + len(pred),
                                ],
                            ]
                        )
                        raw_preds.append(raw_pred_candidate)
                    found_true_label = True
                    # Stop at the first shift that realigns the slice.
                    break
        else:
            if len(raw_pred.split(" ")) <= 2:
                # acronym is not lower
                if raw_pred.islower() is False:
                    preds_indexs.append(
                        [raw_pred, [start_index, start_index + len(pred)]]
                    )
                    raw_preds.append(raw_pred)
            else:
                preds_indexs.append([raw_pred, [start_index, start_index + len(pred)]])
                raw_preds.append(raw_pred)

    # Pass 3: when the span of match i lies inside the span of match j,
    # replace match i by match j so only the enclosing label survives.
    group_idxs = []
    for i in range(len(preds_indexs)):
        for j in range(len(preds_indexs)):
            if i != j:
                start_i, end_i = preds_indexs[i][1]
                start_j, end_j = preds_indexs[j][1]
                if start_i <= end_j and end_i <= end_j and start_i >= start_j:
                    group_idxs.append([i, j])
    raw_preds = np.array(raw_preds)
    for group_idx in group_idxs:
        raw_preds[group_idx[0]] = raw_preds[group_idx[1]]
    # np.unique also removes the duplicates created by the replacement above.
    return np.unique(raw_preds)

In [None]:
# Number of space-separated words per chunk.
win_size = 200


def process(i):
    """Split one paper into word chunks and label every chunk.

    Reads the JSON file of the paper in row ``i`` of ``df``, cuts each
    section's text into overlapping windows of ``win_size`` words (stride of
    75% of the window) and searches every window for all known labels.

    Args:
        i: Positional row index into ``df``.

    Returns:
        A dict with the parallel lists ``id``, ``pub_title``, ``title``,
        ``text`` and ``label``; ``label`` holds the found labels joined with
        ``"|"`` or ``""`` when the chunk contains none.
    """
    ids = []
    titles = []
    texts = []
    labels = []
    pub_titles = []
    row = df.iloc[i]
    # Close the file handle as soon as the paper has been parsed.
    with open(f"{settings['RAW_DATA_DIR']}/train/{row.Id}.json", "rt") as paper_file:
        x = json.load(paper_file)
    step_size = int(0.75 * win_size)
    for section in x:
        raw_text = section["text"].replace("\n", " ")
        raw_text_encode = raw_text.split(" ")
        s_e = generate_s_e_window_sliding(len(raw_text_encode), win_size, step_size)
        for (s, e) in s_e:
            raw_sent = " ".join(raw_text_encode[s:e]).strip()
            found_labels = find_all_pred_in_text(raw_sent, all_posible_labels)
            ids.append(row.Id)
            pub_titles.append(row.pub_title)
            titles.append(section["section_title"])
            texts.append(raw_sent)
            labels.append("|".join(found_labels) if len(found_labels) > 0 else "")

    return {
        "id": ids,
        "pub_title": pub_titles,
        "title": titles,
        "text": texts,
        "label": labels,
    }


# Process every paper in parallel worker processes.
results = pqdm(list(range(len(df))), process, n_jobs=8)

In [None]:
# Flatten the per-paper result dicts into one flat list per column.
ids, titles, texts, labels, pub_titles = [], [], [], [], []

for result in tqdm(results):
    ids += result["id"]
    titles += result["title"]
    texts += result["text"]
    labels += result["label"]
    pub_titles += result["pub_title"]

In [None]:
# One row per chunk; column order is id, pub_title, title, text, label.
train_df = pd.DataFrame(
    {
        "id": ids,
        "pub_title": pub_titles,
        "title": titles,
        "text": texts,
        "label": labels,
    }
)

In [None]:
# Remove surrounding whitespace from every label string.
train_df["label"] = train_df["label"].map(str.strip)

In [None]:
# Drop rows that are identical in every column, keeping the first occurrence.
train_df = train_df.drop_duplicates(keep="first")

In [None]:
# Distinct label strings (possibly "|"-joined) that occur in the chunks.
all_train_labels = train_df["label"].unique()

In [None]:
# A found label that is entirely lower-case is treated as a bad match.
bad_labels = [
    l
    for tl in all_train_labels
    for l in tl.split("|")
    if l.islower()
]

In [None]:
# De-duplicate and sort the bad labels.
bad_labels = sorted(set(bad_labels))

In [None]:
# Hand-picked mixed-case spellings to discard, followed by all lower-case
# matches collected in bad_labels.
ignore_labels = [
    "Slosh", "aDNI", "Gels", "HSEs", "WLs", "iSDR",
    "ADni", "adnI", "Naep", "ECLs", "hSLS", "PeeLS", "pLSA",
    "Arms", "NTPs", "Billion-Ton-Study", "Plan-Do-Study-Act", "eCLS-B",
    *bad_labels,
]

In [None]:
def get_data_ignore(data, ignore_labels):
    """Return the rows of ``data`` whose ``label`` is not an ignored label.

    Only whole-cell matches are removed: a row whose label is a "|"-joined
    combination is dropped only if that exact combined string is listed.

    Args:
        data: DataFrame with a ``label`` column.
        ignore_labels: Iterable of label strings to discard.

    Returns:
        The filtered DataFrame (original index preserved).
    """
    # One vectorised membership test instead of re-filtering the whole frame
    # once per ignored label.
    return data[~data.label.isin(list(ignore_labels))]

In [None]:
# Drop every chunk whose whole label string is one of the ignored labels.
train_df = get_data_ignore(train_df, ignore_labels)

In [None]:
# Snapshot of the label column as a plain Python list.
original_labels = train_df["label"].tolist()

In [None]:
# Remove ignored labels from inside "|"-joined label strings.
# The ignore set is built once, and the surviving labels keep the order in
# which they appear in the original string, so the result is reproducible
# from run to run.
_ignored_label_set = set(ignore_labels)
re_labels = []

for org_label in tqdm(original_labels):
    if org_label != "":
        # dict.fromkeys de-duplicates while preserving first-seen order.
        kept_labels = [
            l for l in dict.fromkeys(org_label.split("|"))
            if l not in _ignored_label_set
        ]
        re_labels.append("|".join(kept_labels))
    else:
        re_labels.append(org_label)

In [None]:
# Replace the label column with the versions that have ignored labels removed.
train_df.label = re_labels

In [None]:
# Display the number of chunks that still carry at least one label.
np.sum(train_df.label != "")

In [None]:
import os

In [None]:
# Text and label of every chunk that carries at least one label.
texts, labels = (
    train_df.loc[train_df.label != "", column].tolist()
    for column in ("text", "label")
)

In [None]:
# Make sure the output directory exists before anything is written to it.
os.makedirs(settings['PROCESSED_DATA_DIR'], exist_ok=True)

In [None]:
# Chunks that carry two or more labels (a "|" in the label string) are set
# aside in their own CSV. The helper flag column is the last column and is
# left out of the file.
train_df["is_multiple_label"] = train_df.label.map(lambda label: "|" in label)
train_df.loc[train_df.is_multiple_label, train_df.columns[:-1]].to_csv(
    f"{settings['PROCESSED_DATA_DIR']}/test_positive_sampled.csv", index=False
)

In [None]:
# Keep only the chunks with at most one label and drop the helper flag
# column (the last column).
train_df = train_df.loc[~train_df.is_multiple_label, train_df.columns[:-1]]

In [None]:
# Split into labelled (positive) and unlabelled (negative) chunks.
# Explicit copies are taken because a "group" column is assigned to both
# frames afterwards; assigning to a filtered slice triggers pandas'
# SettingWithCopyWarning.
train_df_positive = train_df[train_df.label != ""].copy()
train_df_negative = train_df[train_df.label == ""].copy()

In [None]:
# Every negative chunk belongs to group 0.
train_df_negative["group"] = [0] * len(train_df_negative)

In [None]:
# One group id per entry of all_posible_labels; id 0 is reserved for the
# empty label. The stripped and the cleaned spelling of a label share an id.
label_to_idx = {'': 0}
all_groups = ['']
for idx, k in enumerate(all_posible_labels, start=1):
    label_to_idx[k.strip()] = idx
    label_to_idx[clean_text(k).strip()] = idx
    all_groups.append(k)
# Leave idx at the next free id, as a manually incremented counter would.
idx = len(all_posible_labels) + 1

In [None]:
# Look up the group id of every positive chunk via the cleaned form of its
# label (multi-label rows were removed from train_df before this point).
# parallel_apply is pandarallel's parallel counterpart of Series.apply.
train_df_positive["group"] = train_df_positive.label.parallel_apply(
    lambda x: label_to_idx[clean_text(x).strip()])

In [None]:
# Validation labels: a fresh list holding the extra validation labels.
valid_labels = list(valid_extra_labels)

In [None]:
# Group ids that belong to the validation labels; display how many there are.
val_groups = list(map(label_to_idx.__getitem__, valid_labels))
len(val_groups)

In [None]:
# Positive chunks whose group is a validation group go to validation; all
# other positive chunks go to training.
val_df_split = train_df_positive.loc[train_df_positive.group.isin(val_groups)]
train_df_split = train_df_positive.loc[~train_df_positive.group.isin(val_groups)]

In [None]:
# Display the number of positive chunks assigned to the training split.
len(train_df_split)

In [None]:
# Display the number of positive chunks assigned to the validation split.
len(val_df_split)

In [None]:
# Collect every individual label that occurs in each split.
all_valid_labels = []
for row_label in tqdm(val_df_split.label):
    all_valid_labels.extend(row_label.split("|"))

all_train_labels = []
for row_label in tqdm(train_df_split.label):
    all_train_labels.extend(row_label.split("|"))

In [None]:
# De-duplicate the collected labels and drop the empty label.
all_valid_labels = list(set(all_valid_labels) - {''})
all_train_labels = list(set(all_train_labels) - {''})

In [None]:
# Display the number of distinct labels found in the validation split.
len(all_valid_labels)

In [None]:
# Display the number of distinct labels found in the training split.
len(all_train_labels)

In [None]:
# remove train samples that include valid labels
all_train_texts = train_df_split["text"].parallel_apply(lambda x: clean_text(x)).tolist()


def _contains_any_phrase(text, phrases):
    """Return True if any non-empty phrase occurs in ``text`` as whole words.

    ``text`` and ``phrases`` are expected to be ``clean_text`` output, i.e.
    words separated by single spaces. The text is padded with a space on both
    sides so a phrase at the very start or end of the text is matched too.
    """
    padded = f" {text} "
    return any(f" {phrase} " in padded for phrase in phrases if phrase)


def check_exist(text, labels):
    """Return True if the cleaned ``text`` contains any of ``labels``.

    Each label is cleaned with ``clean_text`` and stripped before matching.
    """
    return _contains_any_phrase(text, [clean_text(l).strip() for l in labels])


# Clean the labels once instead of once per text.
cleaned_valid_labels = [clean_text(l).strip() for l in all_valid_labels]
train_choosen_idxs = [
    i
    for i, text in enumerate(tqdm(all_train_texts))
    if not _contains_any_phrase(text, cleaned_valid_labels)
]
train_df_split = train_df_split.iloc[train_choosen_idxs]
len(train_df_split)

In [None]:
# remove val samples that include train labels
all_val_texts = val_df_split["text"].parallel_apply(lambda x: clean_text(x)).tolist()


def _contains_any_phrase(text, phrases):
    """Return True if any non-empty phrase occurs in ``text`` as whole words.

    ``text`` and ``phrases`` are expected to be ``clean_text`` output, i.e.
    words separated by single spaces. The text is padded with a space on both
    sides so a phrase at the very start or end of the text is matched too.
    """
    padded = f" {text} "
    return any(f" {phrase} " in padded for phrase in phrases if phrase)


def check_exist(text, labels):
    """Return True if the cleaned ``text`` contains any of ``labels``.

    Each label is cleaned with ``clean_text`` and stripped before matching.
    """
    return _contains_any_phrase(text, [clean_text(l).strip() for l in labels])


# Clean the labels once instead of once per text.
cleaned_train_labels = [clean_text(l).strip() for l in all_train_labels]
val_choosen_idxs = [
    i
    for i, text in enumerate(tqdm(all_val_texts))
    if not _contains_any_phrase(text, cleaned_train_labels)
]
val_df_split = val_df_split.iloc[val_choosen_idxs]
len(val_df_split)

In [None]:
# Negative chunks: a fixed-seed random 75 % go to training, the remaining
# 25 % go to validation.
train_df_negative_split = train_df_negative.sample(random_state=200, frac=0.75)
val_df_negative_split = train_df_negative.loc[
    ~train_df_negative.index.isin(train_df_negative_split.index)
]

In [None]:
# Append the negative chunks below the positive chunks of each split.
train_df_split = pd.concat((train_df_split, train_df_negative_split), axis=0)
val_df_split = pd.concat((val_df_split, val_df_negative_split), axis=0)

In [None]:
# Distinct non-empty labels that remain in each split.
all_train_labels = [*(set(train_df_split.label.unique()) - {""})]
all_valid_labels = [*(set(val_df_split.label.unique()) - {""})]

In [None]:
# Reload the multi-label positive chunks written to test_positive_sampled.csv
# earlier in this notebook.
multiple_positive_df = pd.read_csv(f"{settings['PROCESSED_DATA_DIR']}/test_positive_sampled.csv")

In [None]:
def is_in_train(x):
    """Return True only if every "|"-separated label in ``x`` is a train label.

    Each label is stripped before it is looked up in ``all_train_labels``.
    """
    return all(l.strip() in all_train_labels for l in x.split("|"))

def is_in_valid(x):
    """Return True only if every "|"-separated label in ``x`` is a valid label.

    Each label is stripped before it is looked up in ``all_valid_labels``.
    """
    return all(l.strip() in all_valid_labels for l in x.split("|"))

In [None]:
# Flag the multi-label chunks whose labels all belong to one split.
multiple_positive_df["in_train"] = multiple_positive_df.label.apply(is_in_train)
multiple_positive_df["in_valid"] = multiple_positive_df.label.apply(is_in_valid)

In [None]:
# Multi-label chunks usable for training / for validation.
multiple_positive_df_train = multiple_positive_df[multiple_positive_df.in_train.eq(True)]
multiple_positive_df_valid = multiple_positive_df[multiple_positive_df.in_valid.eq(True)]

In [None]:
# Append the usable multi-label chunks to their split.
train_df_split = pd.concat((train_df_split, multiple_positive_df_train), axis=0)
val_df_split = pd.concat((val_df_split, multiple_positive_df_valid), axis=0)

In [None]:
# Drop the last two columns and replace missing values with empty strings.
train_df_split = train_df_split.iloc[:, :-2].fillna("")
val_df_split = val_df_split.iloc[:, :-2].fillna("")

In [None]:
def replace_group(x):
    """Return the integer group id for ``x``.

    An empty string (the group of samples that contain >= 2 labels) maps to
    the group id 10000; any other value is converted with ``int()``.
    """
    return 10000 if x == "" else int(x)

In [None]:
# Convert the training group column to integer ids.
train_df_split["group"] = train_df_split.group.apply(replace_group)

In [None]:
# Convert the validation group column to integer ids.
val_df_split["group"] = val_df_split.group.apply(replace_group)

In [None]:
# Write both splits. The validation file keeps every column; the training
# file is written without its last column.
val_df_split.to_csv(
    f"{settings['PROCESSED_DATA_DIR']}/val_sampled.csv", index=False
)
train_df_split.iloc[:, :-1].to_csv(
    f"{settings['PROCESSED_DATA_DIR']}/train_sampled.csv", index=False
)