In [1]:
import json
import itertools
from pathlib import Path

import numpy as np
import pandas as pd

from utils import load_json
from arguments import PreprocessArgs
from preprocess import Preprocessor

# Read the raw training CSV and discard the two stray columns it carries
# ("Unnamed: 6" and "total no.: 7987").
data_path = Path("./dataset/train.csv")
data = pd.read_csv(data_path).drop(columns=["Unnamed: 6", "total no.: 7987"])

## Inspection

In [None]:
# Remove the literal double quotes wrapped around the text fields.
# The vectorised per-column `.str.strip` replaces `DataFrame.applymap`, which is
# deprecated since pandas 2.1 and whose lambda raised on missing (NaN) cells;
# `.str.strip` leaves missing cells untouched.
text_columns = ["q", "r", "s", "q'", "r'"]
data.loc[:, text_columns] = data[text_columns].apply(lambda column: column.str.strip('"'))
data

In [None]:
from typing import List
from tqdm import tqdm

def get_labeled_span_indices(ref: str, ans: str) -> List[List[int]]:
    """Greedily locate the pieces of ``ans`` inside ``ref``.

    Starting at the beginning of ``ans``, the longest remaining prefix that
    occurs verbatim in ``ref`` is matched (first occurrence, via ``str.find``),
    recorded as a character span of ``ref``, and consumed; the procedure then
    repeats on the rest of ``ans``.

    Args:
        ref: Reference text that is searched.
        ans: Answer text, expected to be built from substrings of ``ref``.

    Returns:
        One ``[start, end]`` pair (``end`` exclusive) per matched piece, in the
        order the pieces occur in ``ans``. Characters of ``ans`` that do not
        occur in ``ref`` at all are skipped. Empty when nothing matches.
    """
    spans = []
    ans_len = len(ans)
    start = 0
    while start < ans_len:
        # Try the longest candidate first and shrink it from the right.
        # `end` never reaches `start`, so the candidate is never empty.
        for end in range(ans_len, start, -1):
            piece = ans[start:end]
            ref_start = ref.find(piece)
            if ref_start != -1:
                spans.append([ref_start, ref_start + len(piece)])
                start = end
                break
        else:
            # Not even the single character ans[start] occurs in ref.
            # Shrinking further would yield "", which `find` always "matches"
            # at 0 without consuming anything (an endless loop), so skip it.
            start += 1
    return spans

# For each of the two text fields, collect the matched spans of every row:
# the answer column is the field name followed by an apostrophe (q -> q').
labeled_span_indices_l = {
    field: [
        get_labeled_span_indices(row[field], row[field + "'"])
        for _, row in tqdm(data.iterrows())
    ]
    for field in ('q', 'r')
}

In [None]:
# Per field, count the rows whose answer was matched as exactly one span,
# i.e. the answer is a single contiguous substring of the reference.
consecutive_count = {
    field: sum(1 for spans in labeled_span_indices_l[field] if len(spans) == 1)
    for field in ('q', 'r')
}

consecutive_count

In [None]:
from typing import Tuple

def label_sequence(ref: str, ans: str, ref_offsets: List[Tuple[int, int]]):
    """Mark which tokens of ``ref`` are covered by the answer ``ans``.

    Args:
        ref: Reference text.
        ans: Answer text; its pieces are located in ``ref`` with
            ``get_labeled_span_indices``.
        ref_offsets: ``(start, end)`` character offsets of each token of
            ``ref``, in increasing order.

    Returns:
        A list with one entry per token: 1 when the token lies entirely inside
        one of the matched spans, 0 otherwise.
    """
    labels = [0] * len(ref_offsets)
    # Spans come back in the order of `ans`; the single left-to-right sweep
    # below needs them in the order of `ref`.
    labeled_spans = sorted(get_labeled_span_indices(ref, ans))

    if not labeled_spans:
        return labels

    cur = 0
    for i, ref_offset in enumerate(ref_offsets):
        tok_start, tok_end = ref_offset[0], ref_offset[1]

        # Drop every span this token has already run past. The same token is
        # then tested against the next span, so the first token of a span that
        # directly follows another one is not lost.
        while cur < len(labeled_spans):
            span_start, span_end = labeled_spans[cur]
            if tok_start >= span_start and tok_end > span_end:
                cur += 1
            else:
                break
        if cur >= len(labeled_spans):
            break

        span_start, span_end = labeled_spans[cur]
        if span_start <= tok_start and tok_end <= span_end:
            labels[i] = 1
        # Otherwise the token starts left of the current span: keep label 0.
        # NOTE(review): a zero-width offset such as (0, 0) counts as "inside" a
        # span that starts at 0 - confirm whether special tokens can reach here.

    return labels

## Preprocess Class

In [None]:
# Scheme names exposed as class attributes of PreprocessArgs, read here for
# reference.
input_schemes = PreprocessArgs.input_schemes
output_schemes = PreprocessArgs.output_schemes
# Labeling schemes -
# 1: use the same class for q' and r' / 2: use different classes (e.g., I-q and I-r) for q' and r'
labeling_schemes = PreprocessArgs.labeling_schemes

# Configuration used to build the preprocessor for the cells below.
# NOTE(review): the meaning of each option is defined in arguments.py /
# preprocess.py, which are not visible from this notebook.
args = PreprocessArgs(
    use_nltk=False,
    model_tokenizer_name="bert-base-uncased",
    input_scheme="qr",
    output_scheme="q'r'",
    labeling_scheme="IO1"
)

preprocessor = Preprocessor(args)

In [None]:
# Run the configured preprocessor over the whole table.
# NOTE(review): the structure of `p_data` is decided by Preprocessor.__call__,
# which is not visible here.
p_data = preprocessor(data)

In [None]:
# Token-level labels of every row, grouped by the field they belong to.
ys = {
    'q': list(),
    'r': list()
}
# Matched character spans of every row, grouped by field as well. This keeps the
# {"q": [...], "r": [...]} layout this name already has in the span-inspection
# cell; rebinding it to one flat list mixed both fields together and made that
# earlier per-field counting cell fail when re-run.
labeled_span_indices_l = {
    'q': list(),
    'r': list()
}

for field in ['q', 'r']:
    for i, row in tqdm(data.iterrows()):
        ref = row[field]
        ans = row[field + "'"]  # answer column: field name plus an apostrophe
        # Only the second element of the tokenizer result (the offsets) is used.
        _, ref_offset = preprocessor.model_tokenize(ref)
        labeled_span_indices = get_labeled_span_indices(ref, ans)
        labeled_span_indices_l[field].append(labeled_span_indices)
        labels = label_sequence(ref, ans, ref_offset)
        ys[field].append(labels)

In [None]:
# Show every row whose id equals 9.
data.loc[data["id"] == 9]

In [None]:
# Peek at the token labels of four consecutive 'q' entries (positions 3 to 6).
ys['q'][3:7]

In [None]:
# Text columns as separate Series, looked up by row label below.
Q, R, S, QP, RP = (data[column] for column in ("q", "r", "s", "q'", "r'"))

# The single row that is pushed through every scheme combination.
index = 15

# Every (input, output, labeling) scheme combination, in the same order the
# equivalent triple-nested loop would visit them.
scheme_grid = itertools.product(
    PreprocessArgs.input_schemes,
    PreprocessArgs.output_schemes,
    PreprocessArgs.labeling_schemes,
)

for input_scheme, output_scheme, labeling_scheme in scheme_grid:
    args = PreprocessArgs(
        use_nltk=False,
        model_tokenizer_name="bert-base-uncased",
        input_scheme=input_scheme,
        output_scheme=output_scheme,
        labeling_scheme=labeling_scheme
    )
    print(f"\nScheme: {input_scheme} / {output_scheme} / {labeling_scheme}\n")
    preprocessor.set_args(args)

    # Tokenize the inputs and label them against their tokenized answers.
    q = preprocessor.model_tokenize(Q[index])
    r = preprocessor.model_tokenize(R[index])
    s = S[index]
    qp = preprocessor.label_sequence(q, preprocessor.model_tokenize(QP[index]))
    rp = preprocessor.label_sequence(r, preprocessor.model_tokenize(RP[index]))

    X, y = preprocessor.format_data(q, r, s, qp, rp)

    # When the target is a tuple it holds a classification target followed by
    # the sequence target; otherwise it is the sequence target itself.
    y_seq = y
    if type(y) is tuple:
        y_cls, y_seq = y
        print(f"y_cls: {y_cls}")

    tokenizer = preprocessor.model_tokenizer
    input_tokens = ' '.join(tokenizer.convert_ids_to_tokens(X))
    print(f"X: {X} -> {input_tokens}")

    # Keep only the input ids whose sequence label is non-zero.
    labeled_ids = np.array(X)[np.array(y_seq) != 0]
    labeled_tokens = ' '.join(tokenizer.convert_ids_to_tokens(labeled_ids))
    print(f"y: {y_seq} -> {labeled_tokens}")

## Splitting

In [2]:
# Load the stored id split. Going by the file name it was made per id,
# stratified by `s`, with 0.6 / 0.2 / 0.2 train / valid / test shares and seed 42
# (presumably - confirm against the script that produced the file).
split_ids = load_json(file="./dataset/splitIds__splitBy-id_stratifyBy-s_train-0.6_valid-0.2_test-0.2_seed-42.json")

# Disabled: materialise the three splits as frames and check that together
# they cover `data` exactly.
# train_data, valid_data, test_data = [data[data.id.isin(split_ids[split])] for split in ["train", "valid", "test"]]
# assert len(train_data) + len(valid_data) + len(test_data) == len(data)

In [None]:
# Dump the annotated answers (id, q', r') as a CSV without index or header row.
answers_path = Path("./dataset/our_all_answers.csv")
our_all_answers = data[["id", "q'", "r'"]]
our_all_answers.to_csv(answers_path, index=False, header=False)

In [None]:
# Baseline "prediction": read the dumped answers back and, for every id, keep
# one of its rows drawn at random.
ans_df = pd.read_csv(Path("./dataset/our_all_answers.csv"), names=["id", "q'", "r'"])
# The draw is seeded: without `random_state` every run of this cell wrote a
# different predictions file, so results based on it could not be reproduced.
pred_df = ans_df.groupby("id").sample(n=1, random_state=42)
pred_df.to_csv(Path("./dataset/our_all_predictions_allqr.csv"), index=False, header=False)

### Cross-Validation Splits for Our Testing

In [59]:
from typing import List, Dict
from sklearn.model_selection import StratifiedKFold

def check_splits(splits: List[Dict[str, List[int]]], cv_ids: List[int]):
    """Sanity-check a list of cross-validation folds.

    Asserts that in every fold the "train" and "valid" ids together equal
    ``cv_ids`` once sorted, and that the "valid" ids of all folds pooled
    together equal ``cv_ids`` once sorted as well.

    Raises:
        AssertionError: If either condition is violated.
    """
    pooled_valid_ids = []
    for fold in splits:
        fold_ids = sorted(fold["train"] + fold["valid"])
        assert fold_ids == cv_ids
        pooled_valid_ids.extend(fold["valid"])

    pooled_valid_ids.sort()
    assert pooled_valid_ids == cv_ids

def make_cv_splits(cv_ids: List[int], cv_data: pd.DataFrame, n_splits: int, seed: int) -> List[Dict[str, List[int]]]:
    """Build one shuffled, stratified K-fold partition of ``cv_ids``.

    Args:
        cv_ids: Ids to partition. Must be sorted: each fold is compared against
            this list after sorting, and the stratification labels come from a
            ``groupby("id")``, which orders groups by id.
        cv_data: Rows of the ids in ``cv_ids``; the first non-null ``s`` value of
            each id is the stratification label.
        n_splits: Number of folds.
        seed: Random state of the shuffling.

    Returns:
        ``n_splits`` dicts, each with the sorted "train" and "valid" ids of one
        fold.
    """
    folder = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    print(f"Currently making CV splits of seed {seed}: \n{folder}")

    strat_labels = cv_data.groupby("id").first().s.values

    folds = []
    for train_positions, valid_positions in folder.split(X=cv_ids, y=strat_labels):
        # The splitter yields positions into cv_ids; translate them back to ids.
        fold = {
            "train": sorted(cv_ids[pos] for pos in train_positions),
            "valid": sorted(cv_ids[pos] for pos in valid_positions),
        }
        assert sorted(fold["train"] + fold["valid"]) == cv_ids
        folds.append(fold)

    check_splits(folds, cv_ids)
    return folds

In [62]:
# Merge train & valid splits for cross-validation; the test ids stay held out.
cv_ids = sorted(split_ids["train"] + split_ids["valid"])
assert set(cv_ids).isdisjoint(split_ids["test"]), "cross-validation ids overlap the test split"
cv_data = data[data.id.isin(cv_ids)]

# Make 10 different cross validation splits (one per seed, n_splits folds each)
n_splits = 5
filepath_prefix = f"./dataset/cross_validation/our_testing/splitIds__nsplits-{n_splits}"
# Create the output folder up front: `write_text` does not create missing
# directories and would fail with FileNotFoundError on a fresh checkout.
Path(filepath_prefix).parent.mkdir(parents=True, exist_ok=True)
seeds = range(10)
for seed in seeds:
    splits = make_cv_splits(cv_ids, cv_data, n_splits, seed)

    filepath = f"{filepath_prefix}_seed-{seed}.json"
    Path(filepath).write_text(data=json.dumps(obj=splits))

Currently making CV splits of seed 0: 
StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
Currently making CV splits of seed 1: 
StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
Currently making CV splits of seed 2: 
StratifiedKFold(n_splits=5, random_state=2, shuffle=True)
Currently making CV splits of seed 3: 
StratifiedKFold(n_splits=5, random_state=3, shuffle=True)
Currently making CV splits of seed 4: 
StratifiedKFold(n_splits=5, random_state=4, shuffle=True)
Currently making CV splits of seed 5: 
StratifiedKFold(n_splits=5, random_state=5, shuffle=True)
Currently making CV splits of seed 6: 
StratifiedKFold(n_splits=5, random_state=6, shuffle=True)
Currently making CV splits of seed 7: 
StratifiedKFold(n_splits=5, random_state=7, shuffle=True)
Currently making CV splits of seed 8: 
StratifiedKFold(n_splits=5, random_state=8, shuffle=True)
Currently making CV splits of seed 9: 
StratifiedKFold(n_splits=5, random_state=9, shuffle=True)


### Cross-Validation Splits for Final Submissions

In [None]:
# TODO: not implemented yet.
# Merge train, valid, and test splits for final cross-validation

# Make 10 different cross validation splits