In [11]:
import json
import itertools
from pathlib import Path

import numpy as np
import pandas as pd

from arguments import PreprocessArgs
from preprocess import Preprocessor

train_data_path = Path("./dataset/train.csv")

# Read the raw training CSV and remove two columns by name before the frame
# is used anywhere else in the notebook.
train_data = (
    pd.read_csv(train_data_path)
    .drop(columns=["Unnamed: 6", "total no.: 7987"])
)

## Inspection

In [2]:
# Show the loaded frame; for a table this long pandas renders only the
# leading and trailing rows, with an ellipsis row in between.
train_data

Unnamed: 0,id,q,r,s,q',r'
0,8,"""It can go both ways . We all doubt . It is wh...","""True .""",AGREE,"""It can go both ways . We all doubt . It is wh...","""True ."""
1,8,"""It can go both ways . We all doubt . It is wh...","""True .""",AGREE,"""can go both ways . We all doubt . It is what ...","""True"""
2,8,"""It can go both ways . We all doubt . It is wh...","""True .""",AGREE,"""It can go both ways . We all doubt . It is wh...","""True"""
3,9,"""once again , you seem to support the killing ...","""based on the idea that people are dispensible...",AGREE,"""seem to support the killing of certain people""","""based on the idea that people are dispensible..."
4,9,"""once again , you seem to support the killing ...","""based on the idea that people are dispensible...",AGREE,"""you seem to support the killing of certain pe...","""based on the idea that people are dispensible"""
...,...,...,...,...,...,...
38341,10001,"""good thing this argument has never been done ...","""And teen sex does n't , by the very nature of...",DISAGREE,"""You are much better off making theft legal an...","""And teen sex does n't , by the very nature of..."
38342,10002,"""I know one thing , anything that happens , po...","""Was n't sinjin crowing about his plans to tak...",DISAGREE,"""I know one thing , anything that happens , po...","""Was n't sinjin crowing about his plans to tak..."
38343,10002,"""I know one thing , anything that happens , po...","""Was n't sinjin crowing about his plans to tak...",DISAGREE,"""FBI Arrests Three Men in Terror Plot that Tar...","""Was n't sinjin crowing about his plans to tak..."
38344,10003,"""I enjoy Botany more than most things and I ha...","""Hi Smallax , welcome to the forum . I did a s...",AGREE,"""I enjoy Botany more than most things and I ha...","""Hi Smallax , welcome to the forum . I did a s..."


## Preprocess Class

In [10]:
# Scheme options declared on PreprocessArgs, bound to short module-level names.
# Labeling scheme suffix:
#   1: use the same class for q' and r'
#   2: use different classes (e.g., I-q and I-r) for q' and r'
input_schemes, output_schemes, labeling_schemes = (
    PreprocessArgs.input_schemes,
    PreprocessArgs.output_schemes,
    PreprocessArgs.labeling_schemes,
)

# Initial configuration for the preprocessor built in this cell.
initial_settings = {
    "use_nltk": False,
    "model_tokenizer_name": "bert-base-uncased",
    "input_scheme": "qr",
    "output_scheme": "q'r'",
    "labeling_scheme": "IO1",
}
args = PreprocessArgs(**initial_settings)

preprocessor = Preprocessor(args)

In [16]:
from tqdm import tqdm

# Only the first `debug_sample_size` rows are preprocessed for each combination.
debug_sample_size = 100

# Materialise every (input, output, labeling) combination up front so tqdm
# knows how many iterations there are. Wrapping the bare enumerate(...)
# generator gives tqdm no length, so it can only print a running count
# ("16it") instead of a progress bar with an ETA.
scheme_combinations = list(
    itertools.product(input_schemes, output_schemes, labeling_schemes)
)

for i, (ip, op, lbl) in enumerate(tqdm(scheme_combinations)):
    args = PreprocessArgs(
        use_nltk=False,
        model_tokenizer_name="bert-base-uncased",
        input_scheme=ip,
        output_scheme=op,
        labeling_scheme=lbl
    )
    # Reconfigure the shared preprocessor in place rather than building a new one.
    preprocessor.set_args(args)

    p_data = preprocessor(train_data.iloc[:debug_sample_size])

    # One output directory per combination: ./dataset/p0, ./dataset/p1, ...
    save_path = Path(f"./dataset/p{i}")
    save_path.mkdir(parents=True, exist_ok=True)

    # Write the processed rows together with the arguments that produced them.
    p_data.to_csv(save_path / "data.csv", index=False)
    (save_path / "pargs.json").write_text(json.dumps(vars(args), indent=4))

16it [00:02,  5.72it/s]


In [None]:
# Pull the five text/label columns out of the training frame once.
Q, R, S, QP, RP = [train_data[field] for field in ["q", "r", "s", "q'", "r'"]]

# Row whose formatted encoding is printed for every scheme combination.
index = 15

# Same iteration order as three nested loops (input -> output -> labeling),
# written with itertools.product to match the export cell above.
for input_scheme, output_scheme, labeling_scheme in itertools.product(
    PreprocessArgs.input_schemes,
    PreprocessArgs.output_schemes,
    PreprocessArgs.labeling_schemes,
):
    args = PreprocessArgs(
        use_nltk=False,
        model_tokenizer_name="bert-base-uncased",
        input_scheme=input_scheme,
        output_scheme=output_scheme,
        labeling_scheme=labeling_scheme
    )
    print(f"\nScheme: {input_scheme} / {output_scheme} / {labeling_scheme}\n")
    preprocessor.set_args(args)

    # Tokenize after set_args so the current configuration is the one in effect.
    q = preprocessor.model_tokenize(Q[index])
    r = preprocessor.model_tokenize(R[index])
    s = S[index]
    qp = preprocessor.label_sequence(q, preprocessor.model_tokenize(QP[index]))
    rp = preprocessor.label_sequence(r, preprocessor.model_tokenize(RP[index]))

    X, y = preprocessor.format_data(q, r, s, qp, rp)

    # y is either a (y_cls, y_seq) pair or the sequence labels alone.
    # isinstance also accepts tuple subclasses, unlike an exact type comparison.
    if isinstance(y, tuple):
        y_cls, y_seq = y
        print(f"y_cls: {y_cls}")
    else:
        y_seq = y

    # NOTE(review): every non-zero label is selected below; if format_data can
    # emit ignore-index values such as -100, those positions are shown too -- confirm.
    print(f"X: {X} -> {' '.join(preprocessor.model_tokenizer.convert_ids_to_tokens(X))}")
    print(f"y: {y_seq} -> {' '.join(preprocessor.model_tokenizer.convert_ids_to_tokens(np.array(X)[np.array(y_seq) != 0]))}")

## Unit Test

In [None]:
# Inputs for add_labels_by_two; each list is run through the method and the
# results are collected for manual inspection (no expected values are asserted).
by_two_cases = [
    [0, 0, 1, 1, 0],
    [1, 1, 1, 1, 1, 1],
    [0, 0, 0, 0],
    [0, 1, 0, 1, 1],
    [0, 1, 0, 0, 1]
]

by_two_answers = list(map(preprocessor.add_labels_by_two, by_two_cases))

# Inputs for add_B_to_labels.
add_B_cases = [
    # 1 -- presumably labeling scheme 1 (a single non-zero label value); confirm
    [-100, 0, 1, 1, -100, 1, 1, 0, -100],
    [-100, 0, 0, 0, -100, 0, 1, 0, -100, -100, -100],
    [-100, 1, 1, 1, -100, 1, 0, 1, -100],
    # 2 -- presumably labeling scheme 2 (two distinct non-zero label values); confirm
    [-100, 1, 1, 0, -100, 3, 3, 3, -100, -100, -100],
    [-100, 1, 0, 1, 1, -100, 3, 0, 3, -100],
    [-100, 1, 0, 1, -100, 0, 3, 3, -100]
]

# Keep the results under their own name, mirroring by_two_cases / by_two_answers,
# so the inputs stay available for side-by-side comparison with the outputs.
add_B_answers = list(map(preprocessor.add_B_to_labels, add_B_cases))