In [26]:
import pandas as pd
import random

# Load the inflection items from the "data" sheet. As the preview below shows,
# each row carries a root, one correct form, three incorrect forms, the affix
# type, and the prefix/infix/suffix fields.
inflections = pd.read_excel(
    "inflections.xlsx",
    sheet_name="data",
)
inflections.head()

Unnamed: 0,id,root,correct,incorrect1,incorrect2,incorrect3,affix_type,prefix,infix,suffix
0,1,inom,uminom,inom,ininom,kakainom,prefix,um,,
1,2,aral,mag-aral,aral,pag-aral,ma-aral,prefix,mag,,
2,3,basketbol,nagbasketbol,basketbol,magbasketbol,nabasketbol,prefix,nag,,
3,4,pasyal,mamasyal,pasyal,pinasyal,pasyalan,prefix,ma,,
4,5,ayos,inayos,ayos,umayos,iayos,prefix,in,,


In [27]:
def prepare_mcq_outputs(text_en, text_tl, mcq_options, row=None, kwargs=None):
    """Fill the English and Tagalog question templates and bundle them with the options.

    Args:
        text_en: English template; its ``str.format`` placeholders are filled
            from ``row`` and ``kwargs``.
        text_tl: Tagalog template, filled from the same fields.
        mcq_options: Answer options, stored on the prompt as-is (not copied).
        row: Mapping of field name to value. Anything that supports ``**``
            unpacking works (a dict or a pandas Series row). Defaults to no
            fields.
        kwargs: Extra format fields. Defaults to no fields.

    Returns:
        ``{"prompts": [prompt]}`` where ``prompt`` has the keys ``text_en``,
        ``text_tl`` and ``mcq_options``.

    Raises:
        KeyError: If a template names a field found in neither mapping.
        TypeError: If the same field name appears in both ``row`` and ``kwargs``.
    """
    # None sentinels replace the shared mutable ``{}`` defaults. The check is
    # ``is None`` rather than ``row or {}`` because truth-testing a pandas
    # Series raises.
    row = {} if row is None else row
    kwargs = {} if kwargs is None else kwargs

    prompt = {
        "text_en": text_en.format(**row, **kwargs),
        "text_tl": text_tl.format(**row, **kwargs),
        "mcq_options": mcq_options,
    }
    return {"prompts": [prompt]}

In [34]:
def create_mcq_affixation(row, affix_type):
    """Build the multiple-choice affix-identification prompt for one row.

    Args:
        row: Mapping (dict or pandas Series row) providing ``correct``,
            ``incorrect1``, ``incorrect2``, ``incorrect3`` and whichever of
            ``prefix`` / ``infix`` / ``suffix`` the chosen template uses.
        affix_type: One of ``"prefix"``, ``"suffix"``, ``"infix"`` or
            ``"circumfix"``.

    Returns:
        The dict produced by ``prepare_mcq_outputs``: ``{"prompts": [...]}``
        with the filled English/Tagalog question and the answer options.

    Raises:
        ValueError: If ``affix_type`` is not one of the four supported values.
    """
    # (English template, Tagalog template) per affix type.
    templates = {
        "prefix": (
            'Which option has the prefix "{prefix}-"?',
            'Alin sa mga sumusunod ang may unlaping "{prefix}-"?',
        ),
        "suffix": (
            'Which option has the suffix "-{suffix}"?',
            'Alin sa mga sumusunod ang may hulaping "-{suffix}"?',
        ),
        "infix": (
            'Which option has the infix "-{infix}-"?',
            'Alin sa mga sumusunod ang may gitlaping "-{infix}-"?',
        ),
        "circumfix": (
            'Which option has the circumfix "{prefix}-" and "-{suffix}"?',
            'Alin sa mga sumusunod ang may kabilangang-laping "{prefix}-" at "-{suffix}"?',
        ),
    }

    # The isinstance guard keeps non-string values (e.g. NaN from an empty
    # cell, or an unhashable object) on the ValueError path.
    if not isinstance(affix_type, str) or affix_type not in templates:
        # The message lists every accepted value, including 'circumfix'.
        raise ValueError(
            "Invalid affix type. Choose from 'prefix', 'suffix', 'infix', or 'circumfix'."
        )
    text_en, text_tl = templates[affix_type]

    mcq_options = {
        key: row[key]
        for key in ("correct", "incorrect1", "incorrect2", "incorrect3")
    }

    return prepare_mcq_outputs(text_en, text_tl, mcq_options, row=row, kwargs={})

In [35]:
# Position of the correct answer among the four choices -> answer letter.
int2label = {0: "A", 1: "B", 2: "C", 3: "D"}


def create_mcq_dataset(dataset, seed=None):
    """Turn every row of ``dataset`` into one multiple-choice affixation item.

    For the row at position ``i`` the correct answer is placed at slot
    ``i % 4`` (so labels cycle A, B, C, D, A, ...) and the three incorrect
    options are shuffled into the remaining slots.

    Args:
        dataset: DataFrame whose rows provide ``affix_type`` plus the fields
            ``create_mcq_affixation`` reads.
        seed: Optional seed for the distractor shuffle. When ``None``
            (default) the module-level ``random`` generator is used, so a
            prior ``random.seed(...)`` call still controls the order.

    Returns:
        DataFrame with columns ``category``, ``subcategory``, ``prompts`` and
        ``label``. Each ``prompts`` cell is a one-element list whose dict holds
        the question texts, ``mcq_options`` and ``choice1``..``choice4``.
    """
    rng = random if seed is None else random.Random(seed)
    choice_keys = ("choice1", "choice2", "choice3", "choice4")

    # Collect plain records and build the frame once; growing a DataFrame with
    # pd.concat inside the loop is quadratic in the number of rows.
    records = []
    for position, (_, row) in enumerate(dataset.iterrows()):
        affix_type = row["affix_type"]
        prompts = create_mcq_affixation(row, affix_type)["prompts"]
        prompt = prompts[0]
        mcq_options = prompt["mcq_options"]

        label_index = position % len(int2label)
        options = [
            mcq_options["incorrect1"],
            mcq_options["incorrect2"],
            mcq_options["incorrect3"],
        ]
        rng.shuffle(options)
        # Insert after shuffling so the correct answer lands exactly on the
        # slot named by the label.
        options.insert(label_index, mcq_options["correct"])
        prompt.update(zip(choice_keys, options))

        records.append({
            "category": "affixation",
            "subcategory": affix_type,
            "prompts": prompts,
            "label": int2label[label_index],
        })

    # Explicit columns keep the schema stable even when ``dataset`` is empty.
    return pd.DataFrame(records, columns=["category", "subcategory", "prompts", "label"])

# create_mcq_dataset shuffles the incorrect options with the module-level
# `random` generator; seed it here so the exported file is identical on every
# Restart & Run All.
random.seed(0)

mcq_dataset = create_mcq_dataset(inflections)
mcq_dataset.to_json(
    "data/mcq_affixation_dataset.jsonl",
    lines=True,
    orient="records",
    force_ascii=False,  # keep non-ASCII characters unescaped in the JSONL
)
mcq_dataset.head()

Unnamed: 0,category,subcategory,prompts,label
0,affixation,prefix,"[{'text_en': 'Which option has the prefix ""um-...",A
1,affixation,prefix,"[{'text_en': 'Which option has the prefix ""mag...",B
2,affixation,prefix,"[{'text_en': 'Which option has the prefix ""nag...",C
3,affixation,prefix,"[{'text_en': 'Which option has the prefix ""ma-...",D
4,affixation,prefix,"[{'text_en': 'Which option has the prefix ""in-...",A


In [16]:
def prepare_gen_outputs(text_en, text_tl, label, row=None, kwargs=None):
    """Fill the English and Tagalog instruction templates and attach the expected answer.

    Args:
        text_en: English template; its ``str.format`` placeholders are filled
            from ``row`` and ``kwargs``.
        text_tl: Tagalog template, filled from the same fields.
        label: Expected answer, returned unchanged under ``"label"``.
        row: Mapping of field name to value. Anything that supports ``**``
            unpacking works (a dict or a pandas Series row). Defaults to no
            fields.
        kwargs: Extra format fields. Defaults to no fields.

    Returns:
        ``{"prompts": [{"text_en": ..., "text_tl": ...}], "label": label}``.

    Raises:
        KeyError: If a template names a field found in neither mapping.
        TypeError: If the same field name appears in both ``row`` and ``kwargs``.
    """
    # None sentinels replace the shared mutable ``{}`` defaults. The check is
    # ``is None`` rather than ``row or {}`` because truth-testing a pandas
    # Series raises.
    row = {} if row is None else row
    kwargs = {} if kwargs is None else kwargs

    prompt = {
        "text_en": text_en.format(**row, **kwargs),
        "text_tl": text_tl.format(**row, **kwargs),
    }
    return {"prompts": [prompt], "label": label}

In [22]:
def create_gen_affixation(row, affix_type):
    """Build the generative (open-ended) affixation prompt for one row.

    Args:
        row: Mapping (dict or pandas Series row) providing ``root``,
            ``correct`` and whichever of ``prefix`` / ``infix`` / ``suffix``
            the chosen template uses.
        affix_type: One of ``"prefix"``, ``"suffix"``, ``"infix"`` or
            ``"circumfix"``.

    Returns:
        The dict produced by ``prepare_gen_outputs``: the filled
        English/Tagalog instruction under ``"prompts"`` and ``row["correct"]``
        under ``"label"``.

    Raises:
        ValueError: If ``affix_type`` is not one of the four supported values.
    """
    # (English template, Tagalog template) per affix type. Every instruction
    # is imperative and ends with a period, including the circumfix one.
    templates = {
        "prefix": (
            'Inflect the word "{root}" to use the prefix "{prefix}-".',
            'Ikabit ang unlaping "{prefix}-" sa salitang "{root}".',
        ),
        "suffix": (
            'Inflect the word "{root}" to use the suffix "-{suffix}".',
            'Ikabit ang hulaping "-{suffix}" sa salitang "{root}".',
        ),
        "infix": (
            'Inflect the word "{root}" to use the infix "-{infix}-".',
            'Ikabit ang gitlaping "-{infix}-" sa salitang "{root}".',
        ),
        "circumfix": (
            'Inflect the word "{root}" to use the circumfix "{prefix}-" and "-{suffix}".',
            'Ikabit ang kabilangang-laping "{prefix}-" at "-{suffix}" sa salitang "{root}".',
        ),
    }

    # The isinstance guard keeps non-string values (e.g. NaN from an empty
    # cell, or an unhashable object) on the ValueError path.
    if not isinstance(affix_type, str) or affix_type not in templates:
        # The message lists every accepted value, including 'circumfix'.
        raise ValueError(
            "Invalid affix type. Choose from 'prefix', 'suffix', 'infix', or 'circumfix'."
        )
    text_en, text_tl = templates[affix_type]

    label = row["correct"]
    return prepare_gen_outputs(text_en, text_tl, label, row=row)

# Quick check on a hand-written prefix item; a plain dict stands in for a
# DataFrame row here.
create_gen_affixation(
    {"root": "inom", "prefix": "um", "correct": "uminom"},
    affix_type="prefix",
)

{'prompts': [{'text_en': 'Inflect the word "inom" to use the prefix "um-".',
   'text_tl': 'Ikabit ang unlaping "um-" sa salitang "inom".'}],
 'label': 'uminom'}

In [25]:
def create_gen_dataset(dataset):
    """Turn every row of ``dataset`` into one generative affixation item.

    Args:
        dataset: DataFrame whose rows provide ``affix_type`` plus the fields
            ``create_gen_affixation`` reads.

    Returns:
        DataFrame with columns ``category``, ``subcategory``, ``prompts`` and
        ``label``, one row per input row, indexed 0..n-1.
    """
    # Collect plain records and build the frame once; growing a DataFrame with
    # pd.concat inside the loop is quadratic in the number of rows.
    records = []
    for _, row in dataset.iterrows():
        affix_type = row["affix_type"]
        outputs = create_gen_affixation(row, affix_type)
        records.append({
            "category": "affixation",
            "subcategory": affix_type,
            "prompts": outputs["prompts"],
            "label": outputs["label"],
        })

    # Explicit columns keep the schema stable even when ``dataset`` is empty.
    return pd.DataFrame(records, columns=["category", "subcategory", "prompts", "label"])

# Build the generative items from the loaded sheet, export them as JSON Lines
# (one record per line), and preview the first rows.
gen_dataset = create_gen_dataset(inflections)
gen_dataset.to_json(
    "data/gen_affixation_dataset.jsonl",
    lines=True,
    orient="records",
    force_ascii=False,
)
gen_dataset.head()

Unnamed: 0,category,subcategory,prompts,label
0,affixation,prefix,"[{'text_en': 'Inflect the word ""inom"" to use t...",uminom
1,affixation,prefix,"[{'text_en': 'Inflect the word ""aral"" to use t...",mag-aral
2,affixation,prefix,"[{'text_en': 'Inflect the word ""basketbol"" to ...",nagbasketbol
3,affixation,prefix,"[{'text_en': 'Inflect the word ""pasyal"" to use...",mamasyal
4,affixation,prefix,"[{'text_en': 'Inflect the word ""ayos"" to use t...",inayos
