# Process Compatibility Matrix

In [27]:
import yaml
import pandas
from pathlib import Path
import os
import json
import glob
import unicodedata
import re
import random
import itertools

def load_config(yaml_config: str):
    """Parse the YAML file at ``yaml_config`` and return the loaded object."""
    with open(yaml_config) as config_file:
        return yaml.safe_load(config_file)

# Load ./config.yaml and keep only its 'data_processing' section.
config = load_config('./config.yaml')['data_processing']

In [28]:
# Read the compatibility spreadsheet named in the config and keep only the
# distinct ('OPTION CODE', 'OPTION CODE COMPAT') rows.
df_compat = (
    pandas.read_excel(
        Path(config['input_compat_data_folder'], config['input_compat_excel_filename'])
    )[['OPTION CODE', 'OPTION CODE COMPAT']]
    .drop_duplicates()
)

# Get Action Text

In [29]:

def replace_unicode_notation(match):
    """Return the character whose hex code point is captured in group 1.

    Used as the ``re.sub`` replacement callback for ``u/XXXX`` notation.
    """
    code_point = int(match.group(1), 16)
    return chr(code_point)

# Build action_json: one entry per *.txt file matched by the glob pattern,
# keyed by the upper-cased part of the file name before the first '-'.
sfi_action_files = f"{config['sfi_actions_folder']}*.txt"
action_json = {}

for filepath in glob.glob(sfi_action_files):
    # os.path.basename handles the platform's path separator, whereas
    # splitting on '/' fails for Windows-style paths.
    code = os.path.basename(filepath).split('-')[0].upper()
    with open(filepath, 'r') as f:
        text = f.read()
    # Replace every u/XXXX sequence with the character it denotes.
    cleaned_text = re.sub(r'u/([0-9A-Fa-f]{4})', replace_unicode_notation, text)
    action_json[code] = cleaned_text



In [43]:
# Shuffle the action codes, then cut the shuffled list into three disjoint
# slices for training, evaluation and testing.
# NOTE(review): the shuffle is unseeded, so the split differs on every run.
full_action_list = list(action_json)
random.shuffle(full_action_list)

train_actionlist = full_action_list[:86]  # 86 actions to use for training
eval_actionlist = full_action_list[86:91]  # 5 actions for eval during training
# The remaining actions (everything after the first 91) are held out for
# testing and are never seen by the model. The previous slice `[:91]` selected
# exactly the train + eval actions instead of the held-out ones.
test_actionlist = full_action_list[91:]

In [53]:
# Mean number of non-null 'OPTION CODE COMPAT' entries per 'OPTION CODE'.
df_compat.groupby('OPTION CODE')['OPTION CODE COMPAT'].count().mean()

29.54054054054054

# Construct Training Dataset

In [44]:
def create_cartesian(list1, list2):
    """Return a lazy iterator over every ``(a, b)`` pair, ``a`` from ``list1`` and ``b`` from ``list2``."""
    return itertools.product(list1, list2)


def construct_dataset(cartesian, action_json, df_compat):
    """Build one labelled record per ordered pair of distinct action codes.

    Args:
        cartesian: Iterable of ``(code1, code2)`` pairs.
        action_json: Mapping from action code to its action text.
        df_compat: DataFrame with ``'OPTION CODE'`` and
            ``'OPTION CODE COMPAT'`` columns.

    Returns:
        A list of dicts with keys ``text1``, ``text2``, ``code1``, ``code2``
        and ``label``. ``label`` is the number of ``df_compat`` rows whose
        ``'OPTION CODE'`` equals ``code1`` and whose ``'OPTION CODE COMPAT'``
        equals ``code2``. Pairs whose two codes are equal are skipped, as are
        pairs with a code missing from ``action_json`` (the missing key is
        printed).

    Raises:
        KeyError: If ``df_compat`` lacks either of the two columns.
    """
    # Count each (code, compat) row once up front instead of scanning the
    # whole DataFrame for every pair. Plain Python ints keep the labels
    # JSON-serialisable.
    pair_counts = {}
    for pair in zip(df_compat['OPTION CODE'], df_compat['OPTION CODE COMPAT']):
        pair_counts[pair] = pair_counts.get(pair, 0) + 1

    dataset = []
    for code1, code2 in cartesian:
        if code1 == code2:  # remove self references
            continue
        try:
            text1 = action_json[code1]
            text2 = action_json[code2]
        except KeyError as e:
            # Best effort: report the code that has no action text and move on.
            print(e)
            continue
        dataset.append({
            'text1': text1,
            'text2': text2,
            'code1': code1,
            'code2': code2,
            'label': pair_counts.get((code1, code2), 0),
        })
    return dataset


def delete_file_if_exists(filename):
    """Remove ``filename`` when it exists and print which case applied."""
    if not os.path.exists(filename):
        print(f"{filename} does not exist.")
        return

    # Delete the file
    os.remove(filename)
    print(f"{filename} has been deleted.")


# TODO: The option codes (code1/code2) are stripped from the reduced dataset so that they don't make their way into the model for training.
# I recall that the DataLoader class we have to use has an argument for fields that are not used for training.
# If we can use that, then we only need to construct one dataset.
def save_outputs(dataset, full_filename, reduced_filename):
    """Write ``dataset`` to two JSON Lines files.

    Args:
        dataset: Iterable of dicts that each contain at least the keys
            ``text1``, ``text2`` and ``label``.
        full_filename: Path that receives every record unchanged.
        reduced_filename: Path that receives only ``text1``, ``text2`` and
            ``label`` of every record.

    Returns:
        A status message naming both files.

    Both files are always created, even when ``dataset`` is empty.
    """
    delete_file_if_exists(full_filename)
    delete_file_if_exists(reduced_filename)

    # Open each output once for the whole run rather than re-opening both
    # files in append mode for every record.
    with open(full_filename, 'w') as full_file, open(reduced_filename, 'w') as reduced_file:
        for line in dataset:
            full_file.write(json.dumps(line) + "\n")
            reduced_line = {'text1': line['text1'], 'text2': line['text2'], 'label': line['label']}
            reduced_file.write(json.dumps(reduced_line) + "\n")

    return f'{full_filename} and {reduced_filename} datasets created'


def pipeline(list1, list2, action_json, df_compat, full_filename, reduced_filename):
    """Pair the codes of ``list1`` with those of ``list2``, build the dataset and save it.

    The pairs come from ``create_cartesian``, the records from
    ``construct_dataset``, and ``save_outputs`` writes them to
    ``full_filename`` and ``reduced_filename``.
    """
    code_pairs = create_cartesian(list1, list2)
    records = construct_dataset(code_pairs, action_json, df_compat)
    save_outputs(records, full_filename, reduced_filename)


# Training set: every ordered pair of distinct codes within train_actionlist.
pipeline(train_actionlist, train_actionlist, action_json, df_compat,'full_training_dataset.jsonl', 'training_dataset.jsonl')
# Evaluation set: every ordered pair of distinct codes within eval_actionlist.
pipeline(eval_actionlist, eval_actionlist, action_json, df_compat,'full_eval_dataset.jsonl', 'eval_dataset.jsonl')
# Test set: each code in test_actionlist paired with every other code in full_action_list.
pipeline(test_actionlist, full_action_list, action_json, df_compat,'full_test_dataset.jsonl', 'test_dataset.jsonl')

full_training_dataset.jsonl has been deleted.
training_dataset.jsonl has been deleted.
full_eval_dataset.jsonl has been deleted.
eval_dataset.jsonl has been deleted.
full_test_dataset.jsonl has been deleted.
test_dataset.jsonl has been deleted.
