In [None]:
import json
import pandas as pd
import numpy as np
import ast
import spacy
import re
from collections import deque
import random
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
import copy
import os

# Mount Google Drive at /content/drive so later cells can read/write files there.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — this cell will fail elsewhere; confirm before running locally.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Base directory from which the data paths in this notebook are built (f'{dir}/...').
# NOTE(review): the name shadows Python's built-in `dir()`, and while it is the
# empty string those paths start at the filesystem root (e.g. '/type_ids.json').
# Set it to the dataset folder before running the cells below.
dir = ''

In [None]:
# Load the spaCy pipeline "en_core_web_sm" and keep it in `nlp`.
# NOTE(review): `nlp` is not referenced by the other cells visible in this
# notebook — confirm it is still needed before paying the load cost.
nlp = spacy.load("en_core_web_sm")

# Fix the seeds of the random number generators this notebook relies on
def set_seed(seed: int = 42):
    """Seed Python's ``random`` module and NumPy's global random state.

    Args:
        seed: value passed to both ``random.seed`` and ``np.random.seed``.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


set_seed(seed=42)

In [None]:
def _read_json(path):
    """Open `path` for reading and return its parsed JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Inputs for the cells below, all read from the base directory `dir`
type_ids = _read_json(f'{dir}/type_ids.json')
type_id_samples_list = _read_json(f'{dir}/type_id_samples_list.json')
token_length_cumulative_relative_frequency = _read_json(
    f'{dir}/token_length_cumulative_relative_frequency.json'
)

In [None]:
# Candidate type indices: 0..18, minus the position of "UnknownType" in type_ids
unknown_index = type_ids.index("UnknownType")
numbers = list(range(19))
numbers.remove(unknown_index)

# Randomise the order in place (driven by the seeded `random` module)
random.shuffle(numbers)

# Cut the shuffled indices into consecutive groups: the first six, the next six,
# and whatever remains (six entries, since one of the 19 indices was removed)
split1, split2, split3 = numbers[:6], numbers[6:12], numbers[12:]

for label, members in (("Split 1:", split1), ("Split 2:", split2), ("Split 3:", split3)):
    print(label, members)

Split 1: [0, 18, 10, 7, 9, 13]
Split 2: [1, 8, 5, 2, 15, 6]
Split 3: [11, 16, 12, 14, 17, 3, 4]


In [None]:
# Train test split
task_sample_ratio = 0.003
validation_sample_ratio = 0.1


def _draw_task_and_validation_samples(type_indices):
    """Draw disjoint task and validation samples for each given type index.

    For every index, the samples stored under
    ``type_id_samples_list[type_ids[index]]`` are shuffled and sliced: the
    leading ``task_sample_ratio`` share becomes the task sample and the
    following ``validation_sample_ratio`` share becomes the validation sample.

    Args:
        type_indices: list of positions into ``type_ids``.

    Returns:
        Tuple ``(task_samples, validation_samples)``; both are dicts keyed by
        type index whose values are lists of samples.
    """
    task_samples = {}
    validation_samples = {}
    for type_index in tqdm(type_indices):
        # Shuffle a copy: random.shuffle works in place, and shuffling the
        # loaded list itself would reorder `type_id_samples_list` for every
        # later split and for every re-run of this cell.
        shuffled = list(type_id_samples_list[type_ids[type_index]])
        random.shuffle(shuffled)

        # int() truncates, so a type with few samples can get an empty slice.
        task_sample_number = int(len(shuffled) * task_sample_ratio)
        validation_sample_number = int(len(shuffled) * validation_sample_ratio)
        validation_end = task_sample_number + validation_sample_number

        task_samples[type_index] = shuffled[:task_sample_number]
        validation_samples[type_index] = shuffled[task_sample_number:validation_end]
    return task_samples, validation_samples


# Define the base directory (identical for all three splits, so built once)
output_dir = f'{dir}/tag_set_extension/{len(type_ids)}_way/{task_sample_ratio}'

# Create the directory and all necessary parents if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

for letter, split in zip(["a", "b", "c"], [split1, split2, split3]):
    # Training side: every index in 0..18 that is not held out by this split.
    # NOTE(review): this also covers any index that was never placed in a
    # split (presumably the "UnknownType" index) — confirm that is intended.
    (task_set_type_id_list_of_samples,
     validation_set_type_id_list_of_samples) = _draw_task_and_validation_samples(
        [i for i in range(19) if i not in split]
    )

    # Test side: only the type indices held out by this split.
    (test_task_set_type_id_list_of_samples,
     test_validation_set_type_id_list_of_samples) = _draw_task_and_validation_samples(split)

    # One JSON file per sample set, prefixed with the split letter.
    # json.dump writes the integer type-index keys as strings.
    sample_sets = {
        'task_set_type_id_list_of_samples': task_set_type_id_list_of_samples,
        'validation_set_type_id_list_of_samples': validation_set_type_id_list_of_samples,
        'test_task_set_type_id_list_of_samples': test_task_set_type_id_list_of_samples,
        'test_validation_set_type_id_list_of_samples': test_validation_set_type_id_list_of_samples,
    }
    for set_name, samples_by_type in sample_sets.items():
        with open(f'{output_dir}/{letter}_{set_name}.json', 'w') as f:
            json.dump(samples_by_type, f)

In [None]:
# Release the Colab runtime once the preceding cells have finished.
# NOTE(review): Colab-only; this presumably disconnects the VM, so anything not
# already written to persistent storage would be lost — confirm before relying on it.
from google.colab import runtime

runtime.unassign()