In [1]:
from itertools import combinations
import random
import numpy as np

In [2]:
# Input word lists; each file is read and split into one word per line.
train_pth = 'data/split_data/train.txt'
test_pth = 'data/split_data/test.txt'

# Output locations, given as (encoded-inputs file, labels file) pairs.
train_out_pth = (
    'data/processed_data/train_x.npy',
    'data/processed_data/train_y.npy',
)
test_out_pth = (
    'data/processed_data/test_x.npy',
    'data/processed_data/test_y.npy',
)

max_unique = 18   # largest distinct-letter count for which index subsets are precomputed
max_len = 32      # length of every encoded word vector
output_size = 26  # length of every label vector (one slot per letter, offset from 'a')
max_ret = 50      # cap on the number of masked variants kept per word

# Fix the stdlib RNG so the random choice of masked variants is repeatable.
random.seed(42)

In [3]:
# Enumerate every non-empty subset of the indices 0..n-1, smallest subsets first.
def get_permutations(n):
    """Return all non-empty combinations of ``range(n)`` as a list of tuples.

    Despite the name, the result contains combinations, not permutations:
    each tuple lists its indices in ascending order, and tuples are grouped
    by size (all 1-tuples, then all 2-tuples, ... up to the single n-tuple).

    Args:
        n: number of indices to choose from.

    Returns:
        A list of ``2**n - 1`` index tuples (an empty list when ``n`` is 0).
    """
    indices = range(n)
    return [
        subset
        for size in range(1, n + 1)
        for subset in combinations(indices, size)
    ]

# Lookup table indexed by distinct-letter count: all_combs[k] holds every
# non-empty index subset of range(k) for k = 1..max_unique. Slot 0 is a None
# placeholder so that the count can be used as the list index directly.
all_combs = [None] + [get_permutations(n) for n in range(1, max_unique + 1)]

In [4]:
def apply_opr(word, opr):
    """Mask a chosen set of letters in ``word`` and encode the result.

    Args:
        word: the word to mask.
        opr: iterable of indices into the word's distinct letters, taken in
            order of first occurrence. Every occurrence of each selected
            letter is masked.

    Returns:
        ``(enc_word, label)`` where ``enc_word`` is a list of ``max_len``
        ints (0 for unused trailing positions, 27 for a masked position,
        otherwise the letter's offset from ``'a'`` plus one) and ``label`` is
        a list of ``output_size`` ints with a 1 at each masked letter's
        offset from ``'a'``.

    NOTE(review): the index arithmetic presumably expects lowercase a-z and
    ``len(word) <= max_len``; nothing here validates that -- confirm against
    the input data.
    """
    unique_letters = list(dict.fromkeys(word))
    base = ord('a')

    # Mark each selected letter in the label and blank it out of the word.
    label = [0] * output_size
    for idx in opr:
        hidden = unique_letters[idx]
        label[ord(hidden) - base] = 1
        word = word.replace(hidden, '_')

    # Encode position by position; slots past the end of the word stay 0.
    enc_word = [0] * max_len
    for pos, ch in enumerate(word):
        enc_word[pos] = 27 if ch == '_' else ord(ch) - base + 1

    return enc_word, label

In [5]:
def process_word(word):
    """Generate up to ``max_ret`` randomly chosen masked variants of ``word``.

    The candidate masks are the precomputed index subsets in
    ``all_combs[k]``, where ``k`` is the number of distinct letters in
    ``word``. A random selection of them (without replacement) is passed to
    ``apply_opr`` one at a time.

    Args:
        word: the word to expand into training samples.

    Returns:
        ``(x, y)``: two equally long lists holding the encoded masked words
        and their label vectors. Both are empty for an empty word.

    Raises:
        IndexError: if the word has more distinct letters than ``all_combs``
            has entries for.
    """
    # Only the count of distinct letters is needed to pick the table entry.
    n_unique = len(set(word))
    if n_unique == 0:
        # Nothing to mask; slot 0 of the table is only a None placeholder.
        return [], []

    candidates = all_combs[n_unique]
    # random.sample draws without replacement and leaves the shared table
    # untouched. Shuffling the module-level list in place on every call
    # would permute the whole table to keep a few entries and would make
    # the output depend on how often this function ran before.
    chosen = random.sample(candidates, min(len(candidates), max_ret))

    x = []
    y = []
    for opr in chosen:
        enc_word, label = apply_opr(word, opr)
        x.append(enc_word)
        y.append(label)

    return x, y

In [6]:
def process_words(words):
    """Expand every word into masked samples and average labels of duplicates.

    Each word is expanded with ``process_word``. Samples whose encoded input
    is identical are grouped, and every member of a group receives the
    element-wise mean of the group's label vectors. The number of samples is
    unchanged: an input that occurred k times is emitted k times.

    Args:
        words: iterable of words.

    Returns:
        ``(allx, ally)``: a list of encoded inputs and a parallel list of
        averaged label vectors, grouped by input in first-seen order.
    """
    samples = []
    labels = []
    for word in words:
        word_x, word_y = process_word(word)
        samples.extend(word_x)
        labels.extend(word_y)

    # Bucket the labels by encoded input; tuples are used because lists are
    # not hashable. Dict insertion order keeps the first-seen ordering.
    by_input = {}
    for enc, lab in zip(samples, labels):
        by_input.setdefault(tuple(enc), []).append(lab)

    allx = []
    ally = []
    for key, group in by_input.items():
        mean_label = list(np.array(group).mean(axis=0))
        # One output row per original occurrence, all sharing the mean label.
        allx.extend(list(key) for _ in group)
        ally.extend([mean_label] * len(group))

    return allx, ally

In [7]:
def process_dataset(path, out_path):
    """Read a word list, build its samples and save them as two ``.npy`` files.

    Args:
        path: text file whose lines are the words to process.
        out_path: indexable pair; element 0 is where the encoded inputs are
            saved and element 1 is where the labels are saved.
    """
    with open(path) as src:
        raw_text = src.read()
    words = raw_text.splitlines()

    samples, targets = process_words(words)

    # Inputs are written first, then labels.
    np.save(out_path[0], np.array(samples))
    np.save(out_path[1], np.array(targets))

In [8]:
# Build and save the encoded arrays for each split: train first, then test.
process_dataset(path=train_pth, out_path=train_out_pth)
process_dataset(path=test_pth, out_path=test_out_pth)