### Need to measure variety of data created by PPO.

In [159]:
# Test Data

import torch
from _language import InputLang, OutputLang

N_CHARS = len(InputLang.chars)

random_chars = (N_CHARS * torch.rand(100)).int()

one_char = torch.ones(100)

half_and_half = torch.zeros(100)
half_and_half[50:] = torch.ones(50)
half_and_half[:50] = (N_CHARS * torch.rand(50)).int()

three = torch.zeros(100)
three[:30] = torch.ones(30)
three[30:60] = 2*torch.ones(30)
three[60:] = 4*torch.ones(40)

fake_count = torch.tensor([35,  27, 17,  7,  7, 1, 0,  1, 1,  2,  1,  1,  0])
fake_count.sum()

tensor(100)

### standard deviation of char count

In [141]:
(
    one_char.float().std(),
    three.float().std(),
    random_chars.float().std(),
    half_and_half.float().std(),
    fake_count.float().std()
)

(tensor(0.), tensor(1.2910), tensor(3.7340), tensor(3.6052), tensor(11.4774))

### max count - min count

In [144]:
random_chars.unique(return_counts=True)[1].max() - random_chars.unique(return_counts=True)[1].min()

tensor(9)

In [143]:
three.unique(return_counts=True)[1].max() - three.unique(return_counts=True)[1].min()

tensor(10)

In [167]:
fake_count.max() - fake_count.min()

tensor(35)

In [145]:
half_and_half.unique(return_counts=True)[1].max() - half_and_half.unique(return_counts=True)[1].min()

tensor(50)

### max count

In [169]:
(
    random_chars.unique(return_counts=True)[1].max(),
    fake_count.max(),
    three.unique(return_counts=True)[1].max(),
    half_and_half.unique(return_counts=True)[1].max()
)

(tensor(13), tensor(35), tensor(40), tensor(55))

### Sum top 2 counts

In [181]:
(
    random_chars.unique(return_counts=True)[1].topk(2).values.sum(),
    fake_count.topk(2).values.sum(),
    half_and_half.unique(return_counts=True)[1].topk(2).values.sum(),
    three.unique(return_counts=True)[1].topk(2).values.sum()
)

(tensor(23), tensor(62), tensor(62), tensor(70))

In [183]:
(
    random_chars.unique(return_counts=True)[1].topk(3).values.sum(),
    half_and_half.unique(return_counts=True)[1].topk(3).values.sum(),
    fake_count.topk(3).values.sum(),
    three.unique(return_counts=True)[1].topk(3).values.sum()
)

(tensor(33), tensor(68), tensor(79), tensor(100))

### entropy

In [146]:
from scipy.stats import entropy

In [215]:
(
    entropy(fake_count, base=2),
    entropy(half_and_half, base=2),
    entropy(random_chars, base=2),
    entropy(three, base=2),
    entropy(one_char, base=2),
)

(2.4568891361592278,
 5.981557972869691,
 6.337212874388087,
 6.44578455096824,
 6.643854906175126)

### Impliment for batches

In [216]:
batch = (N_CHARS * torch.rand(100, InputLang.maxlen)).int()
batch_lens = (InputLang.maxlen * torch.rand(100)).int()

In [235]:
batch[:, 0]

tensor([ 8, 12, 12,  9,  0,  7, 11, 11,  0,  1,  7,  3, 12,  4,  9,  5,  3,  4,
         7,  9,  6,  8,  1, 12,  0,  4,  4,  3,  6, 10,  5,  4, 10,  0,  8, 10,
         1,  0, 10,  8, 11, 11, 12,  6,  2,  4, 11,  3, 10, 10,  5,  9,  2,  9,
         2,  3,  9,  6,  8,  1,  2,  4,  5,  0, 10,  0,  0,  4,  9,  2,  9,  0,
        12,  0, 11,  0, 10,  7,  9,  2, 10,  4,  5, 10,  9, 11,  5, 12, 12,  2,
         7,  7,  1,  1,  2,  8,  3,  4,  6,  7], dtype=torch.int32)

In [237]:
batch[:, 0].unique(return_counts=True)

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=torch.int32),
 tensor([11,  6,  8,  6, 10,  6,  5,  7,  6, 10, 10,  7,  8]))

In [220]:
batch.unique(return_counts=True, dim=0)

(tensor([[ 0,  1,  5,  4,  0,  0, 12],
         [ 0,  1,  9,  6,  3,  9,  5],
         [ 0,  2,  0,  3,  4,  1,  2],
         [ 0,  2, 12,  3,  1,  6,  9],
         [ 0,  4,  2,  3,  1,  1, 11],
         [ 0,  4,  3,  6,  6,  5,  7],
         [ 0,  5,  1, 11, 12, 12,  9],
         [ 0, 10,  3,  8,  1,  8,  0],
         [ 0, 11,  0,  6, 10,  2,  2],
         [ 0, 11,  1, 12,  1,  2,  5],
         [ 0, 11,  3,  0, 11,  5, 11],
         [ 1,  0, 12, 11, 11,  0, 12],
         [ 1,  2,  7, 11,  4,  6,  8],
         [ 1,  7, 11,  6, 12,  7,  4],
         [ 1, 11,  6, 12,  6,  5,  9],
         [ 1, 11,  7,  3, 11,  7, 11],
         [ 1, 12,  7,  8,  1,  8, 10],
         [ 2,  3, 10,  3,  5,  3,  6],
         [ 2,  4,  0, 11, 10, 10,  3],
         [ 2,  4,  2,  4,  8, 11, 12],
         [ 2,  5,  0,  3,  7,  8, 11],
         [ 2,  5,  7,  4, 12,  5,  6],
         [ 2,  6,  6,  7,  2,  3,  3],
         [ 2,  7,  6,  4,  3,  8,  7],
         [ 2,  9,  0,  8,  4,  9, 12],
         [ 3,  0,  0,  4,

In [224]:
bu, bc = batch.unique(return_counts=True, dim=0)

In [236]:
bu[:,0].unique(return_counts=True)

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=torch.int32),
 tensor([11,  6,  8,  6, 10,  6,  5,  7,  6, 10, 10,  7,  8]))

In [233]:
bu[:,0][:11]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)

In [214]:
help(torch.unique)

Help on function unique in module torch.functional:

unique(input, sorted=True, return_inverse=False, return_counts=False, dim=None)
    Returns the unique elements of the input tensor.
    
    Arguments:
        input (Tensor): the input tensor
        sorted (bool): Whether to sort the unique elements in ascending order
            before returning as output.
        return_inverse (bool): Whether to also return the indices for where
            elements in the original input ended up in the returned unique list.
        return_counts (bool): Whether to also return the counts for each unique
            element.
        dim (int): the dimension to apply unique. If ``None``, the unique of the
            flattened input is returned. default: ``None``
    
    Returns:
        (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing
    
            - **output** (*Tensor*): the output list of unique scalar elements.
            - **inverse_indices** (*

In [239]:
InputLang

_language.InputLang

In [243]:
InputLang.tensor_to_str(torch.ones(3), torch.tensor([2]))

'11'