In [70]:
from PIL import Image
import pandas as pd
from statistics import mean, stdev

In [3]:
im_path = "../data/sprite/101_default_front_default.png"
shiny_path = "../data/sprite/101_default_front_shiny.png"

im = Image.open(im_path)
shiny_im = Image.open(shiny_path)
im.show()

In [27]:
def get_pixel_set(im:Image) -> set[tuple[int, int, int]]:
    x, y = im.size
    return { im.getpixel((i,j)) for i in range(x) for j in range(y) }

pix_set = get_pixel_set(im)
print(pix_set)

{(217, 225, 244), (15, 15, 15), (100, 108, 132), (86, 66, 35), (186, 196, 216), (197, 156, 63), (143, 149, 183), (55, 57, 64), (255, 255, 255), (152, 120, 46), (221, 193, 38)}


In [28]:
sorted(pix_set)

[(15, 15, 15),
 (55, 57, 64),
 (86, 66, 35),
 (100, 108, 132),
 (143, 149, 183),
 (152, 120, 46),
 (186, 196, 216),
 (197, 156, 63),
 (217, 225, 244),
 (221, 193, 38),
 (255, 255, 255)]

In [23]:
csv_path = "../data/data.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,image,mask,id,type_1,type_2,is_legendary,is_mythical,generation,egg_group_0,egg_group_1
0,sprite/1_default_front_default.png,masks/1_default_front_default_mask.png,1,grass,poison,False,False,generation-i,monster,plant
1,sprite/1_default_front_shiny.png,masks/1_default_front_default_mask.png,1,grass,poison,False,False,generation-i,monster,plant
2,sprite/1_generation-iii_emerald_front_default.png,masks/1_generation-iii_emerald_front_default_m...,1,grass,poison,False,False,generation-i,monster,plant
3,sprite/1_generation-iii_emerald_front_shiny.png,masks/1_generation-iii_emerald_front_default_m...,1,grass,poison,False,False,generation-i,monster,plant
4,sprite/1_generation-iii_firered-leafgreen_fron...,masks/1_generation-iii_firered-leafgreen_front...,1,grass,poison,False,False,generation-i,monster,plant


In [26]:
pixel_set_lengths = []

for path in df["image"]:
    im = Image.open("../data/"+path)
    pix_set = get_pixel_set(im)

    if len(pix_set) > 16:
        print(f"size {len(pix_set)}: {path}")

    pixel_set_lengths.append(len(pix_set))


size 17: sprite/245_generation-iii_emerald_front_default.png
size 20: sprite/666_default_front_default.png
size 17: sprite/680_default_front_default.png
size 17: sprite/680_default_front_shiny.png
size 27: sprite/716_default_front_default.png
size 19: sprite/906_default_front_default.png
size 19: sprite/909_default_front_default.png
size 17: sprite/911_default_front_default.png
size 17: sprite/944_default_front_default.png
size 19: sprite/952_default_front_default.png
size 17: sprite/964_default_front_default.png
size 17: sprite/988_default_front_default.png
size 19: sprite/989_default_front_default.png
size 17: sprite/993_default_front_default.png
size 19: sprite/1001_default_front_default.png
size 18: sprite/1003_default_front_default.png
size 19: sprite/10027_default_front_default.png
size 19: sprite/10028_default_front_default.png
size 19: sprite/10029_default_front_default.png
size 17: sprite/10030_default_front_default.png
size 17: sprite/10031_default_front_default.png
size 17: 

In [64]:
def tokenize_image(im:Image) -> list[int]:
    pix_set = get_pixel_set(im)

    pix_to_token = {pix: token for token, pix in enumerate(sorted(pix_set))}

    return [pix_to_token[im.getpixel((i,j))] for i in range(96) for j in range(96)]

tokenized = tokenize_image(im=im)
tokens = set(tokenized)
counts = { i: tokenized.count(i) for i in set(tokenized) }
counts


{0: 106,
 1: 95,
 2: 12,
 3: 46,
 4: 95,
 5: 49,
 6: 193,
 7: 35,
 8: 17,
 9: 28,
 10: 8540}

In [34]:
{ i: tokenized[:3000].count(i) for i in set(tokenized) }

{0: 2, 1: 1, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 1, 10: 2996}

In [50]:
# understand how many white spaces there are at the start of a pokemon
token_string = " ".join( map(lambda i: str(i), tokenized) )

to_int = lambda s: int(s)
ws_removed = list(map(to_int, token_string.strip("10 ").split(" ")))
leading_removed = list(map(to_int, token_string.lstrip("10 ").split(" ")))
trailing_removed = list(map(to_int, token_string.rstrip("10 ").split(" ")))
print(f"len(sequence): {len(tokenized)}, len(whitespace_removed): {len(ws_removed)}")
print(f"Leading whitespace: {len(tokenized) - len(leading_removed)} Trailing whitespace: {len(tokenized) - len(trailing_removed)}")

{ i: ws_removed.count(i) for i in set(tokenized) }


len(sequence): 9216, len(whitespace_removed): 3281
Leading whitespace: 2913 Trailing whitespace: 3022


{0: 101,
 1: 95,
 2: 12,
 3: 46,
 4: 95,
 5: 49,
 6: 193,
 7: 35,
 8: 17,
 9: 28,
 10: 2610}

In [65]:
# investigate size of runs

def group_token_runs(tokens:list[int]) -> list[list[int]]:
    curr_token:int = tokens[0]
    curr_run:list[int] = []
    result:list[list[int]] = []
    for token in tokens:
        if token == curr_token:
            curr_run.append(token)
        else:
            result.append(curr_run)
            curr_run = [token]
            curr_token = token

    return result

runs = group_token_runs(ws_removed)
run_lengths = list(map(lambda l: len(l), runs))

lengths_by_token = {token: [len(run) for run in runs if token in run] for token in tokens}



In [71]:
avg_lengths_by_token = {k: {"mu": mean(v), "sigma": stdev(v)} for k,v in lengths_by_token.items()}
avg_lengths_by_token

{0: {'mu': 1.364864864864865, 'sigma': 0.7688155687475454},
 1: {'mu': 1.6101694915254237, 'sigma': 1.2036861366672416},
 2: {'mu': 1.3333333333333333, 'sigma': 0.816496580927726},
 3: {'mu': 1.0222222222222221, 'sigma': 0.14907119849998599},
 4: {'mu': 1.8269230769230769, 'sigma': 1.3963699976859305},
 5: {'mu': 1.5806451612903225, 'sigma': 1.02548179078217},
 6: {'mu': 3.1639344262295084, 'sigma': 2.067045620113019},
 7: {'mu': 1.4, 'sigma': 0.6454972243679028},
 8: {'mu': 2.8333333333333335, 'sigma': 1.1690451944500122},
 9: {'mu': 1.75, 'sigma': 1.0645812948447542},
 10: {'mu': 45, 'sigma': 34.83646002923404}}

In [None]:
# looking into these values for all images
tokens_to_string = lambda l: " ".join(map(lambda i: str(i), l))
