In [None]:
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

try: import timm
except ModuleNotFoundError:
    !pip install -Uq "timm==0.6.2.dev0"

try: import tensorflow
except ModuleNotFoundError:
    !pip install -Uq tensorflow

try: import torcheval
except ModuleNotFoundError:
    !pip install -Uq torcheval

try: import huggingface_hub
except ModuleNotFoundError:
    !pip install -Uq huggingface_hub

In [1]:
from fastkaggle import *

In [None]:
# for working with paths in Python, I recommend using `pathlib.Path`
from pathlib import Path

# Paste the contents of your kaggle.json between the quotes when running outside
# Kaggle. Never commit real credentials together with the notebook.
creds = ''

cred_path = Path('~/.kaggle/kaggle.json').expanduser()
# Only write the file when there is something to write: an empty kaggle.json
# would make the `exists()` check below succeed on every later run.
if not iskaggle and creds and not cred_path.exists():
    cred_path.parent.mkdir(parents=True, exist_ok=True)
    # Create the file with owner-only permissions *before* the secret is written
    cred_path.touch(mode=0o600)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

In [None]:
# Kaggle competition slug; reused when the notebook is pushed at the end.
comp = 'tpu-getting-started'

# `setup_comp` comes from fastkaggle. Presumably it fetches the competition data
# (installing the listed packages when needed) and returns its local path — confirm.
path = setup_comp(comp, install='fastai tensorflow "timm==0.6.2.dev0"')
# Human-readable flower names (104 entries).
# NOTE(review): presumably ordered by class id; no other visible cell reads this list.
classes = ["pink primrose","hard-leaved pocket orchid","canterbury bells","sweet pea","wild geranium","tiger lily","moon orchid","bird of paradise","monkshood","globe thistle","snapdragon","colt's foot","king protea","spear thistle","yellow iris","globe-flower","purple coneflower","peruvian lily","balloon flower","giant white arum lily","fire lily","pincushion flower","fritillary","red ginger","grape hyacinth","corn poppy","prince of wales feathers","stemless gentian","artichoke","sweet william","carnation","garden phlox","love in the mist","cosmos","alpine sea holly","ruby-lipped cattleya","cape flower","great masterwort","siam tulip","lenten rose","barberton daisy","daffodil","sword lily","poinsettia","bolero deep blue","wallflower","marigold","buttercup","daisy","common dandelion","petunia","wild pansy","primula","sunflower","lilac hibiscus","bishop of llandaff","gaura","geranium","orange dahlia","pink-yellow dahlia","cautleya spicata","japanese anemone","black-eyed susan","silverbush","californian poppy","osteospermum","spring crocus","iris","windflower","tree poppy","gazania","azalea","water lily","rose","thorn apple","morning glory","passion flower","lotus","toad lily","anthurium","frangipani","clematis","hibiscus","columbine","desert-rose","tree mallow","magnolia","cyclamen ","watercress","canna lily","hippeastrum ","bee balm","pink quill","foxglove","bougainvillea","camellia","mallow","mexican petunia","bromelia","blanket flower","trumpet creeper","blackberry lily","common tulip","wild rose"]

In [None]:
# Display the value returned by `setup_comp`
path

In [None]:
import timm
import functools
import itertools
from fastai.vision.all import *

# List the entries directly under `path`
path.ls()

In [None]:
# Use the 331x331 JPEG TFRecords only. Train shards are listed before val shards,
# so once loaded every validation record sits after every training record — the
# index-based splitter used for training depends on that order.
tfrec_dir = path/'tfrecords-jpeg-331x331'
data_paths = [*(tfrec_dir/'train').ls(), *(tfrec_dir/'val').ls()]

In [None]:
import tensorflow as tf

# https://www.tensorflow.org/tutorials/load_data/tfrecord#reading_a_tfrecord_file_in_python
def load_tfrecs(data_paths, is_test=False):
    """Read TFRecord files into a list of `[label, image_bytes, id_bytes]` rows.

    data_paths: file name(s) handed to `tf.data.TFRecordDataset`
    is_test: when true the 'class' feature is not read and the label slot is `None`
    """
    def _to_row(raw_record):
        # Each record is a serialized `tf.train.Example`
        parsed = tf.train.Example()
        parsed.ParseFromString(raw_record.numpy())
        feats = parsed.features.feature

        label = None if is_test else feats['class'].int64_list.value[0]
        image_bytes = feats['image'].bytes_list.value[0]
        record_id = feats['id'].bytes_list.value[0]
        return [label, image_bytes, record_id]

    return [_to_row(raw_record) for raw_record in tf.data.TFRecordDataset(data_paths)]

# Parse every record listed in `data_paths` into memory as [label, image_bytes, id] rows
data = load_tfrecs(data_paths)

# Number of labelled records loaded
len(data)

In [None]:
# Sanity check: decode the image bytes of the first record, print its size
# and show a small thumbnail.
img = PILImage.create(data[0][1])
print(img.size)
img.to_thumb(128)

In [None]:
from fastcore.parallel import *

def f(o):
    "Size of the image whose encoded bytes are stored at index 1 of record `o`"
    encoded = o[1]
    return PILImage.create(encoded).size

# Measure every image using 8 workers, then tally the distinct sizes
sizes = parallel(f, data, n_workers=8)
pd.Series(sizes).value_counts()

In [None]:
# Named functions rather than lambdas: they keep a real `__name__` and can be
# pickled, which a lambda bound to a variable cannot.

def get_items(data, *args, **kwargs):
    "The records are already in memory, so hand `data` back unchanged"
    return data

def get_x(entry):
    "Decode the encoded image bytes stored at index 1 of `entry`"
    return PILImage.create(io.BytesIO(entry[1]))

def get_y(entry):
    "Label stored at index 0 of `entry`"
    return entry[0]

In [None]:
from torcheval.metrics.functional import multiclass_f1_score
import gc

# Size passed to the per-item `Resize`
res = 320,320
# Architecture name -> list of (item transform, size) variants to train.
# Lists (not sets) keep the variants in the order written, allow duplicates and
# do not require the transform objects to be hashable.
models = {
    'convnext_large_in22k': [
        (Resize(res), 224),
    ], 'vit_large_patch16_224': [
        (Resize(res), 224),
    ], 'swinv2_large_window12_192_22k': [
        (Resize(res), 192),
    ], 'swin_large_patch4_window7_224': [
        (Resize(res), 224),
    ]
}

# One `train(...)` result per trained variant; filled in by the training loop
tta_res = []

# Flat, plain list of test shard paths — the same form in which the training
# shards are handed to `load_tfrecs` — instead of a listing nested inside a list.
test_paths = list((path/'tfrecords-jpeg-331x331/test').ls())
# Test records carry no 'class' feature, so their label slot is None
test_data = load_tfrecs(test_paths, is_test=True)

def train(arch, size, item=Resize(512, method='squish')):
    """Fine-tune `arch` on the in-memory records and predict the test set with TTA.

    arch: architecture name handed to `vision_learner`
    size: `size` argument of the batch-level `aug_transforms`
    item: per-item transform applied before batching
    Returns the result of `learn.tta` on a test dataloader built from `test_data`.

    Side effect: the fitted learner is published as the notebook-level `learn`,
    because later cells read `learn.dls.vocab`. Only the most recent learner is kept.
    """
    global learn
    # Drop the previously published learner before building the next one so that
    # two models never have to be held at the same time.
    learn = None
    gc.collect()
    torch.cuda.empty_cache()

    block = DataBlock(blocks=(ImageBlock, CategoryBlock),
                      get_items=get_items,
                      get_x=get_x,
                      get_y=get_y,
                      # Rows from index 12753 onwards form the validation set.
                      # NOTE(review): 12753 is presumably the number of training records — confirm.
                      splitter=IndexSplitter(range(12753, len(data))),
                      item_tfms=item,
                      batch_tfms=[*aug_transforms(size=size, min_scale=0.75)])
    dls = block.dataloaders(data, bs=16)

    # NOTE(review): torcheval's `multiclass_f1_score` defaults to micro averaging,
    # I believe — confirm this is the metric you want reported.
    learn = vision_learner(dls, arch, metrics=multiclass_f1_score, path='.', cbs=GradientAccumulation(64)).to_fp16()
    learn.fine_tune(5, 0.001)

    return learn.tta(dl=dls.test_dl(test_data))

# Train every (item transform, size) variant of every architecture in `models`
# and collect what `train` returns in `tta_res`.
for arch,details in models.items():
    for item,size in details:
        # Log which variant is about to be trained
        print('---',arch)
        print(size)
        print(item.name)
        tta_res.append(train(arch, size, item=item))
        # Run the garbage collector and release cached CUDA memory before the next variant
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# `zip(*tta_res)` regroups the per-model results by position; the first group
# holds the first element of every model's result (None if there are no results).
tta_prs = next(zip(*tta_res), None)
# Ensemble: element-wise mean across the stacked per-model tensors
avg_pr = torch.stack(tta_prs).mean(dim=0)
avg_pr.shape

In [None]:
# Load the competition's sample submission to inspect the expected output format
sample_submission = pd.read_csv(path/'sample_submission.csv')
sample_submission

In [None]:
# torch.save(learn.model.state_dict(), '01.pth')
# from google.colab import files
# files.download('model_state_dict.pth')
# learn = vision_learner(dls, 'convnext_small_in22k', metrics=multiclass_f1_score, path='.').to_fp16()
# learn.model.load_state_dict(torch.load('01.pth'))

# learn = vision_learner(dls, 'convnext_small_in22k', metrics=multiclass_f1_score, path='.').to_fp16()
# learn.model.load_state_dict(torch.load('01.pth', map_location=torch.device('cpu')))

In [None]:
# For each row of `avg_pr`, the index of its largest value along dim 1
idxs = avg_pr.argmax(dim=1)

In [None]:
# Display the selected indices
idxs

In [None]:
# Display the dataloaders' vocab (requires a `learn` object at notebook scope)
learn.dls.vocab

In [None]:
# Look up the vocab entry for each selected index and store the result
# as the "label" column of the submission.
vocab = np.array(learn.dls.vocab)
results = pd.Series(vocab[idxs], name="label")
results

In [None]:
# Record ids are stored as bytes at index 2 of each test row; decode them to str
ids = pd.Series([datum[2].decode('utf-8') for datum in test_data], name="id")
ids

In [None]:
# Pair each test id with its predicted label and write the CSV (header row, no index column)
submission = pd.DataFrame({'id': ids, 'label': results})
submission.to_csv('submission.csv', header=True, index=False)

In [None]:
# Peek at the first lines of the file that was just written
!head submission.csv

In [None]:
# if not iskaggle:
#    from kaggle import api
#    api.competition_submit_cli('submission.csv', 'petals-to-the-metal-01', comp)

In [None]:
# Outside Kaggle, call fastkaggle's `push_notebook` for '03.ipynb' as a private,
# non-GPU notebook attached to the competition `comp`.
if not iskaggle:
    push_notebook('alexchalk', 'petals-to-the-metal-submission-03',
                  title='Petals to the Metal Submission 03',
                  file='03.ipynb',
                  competition=comp, private=True, gpu=False)