In [1]:
import collections
import io
import json
import os
import random
import shutil
import time
import urllib
import urllib.request
import uuid
from concurrent.futures import ThreadPoolExecutor
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import torch
from datasets import load_dataset, load_from_disk
from datasets.utils.file_utils import get_datasets_user_agent
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm

You only need to log in once.

In [2]:
# Open the Hugging Face Hub login prompt; the dataset download further down
# calls load_dataset with use_auth_token=True.
from huggingface_hub import notebook_login
notebook_login()

In [3]:
# Prefer the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [4]:
def fetch_single_image(image_url, timeout=None, retries=0):
    """Download one image and store it as a ``.jpg`` file.

    Reads two module-level globals that the download loop assigns before it
    calls ``dataset.map``: ``USER_AGENT`` (sent as the ``user-agent`` request
    header) and ``path`` (directory the file is written into).

    Args:
        image_url: URL of the image to download.
        timeout: Timeout in seconds forwarded to ``urlopen``; ``None`` keeps
            the library default.
        retries: Number of extra download attempts made after a failed one.
            Negative values are treated as 0.

    Returns:
        The path of the saved file, ``<path>/<uuid4>.jpg``.

    Raises:
        OSError: The error of the final download attempt (``URLError`` and
            timeouts are ``OSError`` subclasses), or an error raised while
            decoding or saving the image.
    """
    request = urllib.request.Request(
        image_url,
        data=None,
        headers={"user-agent": USER_AGENT},
    )

    # Retry only the network part; re-raise once the last attempt has failed
    # so callers still see the original exception type.
    attempts = max(retries, 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            with urllib.request.urlopen(request, timeout=timeout) as response:
                payload = response.read()
            break
        except OSError:
            if attempt == attempts:
                raise

    image = Image.open(io.BytesIO(payload))
    image.load()
    image_path = path + "/" + str(uuid.uuid4()) + ".jpg"

    # Decide from the decoded image, not only from the URL text: JPEG cannot
    # store an alpha channel, so anything transparent must be flattened.
    has_alpha = image.mode in ("RGBA", "LA", "PA") or "transparency" in image.info
    if "png" in image_url or image.format == "PNG" or has_alpha:
        rgba = image.convert("RGBA")
        background = Image.new("RGB", rgba.size, (255, 255, 255))
        background.paste(rgba, mask=rgba.split()[3])  # 3 is the alpha channel
        background.save(image_path, "JPEG", quality=80)
    else:
        # Modes the JPEG writer cannot encode (e.g. palette "P") are
        # converted; modes it already accepts are saved untouched.
        if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
            image = image.convert("RGB")
        image.save(image_path)
    return image_path

def fetch_images(batch, num_threads, timeout=None, retries=3):
    """Download all images of a batch in parallel and record their file paths.

    Args:
        batch: Mapping with an ``"image_url"`` entry holding the URLs to fetch.
        num_threads: Maximum number of worker threads used for the downloads.
        timeout: Forwarded to ``fetch_single_image`` for every URL.
        retries: Forwarded to ``fetch_single_image`` for every URL.

    Returns:
        The same ``batch`` object with an added ``"image_path"`` list whose
        order matches ``batch["image_url"]``.
    """
    def download(url):
        # Bind the shared keyword arguments so the pool only supplies the URL.
        return fetch_single_image(url, timeout=timeout, retries=retries)

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        saved_paths = list(pool.map(download, batch["image_url"]))
    batch["image_path"] = saved_paths
    return batch

# Add the relevant ISO code for the language you want to work with.
iso639_3_letter_code = ["hau", "tha", "kir"]

# These do not depend on the language, so set them once before the loop.
# USER_AGENT is read as a global by fetch_single_image.
USER_AGENT = get_datasets_user_agent()
num_threads = 20

for code in iso639_3_letter_code:
    dataset_dir = 'bloom-captioning-{}'.format(code)
    if os.path.exists(dataset_dir):
        # A saved copy is already in the working directory; skip the download.
        print('bloom-captioning-{} already exist!'.format(code))
        continue

    # Download the language specific dataset from HF.
    dataset = load_dataset("sil-ai/bloom-captioning", code,
                           use_auth_token=True, download_mode='force_redownload')

    # `path` is read as a global by fetch_single_image. The directory may be
    # left over from an interrupted run, so do not fail when it already exists.
    path = 'images_' + code
    os.makedirs(path, exist_ok=True)

    dataset = dataset.map(fetch_images, batched=True, batch_size=100, fn_kwargs={"num_threads": num_threads})
    dataset.save_to_disk(dataset_dir)
    print('Successful download data for '+code)

bloom-captioning-hau already exist!
bloom-captioning-tha already exist!
bloom-captioning-kir already exist!


In [5]:
# Language whose saved dataset is loaded; choose from "hau", "tha", "kir".
read_code = 'hau'
dataset_dir = 'bloom-captioning-' + read_code
dataset = load_from_disk(dataset_dir)
dataset

DatasetDict({
    test: Dataset({
        features: ['image_id', 'image_url', 'caption', 'story_id', 'album_id', 'license', 'original_bloom_language_tag', 'index_in_story', 'image_path'],
        num_rows: 52
    })
    validation: Dataset({
        features: ['image_id', 'image_url', 'caption', 'story_id', 'album_id', 'license', 'original_bloom_language_tag', 'index_in_story', 'image_path'],
        num_rows: 52
    })
    train: Dataset({
        features: ['image_id', 'image_url', 'caption', 'story_id', 'album_id', 'license', 'original_bloom_language_tag', 'index_in_story', 'image_path'],
        num_rows: 1761
    })
})

In [6]:
# Pull the three splits out of the DatasetDict in one step.
training_set, val_set, test_set = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [7]:
# Report how many rows each split contains.
split_sizes = (len(training_set), len(val_set), len(test_set))
print('rows for training = {}\nrows for val = {}\nrows for test = {}\n'.format(*split_sizes))

rows for training = 1761
rows for val = 52
rows for test = 52



In [8]:
# Wrap each split in a DataLoader that yields shuffled batches of 64 rows.
train_dataloader = DataLoader(training_set, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_set, batch_size=64, shuffle=True)
# The test loader must read the test split; it previously wrapped val_set
# again, so the test data was never used.
test_dataloader = DataLoader(test_set, batch_size=64, shuffle=True)

In [9]:
# Fetch the GenerativeImage2Text repository into the current working directory.
!git clone https://github.com/microsoft/GenerativeImage2Text.git
# NOTE(review): `!cd` runs in a temporary subshell, so it presumably does not
# change the kernel's working directory for later cells. Use `%cd` if the
# change is meant to persist, after confirming that later cells do not rely on
# relative paths from the current directory.
!cd GenerativeImage2Text

Cloning into 'GenerativeImage2Text'...
