<a href="https://www.kaggle.com/code/tensorkelechi/moondream-data?scriptVersionId=180367144" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import requests
import os
import pandas as pd
import re
from PIL import Image as pillow_image
from multiprocessing import Pool
from datasets import load_dataset
from tqdm.auto import tqdm
from typing import List
import multiprocessing
import queue
import io
import concurrent
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Folder (relative to the current working directory) that receives the downloaded images.
image_folder = 'moondream_images'
# makedirs(..., exist_ok=True) keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the folder exists from an earlier run.
os.makedirs(image_folder, exist_ok=True)

# Absolute path of the same folder; the download helper joins file names onto it.
out_folder = os.path.join(os.getcwd(), image_folder)

# Single image URL kept around for manual experiments.
sample_url = 'https://miro.medium.com/v2/resize:fit:720/format:webp/1*zX-rizZKXXg7Ju-entot9g.png'

In [3]:

def format_filename(filename):
    """Return ``filename`` with unsafe characters removed from its stem.

    The name is split with ``os.path.splitext``; every character of the stem that is
    neither a word character (``\\w``) nor a hyphen is dropped, and the extension is
    appended again unchanged.
    """
    stem, suffix = os.path.splitext(filename)

    # Only the stem is cleaned; the extension (including its leading dot) is kept as-is.
    unsafe_chars = re.compile(r"[^\w\-]")
    cleaned_stem = unsafe_chars.sub("", stem)

    return f"{cleaned_stem}{suffix}"

In [4]:
def download_img(url, timeout=10):
    """Lazily download one image into ``out_folder``.

    This is a generator function: calling it only creates the generator, and the
    HTTP request is sent when the generator is first advanced.

    Args:
        url: Address of the image. The local file name is the sanitized basename
            of this URL (see ``format_filename``).
        timeout: Seconds handed to ``requests.get`` so that an unresponsive host
            cannot block the run indefinitely.

    Yields:
        The binary file handle the response body was written to. The handle is
        still open in write mode when it is yielded and is closed when the
        generator is advanced again. Nothing is yielded if the download fails;
        the error is printed instead.
    """
    try:
        response_file = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures so that error pages are not stored as images.
        response_file.raise_for_status()

        out_file = format_filename(os.path.basename(url))
        out_path = os.path.join(out_folder, out_file)

        with open(out_path, "wb") as image_file:
            image_file.write(response_file.content)
            yield image_file

    except Exception as e:
        # Best effort: a single bad URL is reported and skipped, not raised.
        print(f"error: {e}")
        

In [5]:
def get_moondream_data(split_size: int):
    """Yield one download job per usable image URL in the caption dataset.

    Loads ``isidentical/moondream2-coyo-5M-captions``, takes the first
    ``split_size`` rows of its ``train`` split and keeps the rows whose URL ends
    in ``jpeg``, ``jpg`` or ``png``.

    Args:
        split_size: Number of leading dataset rows to inspect.

    Yields:
        ``(image_dl, file_name, caption)`` where ``image_dl`` is the generator
        returned by ``download_img(url)``, ``file_name`` is the sanitized basename
        of the URL and ``caption`` is the lower-cased ``moondream2_caption``.
    """
    moondream_dataset = load_dataset("isidentical/moondream2-coyo-5M-captions")
    md_data = moondream_dataset["train"][:split_size]  # type: ignore
    image_urls = md_data["url"]  # type: ignore
    descriptions = md_data["moondream2_caption"]  # type: ignore

    count = 0

    # Size the progress bar from the rows actually returned, so it is still correct
    # when the slice holds fewer than split_size rows.
    for url, desc in tqdm(zip(image_urls, descriptions), total=len(image_urls)):
        url = str(url)

        if not url.endswith(('jpeg', 'jpg', 'png')):
            continue

        # download_img is a generator function, so this call never returns None and
        # sends no request yet; the download happens when the consumer iterates it.
        image_dl = download_img(url)
        caption = desc.lower()
        file_name = format_filename(os.path.basename(url))

        count += 1
        yield (image_dl, file_name, caption)

    # count is the number of URLs that passed the extension filter, not the number
    # of files on disk, so the message says "queued" rather than "downloaded".
    print(f'{count} image URLs queued for download')


In [6]:
%%time

# get_moondream_data yields (image_dl, file_name, caption) triples; zip(*...) transposes
# them into three parallel tuples: q = download generators, k = file names, v = captions.
# The unpacking raises ValueError if the generator yields nothing.
q, k, v = zip(*get_moondream_data(2000))

Downloading readme:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 301M/301M [00:01<00:00, 207MB/s]
Downloading data: 100%|██████████| 301M/301M [00:01<00:00, 235MB/s]
Downloading data: 100%|██████████| 301M/301M [00:01<00:00, 212MB/s]
Downloading data: 100%|██████████| 302M/302M [00:01<00:00, 200MB/s]
Downloading data: 100%|██████████| 301M/301M [00:01<00:00, 208MB/s]


Generating train split:   0%|          | 0/5005590 [00:00<?, ? examples/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

1416 images downloaded
CPU times: user 18.1 s, sys: 8.02 s, total: 26.1 s
Wall time: 29.1 s


In [7]:
# def save_images(img_generator):
#     for image_file in img_generator:
#         try:
#             image_buffer = io.BytesIO(image_file.read())
#             image = pillow_image.open(image_buffer)
#             image_path = os.path.join("images", f"{image_file.name}.png")
#             image.save(image_path)
#             print(f"{image_file.name} saved successfully.")
#         except Exception as e:
#             print(f"Error {e}")

#         finally:
#             image_file.close()
        
# #         return image_file

In [8]:
def save_images(file_generator):
    """Finalize downloaded image files on disk.

    Each handle taken from ``file_generator`` is closed, which flushes any bytes
    still buffered in it, and the file it refers to is then read back and rewritten
    with the same bytes.

    Args:
        file_generator: Iterable of binary file handles (open or already closed)
            whose ``name`` attribute is the path of the image file.
    """
    for image_file in file_generator:
        # os.path.join returns image_file.name unchanged when that name is absolute.
        image_path = os.path.join(out_folder, image_file.name)

        # Close the incoming handle before the file is reopened. The earlier version
        # rebound the loop variable in the `with` below and therefore closed the
        # read handle a second time while the incoming handle stayed open.
        image_file.close()

        with open(image_path, "rb") as saved_file:
            image_buffer = io.BytesIO(saved_file.read())

        # Writes back exactly the bytes just read.
        with open(image_path, "wb") as f:
            f.write(image_buffer.getbuffer())

In [9]:
def save_images_parallel(file_generator, max_workers=5):
    """Run ``save_images`` for every file handle of ``file_generator`` on a thread pool.

    Args:
        file_generator: Iterable of file handles, e.g. a ``download_img`` generator.
        max_workers: Size of the thread pool.

    Returns:
        None. A failure inside a worker is printed and does not stop the others.
    """
    # Drain the iterable before any work is submitted. For a download_img generator
    # this leaves its `with open(...)` block, so each file is closed and complete on
    # disk before a worker thread touches it.
    image_files = list(file_generator)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # save_images loops over its argument, so every handle is wrapped in a list.
        # Passing the bare handle made each task fail while iterating it.
        futures = [executor.submit(save_images, [image_file]) for image_file in image_files]

        for future in concurrent.futures.as_completed(futures):
            try:
                # result() re-raises whatever the worker raised; without this call
                # worker errors disappear silently.
                future.result()
            except Exception as e:
                print(f"error: {e}")

In [10]:
%%time

# Each entry of q is a generator created by download_img inside get_moondream_data, so the
# HTTP request for an image is only sent when save_images_parallel iterates that entry here
# (the "error: ..." lines in this cell's output come from download_img's except branch).
# save_images_parallel has no return statement, so saved_images ends up as a list of None.
saved_images = [save_images_parallel(file_gen) for file_gen in q]

error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
error: HTTPConnectionPool(host='www.short-haircut.com', port=80): Max retries exceeded with url: /wp-content/uploads/2017/03/17-Short-Pink-Hair-20170343751.jpg (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x78dea05540d0>: Failed to resolve 'www.short-haircut.com' ([Errno -2] Name or service not known)"))
error: HTTPSConnectionPool(host='safarihub.com', port=443): Max retries exceeded with url: /wp-content/uploads/2021/01/1-NOTHERN-CIRCUIT-ROUTE.png (Caused by SSLError(SSLError(1, '[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1007)')))
error: HTTPSConnectionPool(host='www.accommodationdaintree.com.au', port=443): Max retries exceeded with url: /img-central/att/39/glo_39906_230720071610ilk21.jpeg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x78dea05dc100>, 'Connection to www.accommodationdaintree.com.a

In [11]:
%%time

#qx = [save_images(file_gen) for file_gen in q]

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.11 µs


In [12]:
# q, k and v were unpacked from one zip(*...) call, so the three lengths are always equal;
# the value is the number of URLs that passed the extension filter in get_moondream_data.
len(q), len(v), len(k)


(1416, 1416, 1416)

In [13]:
# qx[1].name

In [14]:
# Default output file, relative to the current working directory.
csv_path = 'moondream2.csv'

def moondream_csv(path: List, desc: List, out_path=None):
    """Write image file names and their captions to a two-column CSV file.

    Args:
        path: Image file names, one per row (column ``image_path``).
        desc: Captions, aligned with ``path`` (column ``caption``).
        out_path: Destination file. Defaults to the module-level ``csv_path``.

    Returns:
        None. The file is written without an index column.
    """
    print('Writing to csv..')

    moondream_df = pd.DataFrame({
        'image_path': path,
        'caption': desc
    })

    # The default is resolved at call time, so reassigning csv_path in a later cell
    # still takes effect.
    target_path = csv_path if out_path is None else out_path
    moondream_df.to_csv(target_path, index=False)

    print('Csv transfer complete...')


In [15]:
# k holds the sanitized file names and v the lower-cased captions from get_moondream_data.
moondream_csv(k, v)

Writing to csv..
Csv transfer complete...


In [16]:
# Read the CSV back through csv_path, the same relative path moondream_csv writes to,
# instead of a hard-coded /kaggle/working/ prefix that only exists on Kaggle.
chandra = pd.read_csv(csv_path)

chandra

Unnamed: 0,image_path,caption
0,369.jpg,two young girls crouched and gathered around a...
1,0001068_vd069-36-roses_600.jpeg,a decorative floral arrangement featuring a bo...
2,NIK_9001.jpg,"a modern, minimalist dining area featuring a w..."
3,f17f2fd0f8a614ff47cfbc77ed.jpg,a collection of necklaces with various enamel ...
4,P1002104_thumb.png,a vibrant landscape painting depicting a rocky...
...,...,...
1411,alpay-tonga-Sp82b8oJYyc-unsplash-300x300.jpg,"a close-up view of an older woman's face, focu..."
1412,wooden-table-front-abstract-blurred-background...,"a wooden table in front of an abstract, blurre..."
1413,82a941b4-4642-4682-9754-d0eedd142c6a_doEV1NHjp...,"a silver engagement ring featuring a large, br..."
1414,7280783_18.jpg,a group of people sitting on cushions in a dim...


In [17]:
# chandra.dropna()

In [18]:
# Plain status message; no checks are performed here.
print('Kaggle moondream porting complete')

Kaggle moondream porting complete


In [19]:
# len(qx)

In [20]:
# Reuse the absolute folder path computed when the image directory was created,
# rather than repeating a Kaggle-specific absolute path twice.
im_folder = out_folder
x = os.listdir(im_folder)
paths = chandra['image_path']
caps = chandra['caption']
# Number of files actually present on disk. This can be lower than the number of
# CSV rows, because a row is written for every accepted URL whether or not its
# download succeeded.
len(x)

1212

In [21]:
# caps

In [22]:
# import matplotlib.pyplot as plt
# import numpy as np
# # # plt.imshow(pillow_image.open(qx[1]))
# # # imbuff = io.BytesIO(qx[1])
# # #     plt.imshow(np.array(pillow_image.open(os.path.join(im_folder, x[z]))))

# for path, cap in zip(paths, caps):
#     plt.imshow(np.array(pillow_image.open(os.path.join(im_folder, path))))
#     print(cap)
#     plt.show()