Imports

In [None]:
import aiohttp
import asyncio
from PIL import Image
from io import BytesIO
import pandas as pd

Load URL data into a DataFrame

In [None]:
# Read the feed CSV and keep only the rows that carry an image URL
# (rows whose image_url is NaN are discarded).
urls = (
    pd.read_csv('Parser_ImageSize - feed.csv')
    .dropna(subset=['image_url'])
)
urls

Unnamed: 0,image_url,SIZE
0,https://data.sanitino.eu/PRODUCT-33916/8607663...,575x860
1,https://data.sanitino.eu/PRODUCT-62434/f7aa3c2...,
2,https://data.sanitino.eu/PRODUCT-33915/8607663...,
3,https://data.sanitino.eu/PRODUCT-62426/506d67b...,
4,https://data.sanitino.eu/PRODUCT-33893/ccd99b4...,
...,...,...
46883,https://data.sanitino.eu/PRODUCT-113844/9786df...,
46884,https://data.sanitino.eu/PRODUCT-20240/4a457da...,
46885,https://data.sanitino.eu/PRODUCT-47765/58a1a3d...,
46886,https://data.sanitino.eu/PRODUCT-56372/764c369...,


Function that gets the size of a given image

In [None]:
async def get_size(image_url):
    """Download one image and record its pixel size in the global ``urls`` frame.

    Opens an ``aiohttp.ClientSession``, requests ``image_url`` and, on an
    HTTP 200 response, decodes the body with Pillow and writes
    ``"<width>x<height>"`` into the ``SIZE`` column of every ``urls`` row whose
    ``image_url`` equals the given URL.

    Any other status code, a network or timeout error, or a body Pillow cannot
    decode is reported with ``print`` and leaves ``SIZE`` untouched, so a single
    bad URL does not abort the ``asyncio.gather`` that runs many of these
    coroutines together.

    Parameters
    ----------
    image_url : str
        Address of the image to measure.

    Returns
    -------
    str or None
        The recorded ``"<width>x<height>"`` string, or ``None`` when the size
        could not be determined.
    """
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(image_url) as response:
                if response.status != 200:
                    print('Image not found:', image_url)
                    return None
                image_raw = await response.read()
    except (aiohttp.ClientError, asyncio.TimeoutError) as error:
        # Connection resets, DNS failures, invalid URLs, timeouts, ...
        print('Request failed:', image_url, repr(error))
        return None

    try:
        # The context manager closes the Pillow image once the size is read.
        with Image.open(BytesIO(image_raw)) as image:
            size = f'{image.width}x{image.height}'
    except OSError as error:
        # Pillow's UnidentifiedImageError is an OSError subclass, so this also
        # covers bodies that are not images at all (e.g. an HTML error page).
        print('Not a readable image:', image_url, repr(error))
        return None

    # Update every row that shares this URL.
    urls.loc[urls['image_url'] == image_url, ['SIZE']] = size
    return size

Limit concurrency to a certain number

In [None]:
# https://stackoverflow.com/a/61478547/20774848

async def gather_with_concurrency(n, *coros):
    """Await all ``coros`` while letting at most ``n`` of them run at once.

    Each awaitable is wrapped so that it only starts after acquiring a slot
    from a shared ``asyncio.Semaphore(n)``. The wrapped awaitables are then
    handed to ``asyncio.gather``, so the returned list holds the results in
    the same order as ``coros``.
    """
    limiter = asyncio.Semaphore(n)

    async def run_limited(awaitable):
        # Hold one semaphore slot for the lifetime of the wrapped awaitable.
        async with limiter:
            outcome = await awaitable
        return outcome

    throttled = [run_limited(item) for item in coros]
    return await asyncio.gather(*throttled)

Execute main function concurrently (at most 100 downloads in flight at a time)

In [None]:
async def main(df, concurrency=100):
    """Fetch the size of every distinct image URL in ``df``, then print ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``image_url`` column.
    concurrency : int, optional
        Maximum number of ``get_size`` coroutines allowed to run at the same
        time (default 100, the previously hard-coded limit).
    """
    # get_size() receives only the URL and updates every matching row, so each
    # distinct URL needs to be fetched once; missing URLs are skipped instead
    # of being passed to the HTTP client.
    image_urls = df['image_url'].dropna().unique()
    await gather_with_concurrency(concurrency, *[get_size(url) for url in image_urls])
    print(df)

In [None]:
# Start the download pass over the loaded frame (top-level await in a notebook cell).
await main(urls)

Results

In [None]:
# Display the frame after the download pass to inspect the SIZE column.
urls

Unnamed: 0,image_url,SIZE
0,https://data.sanitino.eu/PRODUCT-33916/8607663...,1080x1614
1,https://data.sanitino.eu/PRODUCT-62434/f7aa3c2...,1080x1080
2,https://data.sanitino.eu/PRODUCT-33915/8607663...,1080x1614
3,https://data.sanitino.eu/PRODUCT-62426/506d67b...,1080x1080
4,https://data.sanitino.eu/PRODUCT-33893/ccd99b4...,1080x1614
...,...,...
46883,https://data.sanitino.eu/PRODUCT-113844/9786df...,2301x1080
46884,https://data.sanitino.eu/PRODUCT-20240/4a457da...,1621x1080
46885,https://data.sanitino.eu/PRODUCT-47765/58a1a3d...,1080x1218
46886,https://data.sanitino.eu/PRODUCT-56372/764c369...,1080x1080


In [None]:
# Count the rows whose SIZE is still NaN, i.e. images whose size was not
# recorded (1028 in the saved run).
urls['SIZE'].isna().sum()

1028

Export DF into XLSX file

In [None]:
# Replace NaN values with "Image Not Found" and save the dataframe to an xlsx file.
# The filled column is assigned back rather than using fillna(inplace=True) on
# urls['SIZE']: that in-place call operates on a temporary Series, which newer
# pandas warns about and which leaves `urls` unchanged under Copy-on-Write.
urls['SIZE'] = urls['SIZE'].fillna("Image Not Found")
urls.to_excel('Level1.xlsx', index=False)