In [1]:
import asyncio
import aiohttp
import pandas as pd
from PIL import Image
import io

from config import SHEET_ID, SCOPES, GOOGLE_APPLICATION_CREDENTIALS
from utils import write_to_worksheet, authenticate

Fetch data from the worksheet and convert it into a pandas DataFrame

In [2]:
# Authenticate against the Sheets API and open the first tab of the target
# spreadsheet.
client = authenticate(scopes=SCOPES, credentials=GOOGLE_APPLICATION_CREDENTIALS)
spreadsheet = client.open_by_key(SHEET_ID)
worksheet = spreadsheet.sheet1

# The first row of the sheet holds the column names; every following row is
# a record.
data = worksheet.get_all_values()
header, records = data[0], data[1:]
df = pd.DataFrame(records, columns=header)
df.head(100)

Unnamed: 0,image_url,SIZE
0,https://data.sanitino.eu/PRODUCT-33916/8607663...,575x860
1,https://data.sanitino.eu/PRODUCT-62434/f7aa3c2...,
2,https://data.sanitino.eu/PRODUCT-33915/8607663...,
3,https://data.sanitino.eu/PRODUCT-62426/506d67b...,
4,https://data.sanitino.eu/PRODUCT-33893/ccd99b4...,
...,...,...
95,https://data.sanitino.eu/PRODUCT-29321/071b923...,
96,https://data.sanitino.eu/PRODUCT-4723/72ea8172...,
97,https://data.sanitino.eu/PRODUCT-29317/f72de61...,
98,https://data.sanitino.eu/PRODUCT-29328/4bedb8f...,


Check for missing values

In [3]:
# Count missing (NaN/None) entries per column.
# NOTE(review): the preview above shows blank SIZE cells, yet this reports 0
# missing values — presumably the sheet values are loaded as strings, so blank
# cells are "" rather than NaN and are not counted here. Confirm.
df.isna().sum()

image_url    0
SIZE         0
dtype: int64

Check for duplicates

In [4]:
# Count rows that are exact repeats (across all columns) of an earlier row.
df.duplicated().sum()

np.int64(19)

Remove rows with "nan" or NaN values in the "image_url" column and drop duplicate rows

In [5]:
# Drop rows whose URL is the literal placeholder string "nan" or a real
# missing value, then remove exact duplicate rows.
# `drop_duplicates()` returns a new frame, so its result must be assigned
# back; calling it without assignment leaves the duplicates in `df`.
is_nan_placeholder = df["image_url"] == "nan"
df = (
    df[~is_nan_placeholder]
    .dropna(subset=["image_url"])
    .drop_duplicates()
)
df

Unnamed: 0,image_url,SIZE
0,https://data.sanitino.eu/PRODUCT-33916/8607663...,575x860
1,https://data.sanitino.eu/PRODUCT-62434/f7aa3c2...,
2,https://data.sanitino.eu/PRODUCT-33915/8607663...,
3,https://data.sanitino.eu/PRODUCT-62426/506d67b...,
4,https://data.sanitino.eu/PRODUCT-33893/ccd99b4...,
...,...,...
46883,https://data.sanitino.eu/PRODUCT-113844/9786df...,
46884,https://data.sanitino.eu/PRODUCT-20240/4a457da...,
46885,https://data.sanitino.eu/PRODUCT-47765/58a1a3d...,
46886,https://data.sanitino.eu/PRODUCT-56372/764c369...,


Fetch image resolutions asynchronously

In [6]:
async def get_single_image_resolution(
        client: aiohttp.ClientSession,
        url: str
) -> str | None:
    """Download one image and return its resolution as a string.

    Args:
        client: Open aiohttp session used to issue the GET request.
        url: Address of the image to download.

    Returns:
        The resolution formatted as "<height>x<width>" (height first), or
        None when the image cannot be obtained: a non-200 response, a
        network/timeout failure, or a body that cannot be read as an image.
    """
    try:
        async with client.get(url) as response:
            if response.status != 200:
                return None
            content = await response.read()
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # One unreachable or malformed URL must not abort the whole batch:
        # an exception raised here would propagate through asyncio.gather
        # and discard every other result.
        return None

    try:
        # Use the image as a context manager so its resources are released
        # as soon as the dimensions have been read.
        with Image.open(io.BytesIO(content)) as image:
            return f"{image.height}x{image.width}"
    except OSError:
        # Raised when the payload is not a recognisable image
        # (PIL.UnidentifiedImageError is an OSError subclass).
        return None


async def get_image_resolutions(
        client: aiohttp.ClientSession,
        df: pd.DataFrame,
        max_concurrency: int = 50
) -> list[str | None]:
    """Fetch the resolution of every image listed in ``df["image_url"]``.

    Args:
        client: Open aiohttp session shared by all requests.
        df: Frame containing an "image_url" column.
        max_concurrency: Upper bound on the number of requests in flight at
            the same time.

    Returns:
        A list with one entry per row of ``df``, in row order; each entry is
        whatever ``get_single_image_resolution`` returned for that URL.
    """
    # Starting every request at once leaves most of them queued for a pooled
    # connection, where they can hit the session timeout before being sent.
    # The semaphore keeps only `max_concurrency` requests active at a time.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def fetch_with_limit(url: str) -> str | None:
        async with semaphore:
            return await get_single_image_resolution(client, url)

    tasks = [fetch_with_limit(url) for url in df["image_url"]]

    # gather preserves the order of its arguments, so results line up with
    # the rows of `df`.
    return await asyncio.gather(*tasks)


async def set_image_resolutions(dataframe: pd.DataFrame) -> None:
    """Fill the "SIZE" column of ``dataframe`` with fetched image resolutions.

    Opens a single aiohttp session for the whole batch, fetches the
    resolution for every row and stores the results in place; nothing is
    returned.
    """
    async with aiohttp.ClientSession() as session:
        resolutions = await get_image_resolutions(session, dataframe)
        dataframe["SIZE"] = resolutions


# Run the fetch for every row; this overwrites df["SIZE"] in place.
# Top-level `await` is used directly in the cell, so this relies on the
# notebook kernel allowing awaits outside a coroutine.
await set_image_resolutions(df)

In [7]:
# Preview the first rows to spot-check the freshly written SIZE values.
df.head()

Unnamed: 0,image_url,SIZE
0,https://data.sanitino.eu/PRODUCT-33916/8607663...,1614x1080
1,https://data.sanitino.eu/PRODUCT-62434/f7aa3c2...,1080x1080
2,https://data.sanitino.eu/PRODUCT-33915/8607663...,1614x1080
3,https://data.sanitino.eu/PRODUCT-62426/506d67b...,1080x1080
4,https://data.sanitino.eu/PRODUCT-33893/ccd99b4...,1614x1080


Checking if there are any missing values

In [8]:
# Count missing entries per column. A missing SIZE here means the fetch
# returned None for that URL (e.g. the response status was not 200).
df.isna().sum()

image_url       0
SIZE         1031
dtype: int64

Cleaning rows with NaN values in the "SIZE" column

In [9]:
# Keep only the rows for which a resolution was obtained.
df = df.dropna(subset=["SIZE"])

Write the dataframe to the worksheet

In [10]:
# Push the cleaned frame back to the worksheet it was loaded from.
# NOTE(review): `overwrite=True` presumably replaces the sheet's existing
# contents — confirm against utils.write_to_worksheet before running.
write_to_worksheet(worksheet, df, overwrite=True)