In [2]:
import ast
import json

import cv2
import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the raw tab-separated export; its first column becomes the row index.
file_path = "/Users/FranklinZhao/TensorFlowProjects/ImageBasedSneakerPrediction/data/rawoutput.csv"
df_ori = pd.read_csv(file_path, sep="\t", index_col=0)
# Work on a copy so df_ori keeps the data exactly as it was loaded.
df = df_ori.copy()

In [4]:
# Columns that are not needed by the later cells.
cols_to_drop = ['urlKey', 'id', 'name', 'description', 'model', 'market', 'condition', 'productCategory', 'listingType', 'browseVerticals', 'favorite', 'variants']

# Build df from the untouched raw frame (df_ori) instead of from df itself so
# that this cell can be re-run safely: dropping from an already-trimmed df
# raises KeyError because the columns no longer exist.
# After removing the columns, discard every row with at least one missing value.
df = df_ori.drop(columns=cols_to_drop).dropna(axis=0, how='any')

In [5]:
# clean to extract image url and retail price
def cleanImageUrl(url: str) -> str:
    """Cut a URL off right after its first '.jpg'.

    Everything that follows the first '.jpg' (for example a query string) is
    discarded. If the URL contains no '.jpg' at all, '.jpg' is appended to the
    unchanged URL.
    """
    head, _, _ = url.partition('.jpg')
    return head + '.jpg'

def _parse_literal(text):
    """Turn a stringified dict/list cell back into the object it spells.

    The cells are presumably Python-repr style (single-quoted) strings; verify
    against the raw export. ``ast.literal_eval`` reads that syntax directly,
    including values that themselves contain quote characters, which the plain
    quote-swapping JSON parse cannot handle.
    """
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: fall back to swapping single quotes for
        # double quotes and parsing the result as JSON.
        return json.loads(text.replace("'", '"'))


# Image URL: the 'thumbUrl' entry of the parsed 'media' cell, trimmed by cleanImageUrl.
df['imageUrl'] = df['media'].apply(lambda cell: cleanImageUrl(_parse_literal(cell)["thumbUrl"]))

# Retail price: the 'value' of the first entry in the parsed 'productTraits' cell.
# NOTE(review): assumes the first trait is always the retail price - confirm.
df['retailPrice'] = df['productTraits'].apply(lambda cell: _parse_literal(cell)[0]["value"])

# The source columns are no longer needed once both fields are extracted.
df = df.drop(columns=['media', 'productTraits'])

In [20]:
# Register tqdm with pandas so that .progress_apply (used below) is available.
tqdm.pandas()

def openImage(url, timeout=30.0):
    """Download an image over HTTP and decode it with OpenCV.

    Note: a later cell in this notebook defines a coroutine function that is
    also named ``openImage``; once that cell has run, the name refers to the
    coroutine version instead of this one.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole run.

    Returns:
        The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` for the
        downloaded bytes, or ``None`` when the response body is empty.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    # The context manager closes the response, releasing the connection.
    # ``response.content`` also applies any transfer content-decoding, which
    # reading the raw socket stream does not.
    with requests.get(url, timeout=timeout) as response:
        payload = response.content

    # An empty body cannot be an image; do not hand an empty buffer to the decoder.
    if not payload:
        return None

    buffer = np.frombuffer(payload, dtype=np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_COLOR)

# Fetch every image (with a progress bar) and keep the results as a one-column frame.
img_df = df['imageUrl'].progress_apply(openImage).to_frame()

100%|██████████| 1040/1040 [02:09<00:00,  8.01it/s]


In [14]:
import asyncio
import aiohttp
import cv2
import numpy as np
from tqdm.asyncio import tqdm_asyncio
import nest_asyncio
# Patch asyncio via nest_asyncio before asyncio.run() is called below.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop, which plain asyncio.run() would refuse to nest in - confirm.
nest_asyncio.apply()

async def openImage(url, session):
    """Download an image through ``session`` and decode it with OpenCV.

    Args:
        url: Address of the image to fetch.
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``read()`` (an
            ``aiohttp.ClientSession`` in this notebook).

    Returns:
        The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` for the
        downloaded bytes, or ``None`` when the response body is empty.
    """
    # Only the download happens while the response is held; decoding runs
    # after the response context has exited.
    async with session.get(url) as response:
        payload = await response.read()

    # An empty body cannot be an image; do not hand an empty buffer to the
    # decoder, so one bad response cannot abort the whole batch.
    if not payload:
        return None

    # frombuffer views the bytes directly instead of copying through bytearray.
    buffer = np.frombuffer(payload, dtype=np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_COLOR)

async def process_images(urls):
    """Fetch and decode all ``urls`` concurrently over one shared HTTP session.

    One task per URL is scheduled up front; ``tqdm_asyncio.gather`` then waits
    for all of them while displaying a progress bar.

    Args:
        urls: Iterable of image URLs.

    Returns:
        The list produced by ``tqdm_asyncio.gather`` for the scheduled tasks.
    """
    async with aiohttp.ClientSession() as session:
        pending = [asyncio.create_task(openImage(link, session)) for link in urls]
        return await tqdm_asyncio.gather(*pending)

async def main(df):
    """Download the images referenced by the 'imageUrl' column of ``df``.

    Args:
        df: Frame with an 'imageUrl' column.

    Returns:
        Whatever ``process_images`` returns for the column's values as a list.
    """
    return await process_images(df['imageUrl'].tolist())

if __name__ == "__main__":
    # Bind the result so the downloaded images stay available to later cells
    # instead of being discarded as soon as the run finishes.
    result_images = asyncio.run(main(df))


100%|██████████| 1040/1040 [00:23<00:00, 44.51it/s]
