In [None]:
import ast
import json

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from tqdm import tqdm

In [None]:
# Read the raw tab-separated export; its first column is used as the index.
file_path = "/Users/FranklinZhao/TensorFlowProjects/ImageBasedSneakerPrediction/data/raw/5000_output.csv"
df_ori = pd.read_csv(file_path, sep='\t', index_col=0)

# Work on a copy so df_ori keeps the data exactly as it was loaded.
df = df_ori.copy(deep=True)

In [None]:
# Columns that are not needed for the image -> price dataset.
cols_to_drop = ['urlKey', 'id', 'name', 'description', 'model', 'market', 'condition', 'productCategory', 'listingType', 'browseVerticals', 'favorite', 'variants']

# Derive the cleaned frame from the untouched df_ori instead of from df itself.
# On a fresh top-to-bottom run the result is the same (df starts as a copy of
# df_ori), but the cell can now be re-run without a KeyError caused by the
# columns having already been dropped from df.
# Rows with a missing value in any remaining column are removed as well.
df = df_ori.drop(cols_to_drop, axis=1).dropna(axis=0, how='any')

In [None]:
# clean to extract image url and retail price
def cleanImageUrl(url):
    """Cut ``url`` off right after its first ``.jpg``.

    Everything following the first occurrence of ``.jpg`` (for example a
    query string) is discarded. If ``.jpg`` does not occur at all, the whole
    input is kept and ``.jpg`` is appended to it.

    Args:
        url: The URL string to trim.

    Returns:
        The text before the first ``.jpg`` followed by ``.jpg``.
    """
    stem, _, _ = url.partition('.jpg')
    return f"{stem}.jpg"

def _parse_cell(raw):
    """Turn one stringified dict/list cell into the object it spells out.

    The previous code swapped every single quote for a double quote before
    calling ``json.loads``, which suggests the cells hold Python-literal text
    rather than JSON. ``ast.literal_eval`` reads that format directly, so
    apostrophes inside values and ``None``/``True``/``False`` no longer break
    parsing. Text it rejects still goes through the old quote-swap route, so
    any cell that parsed before continues to parse.

    Args:
        raw: The cell text to parse.

    Returns:
        The parsed Python object (dict or list for the columns used here).

    Raises:
        json.JSONDecodeError: If neither parser accepts the text.
    """
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return json.loads(raw.replace("\'", "\""))


# Thumbnail URL from the media dict, trimmed by cleanImageUrl.
df['imageUrl'] = df['media'].apply(lambda cell: cleanImageUrl(_parse_cell(cell)["thumbUrl"]))

# NOTE(review): this takes the "value" of the FIRST productTraits entry as the
# retail price - confirm that the first trait really is the retail price.
df['retailPrice'] = df['productTraits'].apply(lambda cell: _parse_cell(cell)[0]["value"])

# The raw columns are no longer needed once the two fields are extracted.
df = df.drop(['media', 'productTraits'], axis=1)

In [None]:
import asyncio
import aiohttp
import cv2
import numpy as np
from tqdm.asyncio import tqdm_asyncio
import nest_asyncio
# Patch asyncio to tolerate nested event loops; presumably needed so that the
# asyncio.run() call further down works inside a notebook kernel whose own
# event loop is already running - confirm if this is ever run as a plain script.
nest_asyncio.apply()

async def openImage(url, session):
    """Download one image and return it resized to 128x128.

    Args:
        url: Address of the image to fetch.
        session: An open ``aiohttp.ClientSession`` (or compatible object)
            used to issue the GET request.

    Returns:
        The image decoded with ``cv2.IMREAD_COLOR`` and resized to
        ``(128, 128)`` using ``cv2.INTER_AREA`` interpolation.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an error
            status (raised by ``response.raise_for_status()``).
        ValueError: If the response body is empty or cannot be decoded as an
            image.
    """
    async with session.get(url) as response:
        # Fail with the HTTP status instead of trying to decode an error page.
        response.raise_for_status()
        payload = await response.read()

    if not payload:
        raise ValueError(f"Empty response body for image URL: {url}")

    # View the downloaded bytes as a uint8 vector without copying them.
    encoded = np.frombuffer(payload, dtype=np.uint8)
    image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    if image is None:
        # imdecode signals an undecodable buffer by returning None; without
        # this check the failure would only surface as an opaque resize error.
        raise ValueError(f"Could not decode image data from URL: {url}")

    return cv2.resize(image, (128, 128), interpolation=cv2.INTER_AREA)

async def process_images(urls):
    """Fetch every URL concurrently through one shared HTTP session.

    One task running ``openImage`` is scheduled per URL, and all of them are
    awaited together via ``tqdm_asyncio.gather`` (which displays progress).

    Args:
        urls: Iterable of image URLs.

    Returns:
        The gathered ``openImage`` results.
    """
    async with aiohttp.ClientSession() as session:
        pending = [
            asyncio.create_task(openImage(address, session))
            for address in urls
        ]
        return await tqdm_asyncio.gather(*pending)

async def main(df):
    """Download the image behind every ``imageUrl`` entry of ``df``.

    Args:
        df: DataFrame with an ``imageUrl`` column.

    Returns:
        Whatever ``process_images`` returns for the column's values as a list.
    """
    return await process_images(df['imageUrl'].tolist())


# Runs only when this code is the main module, not when it is imported.
if __name__ == "__main__":
    img_df = asyncio.run(main(df))


In [None]:
# Persist the downloaded images; np.save adds the ".npy" suffix because the
# target name does not already end with it.
np.save(
    file="/Users/FranklinZhao/TensorFlowProjects/ImageBasedSneakerPrediction/data/cleaned/5000_images",
    arr=img_df,
)

In [None]:
# Export the 'last sale' column as a plain NumPy array next to the images.
prices_df = np.asarray(df['last sale'])
np.save(
    file="/Users/FranklinZhao/TensorFlowProjects/ImageBasedSneakerPrediction/data/cleaned/5000_prices",
    arr=prices_df,
)