In [1]:
import pandas as pd
import numpy as np
import json
from tqdm.auto import tqdm
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2023-08-12 13:40:42.729582: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the raw tab-separated export, using its first column as the index.
file_path = "/Users/FranklinZhao/TensorFlowProjects/ImageBasedSneakerPrediction/data/rawoutput.csv"
df_ori = pd.read_csv(file_path, sep='\t', index_col=0)
# Work on an independent copy so the frame as loaded stays available as df_ori.
df = df_ori.copy(deep=True)

In [3]:
# remove unnecessary data
cols_to_drop = ['urlKey', 'id', 'name', 'description', 'model', 'market', 'condition', 'productCategory', 'listingType', 'browseVerticals', 'favorite', 'variants']

# Build df from the untouched df_ori rather than from df itself: the cell then
# gives the same result every time it is run, instead of raising KeyError on a
# second run because the columns were already dropped.
df = (
    df_ori
    .drop(columns=cols_to_drop)
    # remove rows with missing data (any NaN in the remaining columns)
    .dropna(axis=0, how='any')
)

In [4]:
# clean to extract image url and retail price
def cleanImageUrl(url):
    """Cut ``url`` off right after its first ``.jpg``.

    Everything following the first ``.jpg`` (for example a query string) is
    discarded. A URL that does not contain ``.jpg`` at all is returned
    unchanged instead of having a ``.jpg`` suffix glued onto it.

    Parameters
    ----------
    url : str
        The address to trim.

    Returns
    -------
    str
        ``url`` up to and including the first ``.jpg``, or ``url`` itself
        when no ``.jpg`` is present.
    """
    # partition() yields an empty separator when '.jpg' is absent, so
    # head + marker is then simply the original string.
    head, marker, _ = url.partition('.jpg')
    return head + marker

import ast


def _parse_record(text):
    """Parse a stringified dict/list cell into Python objects.

    The text is first read as a Python literal, which copes with values that
    contain apostrophes. If that fails (e.g. the text uses JSON-only tokens),
    fall back to the original approach of swapping single quotes for double
    quotes and decoding as JSON.
    """
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        return json.loads(text.replace("\'", "\""))


# Thumbnail address stored under the "thumbUrl" key of the 'media' column,
# trimmed with cleanImageUrl.
df['imageUrl'] = df['media'].apply(lambda x: _parse_record(x)["thumbUrl"])
df['imageUrl'] = df['imageUrl'].apply(cleanImageUrl)

# "value" of the first entry in 'productTraits'.
# NOTE(review): presumably the first trait is always the retail price - confirm against the raw data.
df['retailPrice'] = df['productTraits'].apply(lambda x: _parse_record(x)[0]["value"])

# The source columns are no longer needed once the two fields are extracted.
df = df.drop(['media', 'productTraits'], axis=1)

In [5]:
import asyncio
import aiohttp
import cv2
import numpy as np
from tqdm.asyncio import tqdm_asyncio
import nest_asyncio
# Patch asyncio so that asyncio.run() can be called from inside the notebook,
# where an event loop is presumably already running.
nest_asyncio.apply()

async def openImage(url, session, size=(256, 256)):
    """Download one image and return it as a scaled float array.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    session : aiohttp.ClientSession
        Open session used to issue the GET request.
    size : tuple of int, optional
        Target size handed to ``cv2.resize``; defaults to ``(256, 256)``.

    Returns
    -------
    numpy.ndarray
        The image decoded with ``cv2.IMREAD_COLOR``, resized to ``size`` with
        ``cv2.INTER_AREA`` interpolation, and divided by 255.0.

    Raises
    ------
    aiohttp.ClientResponseError
        If the server answers with an HTTP error status.
    ValueError
        If the response body is empty or cannot be decoded as an image.
    """
    async with session.get(url) as response:
        # Fail fast on an HTTP error instead of trying to decode an error page.
        response.raise_for_status()
        payload = await response.read()

    if not payload:
        raise ValueError(f"Empty response body for image URL: {url}")

    buffer = np.asarray(bytearray(payload), dtype="uint8")
    image = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
    # cv2.imdecode reports undecodable data by returning None, which would
    # otherwise surface as an obscure error inside cv2.resize.
    if image is None:
        raise ValueError(f"Could not decode image from URL: {url}")

    image = cv2.resize(image, size, interpolation=cv2.INTER_AREA)
    return image / 255.0

async def process_images(urls):
    """Fetch every URL in ``urls`` concurrently through one shared session.

    One task running ``openImage`` is scheduled per URL; the tasks are then
    awaited together with ``tqdm_asyncio.gather``, whose result is returned.
    """
    async with aiohttp.ClientSession() as http:
        # Schedule all downloads up front so they run concurrently.
        pending = [asyncio.create_task(openImage(link, http)) for link in urls]
        return await tqdm_asyncio.gather(*pending)

async def main(df):
    """Run ``process_images`` over the ``imageUrl`` column of ``df``.

    The column is converted to a plain list first; whatever
    ``process_images`` produces for that list is returned.
    """
    return await process_images(df['imageUrl'].tolist())

# Run the async pipeline over df and keep the result.
# NOTE(review): despite its name, img_df holds whatever main() returns (the
# gathered openImage results), not a DataFrame.
if __name__ == "__main__":
    img_df = asyncio.run(main(df))


100%|██████████| 1040/1040 [00:33<00:00, 30.84it/s]


In [6]:
# Write the gathered images to disk; np.save appends ".npy" because the
# filename given here has no extension.
# NOTE(review): absolute, user-specific path - will not exist on other machines.
np.save("/Users/FranklinZhao/TensorFlowProjects/ImageBasedSneakerPrediction/data/cleaned/images", img_df)

In [9]:
# Convert the 'last sale' column to a NumPy array and write it next to the
# images file (np.save appends ".npy" to the extension-less filename).
# NOTE(review): prices_df is an ndarray, not a DataFrame; it is taken from the
# same df that supplied the image URLs, so row order should match img_df as
# long as df was not modified in between - confirm.
prices_df = np.asarray(df['last sale'])
np.save("/Users/FranklinZhao/TensorFlowProjects/ImageBasedSneakerPrediction/data/cleaned/prices", prices_df)

In [10]:
df

Unnamed: 0,title,brand,gender,last sale,imageUrl,retailPrice
0,Mihara Yasuhiro Hank OG Sole Canvas Low Black,Mihara Yasuhiro,men,323,https://images.stockx.com/images/Mihara-Yasuhi...,220
1,Mihara Yasuhiro Blakey OG Sole Canvas Low Black,Mihara Yasuhiro,men,278,https://images.stockx.com/images/Mihara-Yasuhi...,220
2,Louis Vuitton LV Trainer White Black,Louis Vuitton,men,1130,https://images.stockx.com/images/Louis-Vuitton...,1220
3,Jordan 1 Mid Sneaker School Game Winner (GS),Jordan,child,94,https://images.stockx.com/images/Air-Jordan-1-...,120
4,Burberry Regis Archive Beige White,Burberry,men,456,https://images.stockx.com/images/Burberry-Regi...,770
...,...,...,...,...,...,...
1035,Dior B23 Low Top Logo Oblique,Dior,men,980,https://images.stockx.com/images/Dior-B23-Low-...,950
1036,Jordan 1 Mid Sneakersnstuff 20th Anniversary,Jordan,men,129,https://images.stockx.com/images/Air-Jordan-1-...,140
1037,Nike Air Max 1 SNKRS Day Brown,Nike,men,277,https://images.stockx.com/images/Nike-Air-Max-...,140
1038,Lanvin Curb Sneaker White Multicolor,Lanvin,men,399,https://images.stockx.com/images/Lanvin-Curb-S...,890
