In [1]:
import os
import sys
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import pandas as pd
import requests
from PIL import Image
from tqdm.notebook import tqdm

# Dependencies: requests, pandas, pillow, tqdm


In [2]:
# Load the article metadata; dtype=str reads every column as text.
articles_csv = "h-and-m-personalized-fashion-recommendations/articles.csv"
df = pd.read_csv(articles_csv, dtype=str)
df.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


## Clean Data

In [3]:
# Columns removed from the article table before any further processing.
columns_to_drop = [
    "product_code",
    "product_type_no",
    "graphical_appearance_no",
    "graphical_appearance_name",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_value_name",
    "perceived_colour_master_id",
    "colour_group_name",
    "department_no",
    "department_name",
    "index_code",
    "index_group_no",
    "section_no",
    "section_name",
    "garment_group_no",
    "garment_group_name",
    "detail_desc",
    "index_name",
]

# Drop the listed columns and rename the remaining colour column to "color".
# Both calls return new frames, so `df` itself is left untouched.
df_dropped = df.drop(columns=columns_to_drop).rename(
    columns={"perceived_colour_master_name": "color"}
)
df_dropped.head()

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,color,index_group_name
0,108775015,Strap top,Vest top,Garment Upper body,Black,Ladieswear
1,108775044,Strap top,Vest top,Garment Upper body,White,Ladieswear
2,108775051,Strap top (1),Vest top,Garment Upper body,White,Ladieswear
3,110065001,OP T-shirt (Idro),Bra,Underwear,Black,Ladieswear
4,110065002,OP T-shirt (Idro),Bra,Underwear,White,Ladieswear


In [4]:
print("items before dropping: ", len(df))

# Remove items whose product group is "Unknown".
in_unknown_group = df_dropped["product_group_name"].isin(["Unknown"])
df_dropped = df_dropped[~in_unknown_group]

# Keep only product types that occur more than 10 times in the remaining rows.
product_type_counts = Counter(df_dropped["product_type_name"].tolist())
most_frequent_product_types = [
    product_type
    for product_type, occurrences in product_type_counts.items()
    if occurrences > 10
]
is_frequent_type = df_dropped["product_type_name"].isin(most_frequent_product_types)
df_dropped = df_dropped[is_frequent_type]

# Renumber the rows from 0 without keeping the old index as a column.
df_dropped = df_dropped.reset_index(drop=True)
print("items after dropping: ", len(df_dropped))

items before dropping:  105542
items after dropping:  105277


### Remove items with no image

In [5]:
# Root folder of the article images. Each image is looked up at
# <first three characters of article_id>/<article_id>.jpg below this folder.
# Named `image_dir` rather than `dir` so the builtin `dir()` is not shadowed.
image_dir = "h-and-m-personalized-fashion-recommendations/images/"

# One boolean per row: True when the image file exists on disk.
# Iterating over the column values, instead of looking each row up by its
# label, means the check does not depend on the index being exactly 0..n-1.
valid_images = [
    os.path.isfile(f"{image_dir}{article_id[0:3]}/{article_id}.jpg")
    for article_id in tqdm(df_dropped["article_id"])
]

# Filter with an index-aligned boolean mask. This avoids adding a temporary
# "valid_image" column and then dropping it in place on a filtered frame,
# and keeps the cell safe to re-run. The explicit bool dtype keeps the mask
# valid even when the frame is empty.
has_image = pd.Series(valid_images, index=df_dropped.index, dtype=bool)
df_dropped = df_dropped[has_image].reset_index(drop=True)
print("items after dropping: ", len(df_dropped))

  0%|          | 0/105277 [00:00<?, ?it/s]

items after dropping:  104835


## Add links to images

In [8]:
# Disabled step, kept for reference only: the code below is wrapped in a
# string literal, so none of it runs and the cell merely echoes the string.
"""URL = "https://d11p8vtjlacpl4.cloudfront.net/kaggle-hm-images/{}/{}.jpg"
links = [URL.format(df_dropped['article_id'][index][0:3], df_dropped['article_id'][index]) for index in range(len(df_dropped))]
df_dropped["image"] = links
df_dropped"""

# Image links are now built on the client side.

'URL = "https://d11p8vtjlacpl4.cloudfront.net/kaggle-hm-images/{}/{}.jpg"\nlinks = [URL.format(df_dropped[\'article_id\'][index][0:3], df_dropped[\'article_id\'][index]) for index in range(len(df_dropped))]\ndf_dropped["image"] = links\ndf_dropped'

## Add href to H&M page

In [9]:
# Disabled step, kept for reference only: the code below is wrapped in a
# string literal, so none of it runs and the cell merely echoes the string.
"""item_URL = "https://www2.hm.com/en_gb/productpage.{}.html"

item_links = []
for index in tqdm(range(len(df_dropped))):
    item_links.append(item_URL.format(df_dropped['article_id'][index]))
df_dropped["item_link"] = item_links
df_dropped"""


# Product page links are now built on the client side.

'item_URL = "https://www2.hm.com/en_gb/productpage.{}.html"\n\nitem_links = []\nfor index in tqdm(range(len(df_dropped))):\n    item_links.append(item_URL.format(df_dropped[\'article_id\'][index]))\ndf_dropped["item_link"] = item_links\ndf_dropped'

In [10]:
# Display the cleaned frame as the cell output.
df_dropped

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,color,index_group_name
0,0108775015,Strap top,Vest top,Garment Upper body,Black,Ladieswear
1,0108775044,Strap top,Vest top,Garment Upper body,White,Ladieswear
2,0108775051,Strap top (1),Vest top,Garment Upper body,White,Ladieswear
3,0110065001,OP T-shirt (Idro),Bra,Underwear,Black,Ladieswear
4,0110065002,OP T-shirt (Idro),Bra,Underwear,White,Ladieswear
...,...,...,...,...,...,...
104830,0953450001,5pk regular Placement1,Socks,Socks & Tights,Black,Menswear
104831,0953763001,SPORT Malaga tank,Vest top,Garment Upper body,Black,Ladieswear
104832,0956217002,Cartwheel dress,Dress,Garment Full body,Black,Ladieswear
104833,0957375001,CLAIRE HAIR CLAW,Hair clip,Accessories,Black,Divided


## Save to CSV

In [11]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
output_csv = "H&M_items.csv"
df_dropped.to_csv(output_csv, index=False)

In [12]:
# Print the size reported by sys.getsizeof for the cleaned frame, divided by
# 1,000,000 (bytes -> megabytes). `sys` is imported in the notebook's first
# cell together with the other modules, so no import is needed here.
print(f"{sys.getsizeof(df_dropped)/1000000} mb")


42.526027 mb


In [13]:
# (rows, columns) of the final cleaned frame.
df_dropped.shape

(104835, 6)