# DATASET PREPARATION FOR VAESTA PROJECT

Install required packages (runs only if missing)

In [1]:
!pip install pandas numpy matplotlib



Basic imports

In [2]:
import pandas as pd
import numpy as np
from PIL import Image
import os

In [7]:
DATA_ROOT = "data/DeepFashion"

In [11]:
fabric_file = DATA_ROOT + "/labels/texture/fabric_ann.txt"

fabric_raw = pd.read_csv(fabric_file, sep="\s+", header=None, names=["image","upper_code","lower_code","outer_code"])

fabric_map = {0:"denim",1:"cotton",2:"leather",3:"furry",4:"knitted",5:"chiffon",6:"other",7:"NA"}

fabric_raw["upper_fabric"] = fabric_raw["upper_code"].map(fabric_map)
fabric_raw["lower_fabric"] = fabric_raw["lower_code"].map(fabric_map)
fabric_raw["outer_fabric"] = fabric_raw["outer_code"].map(fabric_map)

fabric = fabric_raw[["image","upper_fabric","lower_fabric","outer_fabric"]]
fabric.head()

  fabric_raw = pd.read_csv(fabric_file, sep="\s+", header=None, names=["image","upper_code","lower_code","outer_code"])


Unnamed: 0,image,upper_fabric,lower_fabric,outer_fabric
0,MEN-Denim-id_00000080-01_7_additional.jpg,cotton,cotton,
1,MEN-Denim-id_00000089-01_7_additional.jpg,cotton,cotton,
2,MEN-Denim-id_00000089-02_7_additional.jpg,cotton,cotton,
3,MEN-Denim-id_00000089-03_7_additional.jpg,cotton,cotton,
4,MEN-Denim-id_00000089-04_7_additional.jpg,denim,cotton,


In [13]:
pattern_file = DATA_ROOT + "/labels/texture/pattern_ann.txt"
pattern_raw = pd.read_csv(pattern_file, sep="\s+", header=None, names=["image","upper_code","lower_code","outer_code"])

pattern_map = {0:"floral", 1:"graphic", 2:"striped", 3:"pure color", 4:"lattice", 5:"other", 6:"color block", 7:"NA"
}

pattern_raw["upper_pattern"] = pattern_raw["upper_code"].map(pattern_map)
pattern_raw["lower_pattern"] = pattern_raw["lower_code"].map(pattern_map)
pattern_raw["outer_pattern"] = pattern_raw["outer_code"].map(pattern_map)

pattern = pattern_raw[["image","upper_pattern","lower_pattern","outer_pattern"]]
pattern.head()

  pattern_raw = pd.read_csv(pattern_file, sep="\s+", header=None, names=["image","upper_code","lower_code","outer_code"])


Unnamed: 0,image,upper_pattern,lower_pattern,outer_pattern
0,MEN-Denim-id_00000080-01_7_additional.jpg,pure color,lattice,
1,MEN-Denim-id_00000089-01_7_additional.jpg,pure color,pure color,
2,MEN-Denim-id_00000089-02_7_additional.jpg,striped,pure color,
3,MEN-Denim-id_00000089-03_7_additional.jpg,pure color,pure color,
4,MEN-Denim-id_00000089-04_7_additional.jpg,pure color,pure color,


In [18]:
shape_file = DATA_ROOT +  "/labels/shape/shape_anno_all.txt"
shape_columns = [
    "image","sleeve_length","lower_length","socks","hat","glasses","neckwear",
    "wrist_wearing","ring","waist_accessories","neckline","cardigan","covers_navel"
]

shape_raw = pd.read_csv(shape_file, sep="\s+", header=None, names=shape_columns)

# Map codes to human-readable
sleeve_map = {0:"sleeveless",1:"short-sleeve",2:"medium-sleeve",3:"long-sleeve",4:"not-long-sleeve",5:"NA"}
lower_map  = {0:"three-point",1:"medium-short",2:"three-quarter",3:"long",4:"NA"}
binary_map = {0:"no",1:"yes",2:"NA"}
neckline_map = {0:"V-shape",1:"square",2:"round",3:"standing",4:"lapel",5:"suspenders",6:"NA"}
cardigan_map = {0:"yes",1:"no",2:"NA"}
navel_map = {0:"no",1:"yes",2:"NA"}

clothes_raw = shape_raw[["image"]].copy()
clothes_raw["sleeve"] = shape_raw["sleeve_length"].map(sleeve_map)
clothes_raw["lower"] = shape_raw["lower_length"].map(lower_map)
clothes_raw["socks_label"] = shape_raw["socks"].map({0:"no",1:"socks",2:"leggings",3:"NA"})
clothes_raw["hat_label"] = shape_raw["hat"].map(binary_map)
clothes_raw["neckwear_label"] = shape_raw["neckwear"].map(binary_map)
clothes_raw["wrist_label"] = shape_raw["wrist_wearing"].map(binary_map)
clothes_raw["ring_label"] = shape_raw["ring"].map(binary_map)
clothes_raw["waist_label"] = shape_raw["waist_accessories"].map({0:"no",1:"belt",2:"clothing",3:"hidden",4:"NA"})
clothes_raw["neckline_label"] = shape_raw["neckline"].map(neckline_map)
clothes_raw["cardigan_label"] = shape_raw["cardigan"].map(cardigan_map)
clothes_raw["covers_navel_label"] = shape_raw["covers_navel"].map(navel_map)

clothes_raw.head()


  shape_raw = pd.read_csv(shape_file, sep="\s+", header=None, names=shape_columns)


Unnamed: 0,image,sleeve,lower,socks_label,hat_label,neckwear_label,wrist_label,ring_label,waist_label,neckline_label,cardigan_label,covers_navel_label
0,MEN-Denim-id_00000080-01_7_additional.jpg,,long,no,no,no,no,no,hidden,round,no,yes
1,MEN-Denim-id_00000089-01_7_additional.jpg,sleeveless,long,no,no,no,no,no,hidden,round,no,yes
2,MEN-Denim-id_00000089-02_7_additional.jpg,long-sleeve,long,no,no,no,no,no,hidden,lapel,no,yes
3,MEN-Denim-id_00000089-03_7_additional.jpg,short-sleeve,long,no,no,no,no,no,hidden,round,no,yes
4,MEN-Denim-id_00000089-04_7_additional.jpg,long-sleeve,long,no,no,no,no,no,hidden,lapel,no,yes


In [20]:
# Merge fabric into shape before feature engineering
clothes_raw = clothes_raw.merge(fabric, on="image", how="left")
clothes_raw.head()

Unnamed: 0,image,sleeve,lower,socks_label,hat_label,neckwear_label,wrist_label,ring_label,waist_label,neckline_label,cardigan_label,covers_navel_label,upper_fabric,lower_fabric,outer_fabric
0,MEN-Denim-id_00000080-01_7_additional.jpg,,long,no,no,no,no,no,hidden,round,no,yes,cotton,cotton,
1,MEN-Denim-id_00000089-01_7_additional.jpg,sleeveless,long,no,no,no,no,no,hidden,round,no,yes,cotton,cotton,
2,MEN-Denim-id_00000089-02_7_additional.jpg,long-sleeve,long,no,no,no,no,no,hidden,lapel,no,yes,cotton,cotton,
3,MEN-Denim-id_00000089-03_7_additional.jpg,short-sleeve,long,no,no,no,no,no,hidden,round,no,yes,cotton,cotton,
4,MEN-Denim-id_00000089-04_7_additional.jpg,long-sleeve,long,no,no,no,no,no,hidden,lapel,no,yes,denim,cotton,


In [24]:
def compute_warmth(row):
    score = 0
    sleeve_scores = {"sleeveless":1,"short-sleeve":2,"medium-sleeve":3,"long-sleeve":4,"not-long-sleeve":2,"NA":2}
    score += sleeve_scores.get(row["sleeve"],2)
    if row["cardigan_label"]=="yes": score += 3
    outer = row.get("outer_fabric","NA")
    fabric_scores = {"denim":3,"cotton":2,"leather":4,"furry":5,"knitted":4,"chiffon":1,"other":2,"NA":2}
    score += fabric_scores.get(outer,2)
    return min(score,10)

def impermeability(row):
    outer = row.get("outer_fabric","NA")
    if outer in ["leather","denim"]: return 2
    if outer in ["knitted","cotton"]: return 1
    return 0

def comfort(row):
    score = 0
    if row["sleeve"] in ["short-sleeve","medium-sleeve"]: score += 2
    if row["neckline_label"] in ["round","V-shape"]: score += 2
    if row["covers_navel_label"]=="yes": score += 1
    return min(score,5)

clothes_raw["warmth_score"] = clothes_raw.apply(compute_warmth,axis=1)
clothes_raw["impermeability"] = clothes_raw.apply(impermeability,axis=1)
clothes_raw["comfort"] = clothes_raw.apply(comfort,axis=1)

clothes_raw.head()


Unnamed: 0,image,sleeve,lower,socks_label,hat_label,neckwear_label,wrist_label,ring_label,waist_label,neckline_label,cardigan_label,covers_navel_label,upper_fabric,lower_fabric,outer_fabric,warmth_score,impermeability,comfort
0,MEN-Denim-id_00000080-01_7_additional.jpg,,long,no,no,no,no,no,hidden,round,no,yes,cotton,cotton,,4,0,3
1,MEN-Denim-id_00000089-01_7_additional.jpg,sleeveless,long,no,no,no,no,no,hidden,round,no,yes,cotton,cotton,,3,0,3
2,MEN-Denim-id_00000089-02_7_additional.jpg,long-sleeve,long,no,no,no,no,no,hidden,lapel,no,yes,cotton,cotton,,6,0,1
3,MEN-Denim-id_00000089-03_7_additional.jpg,short-sleeve,long,no,no,no,no,no,hidden,round,no,yes,cotton,cotton,,4,0,5
4,MEN-Denim-id_00000089-04_7_additional.jpg,long-sleeve,long,no,no,no,no,no,hidden,lapel,no,yes,denim,cotton,,6,0,1


In [31]:
MASK_PATH = DATA_ROOT + "/segm/"
LABELS = {1:"top",2:"outer",3:"skirt",4:"dress",5:"pants",6:"leggings",11:"footwear"}

def extract_categories(mask_file):
    seg = np.array(Image.open(mask_file))
    present = np.unique(seg)
    categories = [LABELS[i] for i in present if i in LABELS]
    return ",".join(categories)

mask_records = []
for fn in os.listdir(MASK_PATH):
    if fn.endswith(".png"):
        # remove `_segm` to match original jpg name
        img_name = fn.replace("_segm.png", ".jpg")
        cats = extract_categories(os.path.join(MASK_PATH, fn))
        mask_records.append([img_name, cats])

mask_df = pd.DataFrame(mask_records, columns=["image", "categories"])
mask_df.head()

Unnamed: 0,image,categories
0,WOMEN-Dresses-id_00000731-12_4_full.jpg,"dress,footwear"
1,WOMEN-Sweaters-id_00000981-10_4_full.jpg,"top,pants,footwear"
2,WOMEN-Pants-id_00004223-02_1_front.jpg,"top,pants,footwear"
3,WOMEN-Dresses-id_00001076-02_7_additional.jpg,"dress,footwear"
4,WOMEN-Sweaters-id_00002608-05_7_additional.jpg,"top,pants,footwear"


In [32]:
df = clothes_raw \
    .merge(pattern, on="image", how="left") \
    .merge(mask_df, on="image", how="left")

df.head()

Unnamed: 0,image,sleeve,lower,socks_label,hat_label,neckwear_label,wrist_label,ring_label,waist_label,neckline_label,...,upper_fabric,lower_fabric,outer_fabric,warmth_score,impermeability,comfort,upper_pattern,lower_pattern,outer_pattern,categories
0,MEN-Denim-id_00000080-01_7_additional.jpg,,long,no,no,no,no,no,hidden,round,...,cotton,cotton,,4,0,3,pure color,lattice,,"top,pants,footwear"
1,MEN-Denim-id_00000089-01_7_additional.jpg,sleeveless,long,no,no,no,no,no,hidden,round,...,cotton,cotton,,3,0,3,pure color,pure color,,"top,pants,footwear"
2,MEN-Denim-id_00000089-02_7_additional.jpg,long-sleeve,long,no,no,no,no,no,hidden,lapel,...,cotton,cotton,,6,0,1,striped,pure color,,"top,pants,footwear"
3,MEN-Denim-id_00000089-03_7_additional.jpg,short-sleeve,long,no,no,no,no,no,hidden,round,...,cotton,cotton,,4,0,5,pure color,pure color,,"top,pants,footwear"
4,MEN-Denim-id_00000089-04_7_additional.jpg,long-sleeve,long,no,no,no,no,no,hidden,lapel,...,denim,cotton,,6,0,1,pure color,pure color,,"top,pants,footwear"


In [34]:
df_small = df.sample(500, random_state=42)
df_small.to_csv("products_small.csv", index=False)
df_small.head()

Unnamed: 0,image,sleeve,lower,socks_label,hat_label,neckwear_label,wrist_label,ring_label,waist_label,neckline_label,...,upper_fabric,lower_fabric,outer_fabric,warmth_score,impermeability,comfort,upper_pattern,lower_pattern,outer_pattern,categories
24378,WOMEN-Rompers_Jumpsuits-id_00000886-02_3_back.jpg,sleeveless,three-point,,no,yes,yes,,no,,...,knitted,knitted,,3,0,0,graphic,graphic,,
21978,WOMEN-Jackets_Coats-id_00000706-02_3_back.jpg,long-sleeve,,,yes,,no,yes,hidden,,...,,cotton,leather,8,2,0,,lattice,pure color,
22265,WOMEN-Jackets_Coats-id_00002332-01_1_front.jpg,long-sleeve,,,no,yes,yes,yes,no,V-shape,...,cotton,chiffon,knitted,10,1,3,graphic,graphic,pure color,
322,MEN-Jackets_Vests-id_00005097-05_4_full.jpg,long-sleeve,long,no,no,no,no,no,hidden,round,...,cotton,denim,cotton,9,1,3,pure color,pure color,pure color,"top,outer,pants,footwear"
32313,WOMEN-Tees_Tanks-id_00000676-02_2_side.jpg,sleeveless,three-point,,no,yes,yes,no,hidden,round,...,chiffon,denim,,3,0,3,graphic,pure color,,
