In [15]:
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Figure typography: serif text set in Times New Roman, with the 'cm'
# font set used for mathtext.
mpl.rcParams.update({
    'font.family': ['serif'],
    'font.serif': ['Times New Roman'],
    'mathtext.fontset': 'cm',
})

import utils
import preprocess

# Reload the local helper modules (presumably found via the '../src' entry
# appended to sys.path above) so that edits to them take effect without
# restarting the kernel. This must run before the `from ... import` lines
# that follow, so those names are bound from the reloaded modules.
importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split
from preprocess import Preprocessor

# Load the raw training table and split it with test_size=0.2
# (train_test_split comes from the local utils module; presumably a 20%
# hold-out -- confirm, including whether the split is seeded).
df = fetch_train_data()
train_df, test_df = train_test_split(df, test_size=0.2)

# Each split is cleansed by its own fresh Preprocessor instance; only the
# training split is flagged with is_train=True. Row labels are then
# renumbered from 0.
# NOTE(review): if Preprocessor learns anything from the training split, the
# test split does not see it because a new instance is created -- confirm.
train_df = Preprocessor().cleanse(train_df,
                                  is_train=True).reset_index(drop=True)
# train_df.dropna(subset=['fit'], inplace=True)

test_df = Preprocessor().cleanse(test_df).reset_index(drop=True)
# test_df.dropna(subset=['fit'], inplace=True)

describe_data(train_df)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,category,47929,21993,3
item_name,object,69922,0,4089
brand,object,69783,139,483
category,object,69922,0,73
size,object,69922,0,156
size_main,object,64883,5039,62
size_suffix,object,9597,60325,7
size_scheme,object,69909,13,4
price,float64,69922,0,467
user_name,object,69922,0,6648


In [48]:
# Ordinal rank of each letter-scheme size. Rows outside the letter scheme,
# and letter sizes that do not appear in `ordered_letter_sizes`, stay NaN.
train_df['size_order'] = np.nan

# this mapping is somewhat opinionated
# Aliases collapsed onto the canonical labels used in `ordered_letter_sizes`.
letter_sizes_mapping = {
    'XXXXS': '4XS',
    'XXXS': '3XS',
    'XXS': '2XS',
    'XXL': '2XL',
    'XXXL': '3XL',
    'XXXXL': '4XL',
    '0X': 'XL',
    '1X': '2XL',
    '2X': '3XL',
    '3X': '4XL',
    'P-S': 'XS-S'
}

# Canonical letter sizes from smallest to largest; the list position is the
# value written to `size_order`.
ordered_letter_sizes = [
    '4XS', '3XS', '2XS', 'XS', 'XS-S', 'S', 'S-M', 'M', 'M-L', 'L', 'L-XL',
    'XL', '2XL', '3XL', '4XL'
]

pos = train_df['size_scheme'] == 'letter'

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate object, which
# does not update `train_df` under pandas Copy-on-Write and warns in recent
# pandas 2.x releases. The replacement is exact-match (regex=False) and is
# applied to every row, not only the letter-scheme rows, as before.
train_df['size_main'] = train_df['size_main'].replace(letter_sizes_mapping,
                                                      regex=False)

# Sizes missing from the lookup map to NaN.
train_df.loc[pos, 'size_order'] = train_df.loc[pos, 'size_main'].map(
    {size: i
     for i, size in enumerate(ordered_letter_sizes)})

# Display the distinct ranks assigned to letter-scheme rows.
train_df.loc[pos, 'size_order'].sort_values().unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., nan])

In [34]:
# Scratch sanity check: np.isnan reports True for np.nan. The result is only
# displayed; nothing is assigned.
np.isnan(np.nan)

True

In [84]:
# Distinct `size_main` values observed for each item name.
item_size_mains = train_df.groupby(by='item_name')['size_main'].unique()
# Distinct item names observed under each size scheme.
size_scheme_items = train_df.groupby(by='size_scheme')['item_name'].unique()
# One empty (object-typed) slot per item, indexed like `item_size_mains`.
item_size_mappings = pd.Series(dtype=object, index=item_size_mains.index)


def parse_letter_size(size):
    """Return the sort rank of a letter size.

    The rank is the position of ``size`` in the notebook-level
    ``ordered_letter_sizes`` list. A value that is not in that list gets the
    fallback rank 7 (the position of 'M' in the list as defined in this
    notebook).
    """
    try:
        return ordered_letter_sizes.index(size)
    except ValueError:
        # Unrecognised label: use the fixed fallback rank.
        return 7


def parse_number_size(size):
    """Return a numeric sort key for a number-scheme size label.

    Args:
        size: A string starting with digits, either a single number
            ('38') or a hyphenated range ('38-40').

    Returns:
        The integer value for a single number, or the mean of the two
        endpoints for a range (e.g. 39.0 for '38-40').

    Raises:
        ValueError: If ``size`` does not start with a digit.
    """
    import re
    # The hyphen sits outside the capturing group. Capturing it together
    # with the digits made int() read the upper bound as a negative number,
    # so '38-40' averaged 38 and -40.
    match = re.match(r'(\d+)(?:-(\d+))?', size)
    if match is None:
        raise ValueError(f'cannot parse numeric size: {size!r}')

    low = int(match.group(1))
    if match.group(2) is None:
        return low
    return np.mean([low, int(match.group(2))])


def get_size_mapping(sizes, parse_func):
    """Map each non-missing size label to its centred rank.

    Args:
        sizes: Array-like of size labels that supports boolean-mask
            indexing; missing entries (per ``pd.isna``) are dropped.
        parse_func: Callable that turns a label into a sortable key.

    Returns:
        Dict from label to (rank of the label when sorted by ``parse_func``)
        minus (mean rank over the kept labels).
    """
    present = sizes[~pd.isna(sizes)]

    # Rank labels by their parsed key; sorted() is stable, so labels with
    # equal keys keep their input order.
    ranked = sorted(present, key=parse_func)
    rank_of = {label: rank for rank, label in enumerate(ranked)}

    # Centre the ranks on their mean.
    centre = np.mean([rank_of[label] for label in present])
    return {label: rank_of[label] - centre for label in present}


# Fill the per-item size mappings one scheme at a time. 'letter' and 'mixed'
# items are ordered with parse_letter_size, 'number' items with
# parse_number_size. The schemes are processed in this fixed order, so an
# item listed under several schemes keeps the mapping written last.
for scheme, parser in (('letter', parse_letter_size),
                       ('number', parse_number_size),
                       ('mixed', parse_letter_size)):
    pos = size_scheme_items.loc[scheme]
    item_size_mappings[pos] = item_size_mains[pos].apply(
        get_size_mapping, args=(parser, ))

In [26]:
# One indicator column per body type. dtype=float is requested explicitly so
# the indicators can hold NaN: newer pandas returns bool columns by default,
# which cannot store the missing-value marker assigned below.
body_type_dummies = pd.get_dummies(train_df['body_type'],
                                   dummy_na=True,
                                   dtype=float)
# Rows with no body type get NaN in every indicator (instead of all zeros) so
# they are later filled with the column means; the helper NaN column is then
# dropped.
body_type_dummies.loc[body_type_dummies[np.nan] == 1] = np.nan
body_type_dummies.drop(columns=[np.nan], inplace=True)

size_features_df = train_df[['height', 'weight',
                             'bust_size']].join([body_type_dummies])
# Categorical codes for cup size; -1 is the code pandas uses for a missing
# category and is turned back into NaN.
size_features_df['cup_size'] = train_df['cup_size'].cat.codes.replace(
    -1, np.nan)

# Fill missing keys BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which fillna('unknown') has nothing to fill.
group_key_columns = ['item_name', 'brand', 'category', 'size']
size_features_df[group_key_columns] = (
    train_df[group_key_columns].fillna('unknown').astype(str))

# Remaining gaps in the numeric features are filled with the column means.
size_features_df = size_features_df.fillna(
    size_features_df.mean(numeric_only=True))

# if brand or category is unknown, fall back to mean of all items
overall_size_features = size_features_df.groupby(
    ['size']).mean(numeric_only=True)
# brand size features
brand_size_features = size_features_df.groupby(
    ['brand', 'size']).mean(numeric_only=True)
# category size features
category_size_features = size_features_df.groupby(
    ['category', 'size']).mean(numeric_only=True)
# product size features
item_size_features = size_features_df.groupby(
    ['item_name', 'size']).mean(numeric_only=True)

# Displayed as the cell output.
item_size_features


Unnamed: 0_level_0,Unnamed: 1_level_0,height,weight,bust_size,APPLE,ATHLETIC,FULL BUST,HOURGLASS,PEAR,PETITE,STRAIGHT & NARROW,cup_size
item_name,size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
(nude)_Chevron Turtleneck Sweater,38,166.078905,55.035874,32.333333,0.005669,0.042690,0.013155,0.383882,0.021017,0.519080,0.014508,2.166667
(nude)_Chevron Turtleneck Sweater,40,165.433343,61.234970,34.041353,0.013604,0.302456,0.231572,0.321317,0.050441,0.045791,0.034819,4.104224
(nude)_Chevron Turtleneck Sweater,42,166.502239,63.325577,35.402256,0.022674,0.504093,0.052619,0.202195,0.084068,0.076319,0.058032,3.840374
(nude)_Chevron Turtleneck Sweater,44,165.735000,65.420890,33.551692,0.017006,0.378070,0.289465,0.151646,0.063051,0.057239,0.043524,5.130280
(nude)_Chevron Turtleneck Sweater,46,175.468339,71.874773,36.000000,0.017006,0.128070,0.039465,0.651646,0.063051,0.057239,0.043524,4.750000
...,...,...,...,...,...,...,...,...,...,...,...,...
soon maternity_Dream Maternity Capris,8,165.100000,64.370881,34.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
soon maternity_Lily Maternity Jumpsuit,M,169.042239,64.370881,34.206767,0.034011,0.256140,0.078929,0.303292,0.126102,0.114478,0.087047,3.521121
soon maternity_Lily Maternity Jumpsuit,S,164.531119,63.036261,34.735589,0.011337,0.418713,0.026310,0.101097,0.042034,0.371493,0.029016,3.507040
soon maternity_Lily Maternity Jumpsuit,XL,165.933358,90.718474,36.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,4.000000
