In [1]:
import preprocess as P

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Global matplotlib text styling: serif family resolved to Times New Roman,
# with the Computer Modern ('cm') font set for math text.
mpl.rcParams.update({
    'font.family': ['serif'],
    'font.serif': ['Times New Roman'],
    'mathtext.fontset': 'cm',
})

In [6]:
import importlib

# Re-execute the local `preprocess` module so that edits made to it since the
# first import are picked up without restarting the kernel.
importlib.reload(P)
preproc = P.Preprocessor()

# Keep the frame as read from disk under its own name so the transform can be
# re-applied (or the raw data inspected) without re-reading the file.
# NOTE(review): assumes `Preprocessor.transform` returns a new frame rather
# than modifying its argument in place -- confirm against preprocess.py.
train_raw = pd.read_json('../data/train_data.json')
train_df = preproc.transform(train_raw)

# Fixed seed so the preview shows the same five rows on every run.
train_df.sample(5, random_state=0)

Unnamed: 0,fit,item_name1,item_name2,size,price,rented_for,usually_wear,age,height,weight,body_type,bust_size,cup_size
21445,True to Size,Sandro,Gael Pleated Skort,38,325.0,,4.0,31.0,,,HOURGLASS,32.0,DD
66360,True to Size,MINKPINK,Running Wild Sweater,S,89.0,Other,2.0,41.0,162.56,,,32.0,D
36040,,VINCE.,Tuxedo Jacket,4,445.0,Work,2.0,35.0,165.1,61.23497,,,
49849,,Slate & Willow,Slate Cold Shoulder Blouse,L,165.0,Work,8.0,34.0,165.1,68.038855,,,
60988,,Unreal Fur,Wild at Heart Faux Fur Vest,S,339.0,Work,6.0,56.0,165.1,,,34.0,DD


In [3]:
# Summarize the transformed frame: per-column non-null counts and dtypes,
# plus the overall memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87576 entries, 0 to 87765
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   fit           59827 non-null  category
 1   item_name1    87402 non-null  category
 2   item_name2    87576 non-null  category
 3   size          81218 non-null  category
 4   price         87576 non-null  float64 
 5   rented_for    80076 non-null  category
 6   usually_wear  87013 non-null  float64 
 7   age           85416 non-null  float64 
 8   height        55981 non-null  float64 
 9   weight        70069 non-null  float64 
 10  body_type     61062 non-null  category
 11  bust_size     73761 non-null  float64 
 12  cup_size      73761 non-null  category
dtypes: category(7), float64(6)
memory usage: 5.7 MB


In [6]:
# For every `item_name1` group, collect its distinct `item_name2` values and
# count them, then order the groups from most to fewest distinct values.
names_per_group = train_df.groupby('item_name1')['item_name2'].unique()
n_names_per_group = names_per_group.apply(len).sort_values(ascending=False)

# `to_string()` renders every row, so the printout is not truncated.
print(n_names_per_group.to_string())

item_name1
Slate & Willow                    45
Louna                             44
Veronica Beard                    43
A.L.C.                            41
Rebecca Taylor                    41
Marissa Webb Collective           39
Jonathan Simkhai                  38
Parker                            38
Scotch & Soda                     36
Tory Burch                        36
Trina Turk                        36
Club Monaco                       36
Tibi                              34
Rails                             33
VINCE.                            33
Tanya Taylor                      32
BLANKNYC                          32
Madewell                          32
AMUR                              31
Peter Som Collective              31
Central Park West                 31
Black Halo                        31
J.Crew                            30
Milly                             30
Joie                              29
Derek Lam 10 Crosby               29
Ulla Johnson               

In [7]:
# List the distinct `size` values recorded for each `item_name2`.
sizes_per_name = train_df.groupby('item_name2')['size'].unique()

# `to_string()` renders every row, so the printout is not truncated.
print(sizes_per_name.to_string())

item_name2
60s Sweater Vest                                                                             [S]
70s High Flare Jeans                                                            [26, 28, 31, 27]
70s Pocket Mini Skirt                                                           [27, 29, 26, 30]
835 Mid Rise Crop Skinny Maternity Jeans                               [26R, 31R, 25R, 27R, 29R]
90s Icon Snake Faux Leather Pants                                 [8, <NA>, 16, 10, 2, 6, 14, 4]
A Foreign Affair Romper                                                [6, <NA>, 0, 8, 10, 2, 4]
A Touch Of Magic Sweater                                                 [S, <NA>, XL, M, L, XS]
Aaliyah Jumpsuit                                                    [SR, LR, XLR, <NA>, MR, XSR]
Abbot Sweater                                                                [XS, L, M, S, <NA>]
Abel Patchwork Sweater                                                         [S, XS, M, L, XL]
Abril 2 Sweater    

In [33]:
# Keep only rows that have no missing value in any column; this also removes
# every row whose `fit` label is missing.
df = train_df.dropna()

# split train and test
from sklearn.model_selection import train_test_split

SEED = 0         # shared by the split and the classifier
TEST_SIZE = 0.2  # fraction of rows held out for evaluation

# Stratify on the label so both partitions keep the same class proportions.
# A plain random split can skew rare classes between train and test, and a
# class that lands only in the test set would make `le.transform` below raise.
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='fit'),
                                                    df['fit'],
                                                    test_size=TEST_SIZE,
                                                    random_state=SEED,
                                                    stratify=df['fit'])

# feature engineering
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Encode the class labels as integers. The encoder is fitted on the training
# labels only and then reused, unchanged, for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Features are routed by dtype. ColumnTransformer drops unlisted columns by
# default (remainder='drop'), so a column that is neither float64 nor
# category would be silently left out of the model.
num_cols = X_train.select_dtypes(include=['float64']).columns
cat_cols = X_train.select_dtypes(include=['category']).columns

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

# handle_unknown='ignore': a category seen only at transform time is encoded
# as all zeros instead of raising.
cat_pipeline = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])

# Fit the preprocessing on the training split only, then apply it to the test
# split, so no test-set statistics leak into the features.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)

# train model
from sklearn.linear_model import LogisticRegression

# NOTE(review): `multi_class` is deprecated in scikit-learn >= 1.5 and slated
# for removal; it is kept here so behaviour on the installed version does not
# change. Confirm the installed version before dropping the argument.
log_reg = LogisticRegression(multi_class='multinomial',
                             random_state=SEED,
                             max_iter=1000)
log_reg.fit(X_train_prepared, y_train)

# evaluate model
from sklearn.metrics import accuracy_score

y_train_pred = log_reg.predict(X_train_prepared)
y_test_pred = log_reg.predict(X_test_prepared)

# Reference point: test accuracy obtained by always predicting the most
# frequent training class. Accuracy alone is hard to read without it.
majority_class = np.bincount(y_train).argmax()
baseline_accuracy = np.mean(y_test == majority_class)

print('Train accuracy:', accuracy_score(y_train, y_train_pred))
print('Test accuracy:', accuracy_score(y_test, y_test_pred))
print('Majority-class baseline (test):', baseline_accuracy)