## Initialize

In [157]:
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

# Make the project's ../src directory importable; the local `utils` and
# `preprocess` modules imported below are expected to live there.
sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Figure styling: serif (Times New Roman) text, 'cm' math font set.
mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

import utils
import preprocess

# Reload the local modules so that edits made on disk are picked up when this
# cell is re-run without restarting the kernel. This must happen before the
# `from ... import` lines below so they bind the reloaded objects.
importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split, random_split_aggr
# NOTE(review): the wildcard import is where `Preprocessor` and the transformer
# classes used in later cells come from; it hides their origin and can shadow
# other names. Consider importing the needed names explicitly.
from preprocess import *

# Load the training data from the project's JSON export.
df = fetch_train_data(path='../data/train_data_all_filled.json')

# Split into train/test frames (test_size=0.2).
# NOTE(review): no seed is passed here, so the split may differ between runs
# unless utils.train_test_split seeds internally -- confirm.
train_df, test_df = train_test_split(df, test_size=0.2)

# The preprocessor is cleansed in "train" mode on the training frame only;
# the test frame goes through the same cleansing without is_train.
prep = Preprocessor()
train_df = prep.cleanse(train_df, is_train=True)
test_df = prep.cleanse(test_df)

# Rows without a 'fit' label cannot be used for supervised training or
# evaluation. Rebind instead of using inplace=True so the frames returned by
# cleanse() are never mutated behind the reader's back (and no
# SettingWithCopy warning is triggered if they are slices).
train_df = train_df.dropna(subset=['fit'])
test_df = test_df.dropna(subset=['fit'])

describe_data(test_df)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,category,17553,0,3
item_name,object,17511,42,3604
brand,object,17518,35,515
category,object,17511,42,68
size,object,17553,0,137
size_main,object,16194,1359,52
size_suffix,object,2387,15166,5
size_scheme,object,17437,116,4
price,float64,17511,42,454
rented_for,object,16064,1489,8


## Transform data

In [158]:
# Column groups that several pipeline stages share. Every stage receives its
# own fresh list (via unpacking) so no two transformers alias the same object.
USER_BODY_COLS = ['weight', 'height', 'bust_size', 'cup_size']
ITEM_BODY_COLS = [
    'item_weight', 'item_height', 'item_bust_size', 'item_cup_size'
]
TARGET_ENCODED_COLS = ['brand', 'category', 'size_main']

prep.pipeline = [
    # --- stage 1: clean-up, encoding and item vectors ---
    DropColumns(cols=['user_name', 'review', 'review_summary', 'rating']),
    HandleSizeMapping(),
    OrdinalEncoder(cols=['fit', 'item_name', 'cup_size']),  # (necessary)
    MeanImputer(cols=[*USER_BODY_COLS]),  # (necessary)
    ComputeItemVectors(),
    # --- stage 2: feature scaling / encoding ---
    DropColumns(cols=['size_scheme', 'size']),
    OneHotEncoder(cols=['size_suffix', 'rented_for', 'body_type']),
    StandardScaler(cols=[*USER_BODY_COLS, *ITEM_BODY_COLS]),
    TargetEncoder(cols=[*TARGET_ENCODED_COLS],
                  target_cols=[*USER_BODY_COLS],
                  name='target_encoder'),
    DropColumns(cols=[*TARGET_ENCODED_COLS]),
    MinMaxScaler(cols=['age', 'price', 'usually_wear']),
    # Append the output of 'target_encoder' to the input of the next
    # transformer.
    SelectOutputColumns(target='target_encoder'),
    MeanImputer(cols=['age', *USER_BODY_COLS]),
    MedianImputer(cols=['price', 'usually_wear']),
    # 'item_name' is deliberately not one-hot encoded inside the pipeline.
]

# Work on copies so the cleansed frames stay available for re-running this
# cell; the pipeline is fitted on the training copy only.
train_df_prep = prep.fit_transform(train_df.copy())
test_df_prep = prep.transform(test_df.copy())

train_df_prep

<class 'preprocess.DropColumns'>
<class 'preprocess.HandleSizeMapping'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.ComputeItemVectors'>
Optimizing weights and thresholds, round 1
Iteration 0: loss = 14.699899673461914
Iteration 100: loss = 7.130328178405762
Iteration 200: loss = 4.596743583679199
Iteration 300: loss = 3.9746382236480713
Optimizing item vectors, round 1
Iteration 0: loss = 3.9704017639160156
Iteration 100: loss = 3.9702115058898926
Iteration 200: loss = 3.970022439956665
Iteration 300: loss = 3.9698333740234375
Optimizing weights and thresholds, round 2
Iteration 0: loss = 3.9698314666748047
Iteration 100: loss = 3.7763748168945312
Iteration 200: loss = 3.6078250408172607
Iteration 300: loss = 3.4580132961273193
Optimizing item vectors, round 2
Iteration 0: loss = 3.456603527069092
Iteration 100: loss = 3.4565675258636475
Iteration 200: loss = 3.4565324783325195
Iteration 300: loss = 3.4564969539642334
Optimizing weights and 

Unnamed: 0,fit,item_name,price,usually_wear,age,height,weight,bust_size,cup_size,size_bias,...,brand_bust_size,brand_cup_size,category_weight,category_height,category_bust_size,category_cup_size,size_main_weight,size_main_height,size_main_bust_size,size_main_cup_size
0,1,193,0.106092,0.080000,0.258427,0.799328,1.486153e-15,-0.111270,-0.887221,0.0,...,-0.217710,-0.078844,-0.001785,-0.019965,0.006224,0.025498,-0.449692,-0.107182,-0.372788,-0.195148
1,2,2753,0.293292,0.026667,0.348315,0.000000,-8.497469e-01,-1.187556,1.445851,-2.0,...,-0.029302,0.013082,0.070291,0.061561,0.040798,-0.007994,-0.640293,-0.193754,-0.506089,-0.230679
2,1,3756,0.132444,0.160000,0.573034,0.000000,3.835922e-01,0.965016,1.445851,2.0,...,-0.109378,-0.131329,-0.021097,-0.012278,-0.010151,-0.012040,1.338481,0.258225,1.106327,0.574419
3,1,2374,0.368583,0.106667,0.393258,0.799328,3.835922e-01,-0.111270,-0.303953,0.0,...,-0.001317,0.176474,0.070291,0.061561,0.040798,-0.007994,0.572816,0.229287,0.431045,0.250275
4,2,519,0.149555,0.106667,0.314607,-0.156860,8.579533e-01,-0.111270,3.195655,-1.0,...,0.107591,0.107073,0.073692,0.064940,0.041734,0.060221,-0.112403,0.099056,-0.093288,-0.051093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69917,1,2607,0.077687,0.000000,0.280899,-1.113047,1.486153e-15,-1.187556,-1.470489,-2.0,...,-0.051393,-0.002144,-0.001785,-0.019965,0.006224,0.025498,-0.952809,-0.481843,-0.719651,-0.392399
69918,1,586,0.120465,0.106667,0.382022,1.755516,1.486153e-15,-0.111270,0.279315,0.5,...,0.059137,0.041865,0.059882,0.019612,0.038804,0.001096,0.224981,0.120939,0.156000,0.098727
69919,2,3335,0.278234,0.026667,0.348315,0.000000,-1.276672e+00,-2.263841,0.279315,-1.5,...,-0.050860,-0.031520,0.070291,0.061561,0.040798,-0.007994,0.224981,0.120939,0.156000,0.098727
69920,1,3367,0.122177,0.080000,0.382022,-0.156860,-8.023108e-01,-0.111270,-0.887221,-1.5,...,0.254435,0.090483,-0.021097,-0.012278,-0.010151,-0.012040,-0.449692,-0.107182,-0.372788,-0.195148


In [159]:
# Summarise the transformed test frame (dtype / valid / NaN / unique counts).
# NOTE(review): in the recorded output 'item_name' still carries NaNs after
# the pipeline (142 rows) -- presumably items unseen during fitting; models
# that reject NaN input need this column handled first. Confirm.
describe_data(test_df_prep)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,int8,17553,0,3
item_name,float64,17411,142,3511
price,float64,17553,0,454
usually_wear,float64,17553,0,24
age,float64,17553,0,63
height,float64,17553,0,22
weight,float64,17553,0,153
bust_size,float64,17553,0,12
cup_size,float64,17553,0,14
size_bias,float64,17553,0,39


In [160]:
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder

# Dense one-hot encoder for 'item_name'; categories unseen at fit time are
# encoded as an all-zero row instead of raising.
# scikit-learn renamed the `sparse` argument to `sparse_output` in 1.2 and
# removed the old spelling in 1.4, so try the new name first and fall back
# for older installations.
try:
    encoder = SklearnOneHotEncoder(sparse_output=False,
                                   handle_unknown='ignore')
except TypeError:
    encoder = SklearnOneHotEncoder(sparse=False, handle_unknown='ignore')

# Numeric feature matrices: everything except the label and the raw
# 'item_name' code, which is replaced by its one-hot expansion below.
X_train, y_train = train_df_prep.drop(
    columns=['fit', 'item_name']).values, train_df_prep['fit'].values
X_test, y_test = test_df_prep.drop(
    columns=['fit', 'item_name']).values, test_df_prep['fit'].values

# Fit the encoder on the training items only, then reuse it for the test set.
item_name_train = encoder.fit_transform(train_df_prep[['item_name']].values)
item_name_test = encoder.transform(test_df_prep[['item_name']].values)

# Append the one-hot item columns to the right of the numeric features.
X_train = np.concatenate([X_train, item_name_train], axis=1)
X_test = np.concatenate([X_test, item_name_test], axis=1)


In [161]:
# Sanity check: display the assembled training matrix and label vector
# together with their shapes.
X_train, X_train.shape, y_train, y_train.shape

(array([[0.10609172, 0.08      , 0.25842697, ..., 0.        , 0.        ,
         0.        ],
        [0.29329227, 0.02666667, 0.34831461, ..., 0.        , 0.        ,
         0.        ],
        [0.13244353, 0.16      , 0.57303371, ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.27823409, 0.02666667, 0.34831461, ..., 0.        , 0.        ,
         0.        ],
        [0.12217659, 0.08      , 0.38202247, ..., 0.        , 0.        ,
         0.        ],
        [0.06570842, 0.10666667, 0.36149299, ..., 0.        , 0.        ,
         0.        ]]),
 (69922, 4135),
 array([1, 2, 1, ..., 2, 1, 1], dtype=int8),
 (69922,))

In [162]:
from sklearn.linear_model import LogisticRegression

# Baseline: scikit-learn logistic regression (max_iter raised to 5000) on the
# numeric features plus one-hot item columns.
# NOTE(review): random_split_aggr is defined in utils and not visible here;
# presumably it trains on several random splits and aggregates the
# predictions before scoring on the test set -- confirm.
clf = LogisticRegression(max_iter=5000)
random_split_aggr(clf, X_train, y_train, X_test, y_test)
# Alternative kept for reference: use only the one-hot item columns.
# random_split_aggr(clf, item_name_train, y_train, item_name_test, y_test)

(array([0, 1, 2], dtype=int8), array([ 9485, 16311, 11505]))
(array([0, 1, 2], dtype=int8), array([ 9485, 16311, 11505]))
(array([0, 1, 2], dtype=int8), array([ 9485, 16310, 11505]))


Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.621603,0.519448,0.573485,0.536164,0.63769,3065,10211,4277


## Profiling report

In [223]:
# from pandas_profiling import ProfileReport

# profile = ProfileReport(test_df, minimal=True)
# profile.to_notebook_iframe()

## OrdinalClassifier copied from StackOverflow

In [116]:
from sklearn import clone
from sklearn.linear_model import LogisticRegression


class OrdinalClassifier():
    """Ordinal classifier built from k - 1 binary "is y greater than" models.

    For the sorted class values V1 < V2 < ... < Vk, one clone of the base
    estimator is fitted per threshold to predict ``y > Vi``. Class
    probabilities are recovered from differences of those threshold
    probabilities.

    Parameters
    ----------
    clf : estimator
        Base binary classifier exposing ``fit`` and ``predict_proba``; it is
        cloned once per threshold and never fitted itself.

    Attributes
    ----------
    clfs : dict
        Maps threshold index ``i`` to the model fitted for ``y > Vi``.
    unique_class : numpy.ndarray
        Sorted class labels seen in ``fit``.
    """

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}

    def fit(self, X, y):
        """Fit one binary model per ordinal threshold.

        Parameters
        ----------
        X : array-like
            Training features, passed unchanged to the base estimator.
        y : array-like
            Ordinal labels; any values that support ``>`` comparison.

        Returns
        -------
        OrdinalClassifier
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``y`` contains fewer than two distinct classes.
        """
        y = np.asarray(y)
        # np.unique already returns the labels in sorted order.
        self.unique_class = np.unique(y)
        if self.unique_class.shape[0] < 2:
            raise ValueError(
                'OrdinalClassifier needs at least 2 distinct classes in y, '
                f'got {self.unique_class.shape[0]}')

        # Start from a clean slate so a refit never keeps stale models.
        self.clfs = {}
        for i in range(self.unique_class.shape[0] - 1):
            # for each k - 1 ordinal value we fit a binary classification problem
            binary_y = (y > self.unique_class[i]).astype(np.uint8)
            clf = clone(self.clf)
            clf.fit(X, binary_y)
            self.clfs[i] = clf
        return self

    def predict_proba(self, X):
        """Return per-class probabilities, one column per sorted class.

        Because the threshold models are fitted independently, an
        intermediate column can be slightly negative when the estimated
        ``Pr(y > Vi)`` is not monotone in ``i``; each row still sums to 1.
        """
        n_classes = self.unique_class.shape[0]
        # greater[i] = Pr(y > Vi) for every sample.
        greater = [
            self.clfs[i].predict_proba(X)[:, 1] for i in range(n_classes - 1)
        ]

        # V1 = 1 - Pr(y > V1)
        predicted = [1 - greater[0]]
        # Vi = Pr(y > Vi-1) - Pr(y > Vi)
        predicted.extend(greater[i - 1] - greater[i]
                         for i in range(1, n_classes - 1))
        # Vk = Pr(y > Vk-1)
        predicted.append(greater[-1])
        return np.vstack(predicted).T

    def predict(self, X):
        """Return the most probable class label for each sample.

        The argmax column index is mapped back through ``unique_class`` so
        the result is expressed in the original label values, not positions.
        """
        return self.unique_class[np.argmax(self.predict_proba(X), axis=1)]


# Ordinal model: a logistic regression (max_iter=2000) serves as the base
# estimator that OrdinalClassifier fits once per class threshold.
ordinal_base_clf = LogisticRegression(max_iter=2000)
model = OrdinalClassifier(ordinal_base_clf)

# Every column except the 'fit' label is used as a feature.
ordinal_features_train = train_df_prep.drop(columns='fit')
ordinal_features_test = test_df_prep.drop(columns='fit')

model.fit(ordinal_features_train, train_df_prep['fit'])
y_pred = model.predict(ordinal_features_test)

evaluate_model(test_df_prep['fit'], y_pred)

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.685595,0.404086,0.334269,0.275133,0.560648,9,11898,33


## Multinomial Logistic Regression with sklearn

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## Auto ML with PyCaret (Incorrect Metrics)

In [None]:
import warnings

# Import only the PyCaret entry points this notebook calls. A wildcard import
# of pycaret.classification also brings in its own `evaluate_model`, which
# would silently replace the `evaluate_model` imported from utils and break
# the evaluation calls in other cells.
from pycaret.classification import setup, compare_models, create_model

# Register the already-preprocessed frames with PyCaret. preprocess=False
# keeps PyCaret from applying its own transformations, and session_id fixes
# its random seed. Warnings raised during setup are suppressed for this call
# only.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    setup(
        data=train_df_prep,
        test_data=test_df_prep,
        target='fit',
        preprocess=False,
        session_id=0,
    )


Unnamed: 0,Description,Value
0,Session id,0
1,Target,fit
2,Target type,Multiclass
3,Original data shape,"(59827, 649)"
4,Transformed data shape,"(59827, 649)"
5,Transformed train set shape,"(47929, 649)"
6,Transformed test set shape,"(11898, 649)"
7,Numeric features,648


In [None]:
# Train PyCaret's candidate models on the data registered by setup() and
# keep what compare_models() returns as `best_model`.
# NOTE(review): the recorded run logs repeated LightGBM failures ("Do not
# support special JSON characters in feature name") and LightGBM is absent
# from the results table -- presumably some column names contain such
# characters; confirm and rename the columns if LightGBM is wanted.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6981,0.6676,0.6981,0.6577,0.6252,0.1582,0.2125,13.683
ridge,Ridge Classifier,0.6965,0.0,0.6965,0.6552,0.6188,0.1442,0.2011,0.26
rf,Random Forest Classifier,0.693,0.6697,0.693,0.6711,0.5976,0.0894,0.1595,7.189
lda,Linear Discriminant Analysis,0.6921,0.6654,0.6921,0.6463,0.6377,0.1922,0.2245,3.918
svm,SVM - Linear Kernel,0.692,0.0,0.692,0.6429,0.6032,0.1111,0.1712,1.751
gbc,Gradient Boosting Classifier,0.6886,0.6495,0.6886,0.7034,0.5724,0.0373,0.1124,27.369
ada,Ada Boost Classifier,0.6854,0.609,0.6854,0.6519,0.5695,0.0305,0.0864,2.476
et,Extra Trees Classifier,0.6848,0.6618,0.6848,0.6371,0.6284,0.1629,0.1938,10.424
dummy,Dummy Classifier,0.6827,0.5,0.6827,0.4661,0.554,0.0,0.0,0.032
knn,K Neighbors Classifier,0.6407,0.5858,0.6407,0.5806,0.5939,0.1053,0.115,13.877


[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not

In [None]:
# Fit a single PyCaret logistic regression ('lr') with cross-validation
# disabled.
# NOTE(review): `model` is also the name bound by the OrdinalClassifier and
# PyTorch sections, so its meaning depends on which cell ran last.
model = create_model('lr', cross_validation=False)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Test,0.6988,0.6784,0.6988,0.6519,0.6261,0.1541,0.2057


## Logistic Regression using PyTorch

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim

# Training tensors: every column except the 'fit' label is a float32
# feature; the label itself becomes an int64 class index.
inputs = torch.tensor(train_df_prep.drop(columns='fit').values,
                      dtype=torch.float32)
labels = torch.tensor(train_df_prep['fit'].values, dtype=torch.long)

# One input unit per feature column; three output units (one per fit class).
input_dim = inputs.shape[1]
output_dim = 3


# Define the model
class LogisticRegression(nn.Module):
    """Multinomial logistic regression expressed as a single linear layer.

    The module outputs raw logits; no softmax is applied here.

    NOTE(review): this class has the same name as scikit-learn's
    ``LogisticRegression`` imported in other cells, so whichever definition
    was executed last wins in the notebook namespace.
    """

    def __init__(self, input_dim, output_dim):
        """Create the ``input_dim -> output_dim`` linear layer."""
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the class logits for the batch ``x``."""
        logits = self.linear(x)
        return logits

# Training hyper-parameters.
num_epochs = 100
learning_rate = 0.01
lamda = 1  # weight of the norm penalty on the linear layer's weights

# Define the loss function and the optimizer. The optimizer now reads
# `learning_rate` instead of a duplicated literal, so changing the
# hyper-parameter above actually takes effect.
model = LogisticRegression(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Train the model: each epoch is one full-batch gradient step on `inputs`.
for epoch in range(num_epochs):
    # Forward pass: cross-entropy plus the (unsquared) norm of the weights.
    outputs = model(inputs)
    loss = criterion(outputs, labels) + lamda * torch.norm(model.linear.weight)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Test the model: predict the class with the largest logit. Gradients are not
# needed here, and no throwaway `_` is assigned so IPython's last-output
# variable is left alone.
with torch.no_grad():
    test_logits = model(
        torch.tensor(test_df_prep.drop('fit', axis=1).values,
                     dtype=torch.float32))
    predicted = torch.argmax(test_logits, dim=1)
    y_pred = predicted.numpy()

evaluate_model(test_df_prep['fit'], y_pred)


Epoch [10/100], Loss: 2.0073
Epoch [20/100], Loss: 1.8807
Epoch [30/100], Loss: 1.7599
Epoch [40/100], Loss: 1.6436
Epoch [50/100], Loss: 1.5312
Epoch [60/100], Loss: 1.4224
Epoch [70/100], Loss: 1.3169
Epoch [80/100], Loss: 1.2148
Epoch [90/100], Loss: 1.1166
Epoch [100/100], Loss: 1.0236


Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.686851,0.22895,0.333333,0.271453,0.559343,0,11940,0
