## Initialize

In [97]:
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

# Make ../src importable. The path is resolved against the current working
# directory, so the notebook must be started from its own folder.
sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Figure styling: serif text set in Times New Roman, Computer Modern ('cm')
# for math text.
mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

# Project-local modules (presumably located in ../src, see the path tweak above).
import utils
import preprocess

# Re-execute both modules so that edits made to them after the first import
# are picked up without restarting the kernel.
importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split
# NOTE(review): the wildcard import presumably provides Preprocessor and the
# transformer classes used in later cells; explicit names would make the
# origin of each class visible.
from preprocess import *

# Load the raw training data from the loader's default location.
# Alternative input that can be swapped in:
# df = fetch_train_data(path='../data/train_data_all_filled.json')
df = fetch_train_data()

# Hold out part of the data for evaluation (test_size=0.2).
# NOTE(review): no seed is passed here; confirm whether utils.train_test_split
# is deterministic, otherwise the split changes on every run.
train_df, test_df = train_test_split(df, test_size=0.2)

# The same Preprocessor instance cleans both splits; only the training call
# passes is_train=True.
prep = Preprocessor()
train_df = prep.cleanse(train_df, is_train=True)
test_df = prep.cleanse(test_df)

# Discard rows without a 'fit' label. Reassigning (instead of inplace=True)
# leaves the object returned by cleanse() untouched, so this cell cannot
# silently modify a frame that is referenced elsewhere.
train_df = train_df.dropna(subset=['fit'])
test_df = test_df.dropna(subset=['fit'])

describe_data(test_df)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,category,11940,0,3
item_name,object,11898,42,3451
brand,object,11910,30,505
category,object,11898,42,68
size,object,11940,0,124
size_main,object,10981,959,52
size_suffix,object,1621,10319,5
size_scheme,object,11861,79,4
price,float64,11898,42,443
rented_for,object,10476,1464,8


## Transform data

In [114]:
# Column groups that recur across several pipeline stages. Each stage gets its
# own fresh list built from these tuples, so no list object is shared.
body_cols = ('weight', 'height', 'bust_size', 'cup_size')
group_cols = ('brand', 'category', 'size_main')
item_cols = tuple(f'item_{col}' for col in body_cols)

# Stages run in list order.
prep.pipeline = [
    DropColumns(cols=['user_name', 'review', 'review_summary', 'rating']),
    HandleSizeMapping(),  # handle size mapping
    OrdinalEncoder(cols=['fit', 'item_name', 'cup_size']),  # (necessary)
    MeanImputer(cols=list(body_cols)),  # (necessary)
    ComputeItemVectors(),  # compute item vectors
    DropColumns(cols=['size_scheme', 'size']),
    OneHotEncoder(cols=['size_suffix', 'rented_for', 'body_type']),
    StandardScaler(cols=[*body_cols, *item_cols]),
    TargetEncoder(
        cols=list(group_cols),
        target_cols=list(body_cols),
        name='target_encoder',
    ),
    DropColumns(cols=list(group_cols)),
    MinMaxScaler(cols=['age', 'price', 'usually_wear']),
    # Append the output of 'target_encoder' to the input of the next transformer.
    SelectOutputColumns(target='target_encoder'),
    MeanImputer(cols=['age', *body_cols]),
    MedianImputer(cols=['price', 'usually_wear']),
    # Alternative to dropping the column: OneHotEncoder(cols=['item_name'])
    DropColumns(cols=['item_name']),
]

# Work on copies so the cleansed frames stay available for re-running this cell.
train_input = train_df.copy()
test_input = test_df.copy()

train_df_prep = prep.fit_transform(train_input)
test_df_prep = prep.transform(test_input)

# Quick check for leftover NaNs: describe_data(train_df_prep)['nan_count'].sum()
train_df_prep

<class 'preprocess.DropColumns'>
<class 'preprocess.HandleSizeMapping'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.ComputeItemVectors'>
Optimizing weights and thresholds, round 1
Iteration 0: loss = 15.833657264709473
Iteration 100: loss = 9.189557075500488
Iteration 200: loss = 5.168092250823975
Iteration 300: loss = 3.670894145965576
Optimizing item vectors, round 1
Iteration 0: loss = 3.6612110137939453
Iteration 100: loss = 3.660952568054199
Iteration 200: loss = 3.6606948375701904
Iteration 300: loss = 3.6604366302490234
Optimizing weights and thresholds, round 2
Iteration 0: loss = 3.6604340076446533
Iteration 100: loss = 3.268646478652954
Iteration 200: loss = 3.0392324924468994
Iteration 300: loss = 2.9223568439483643
Optimizing item vectors, round 2
Iteration 0: loss = 2.921560764312744
Iteration 100: loss = 2.9215593338012695
Iteration 200: loss = 2.921557903289795
Iteration 300: loss = 2.9215564727783203
Optimizing weights and thr

Unnamed: 0,fit,price,usually_wear,age,height,weight,bust_size,cup_size,size_bias,item_weight,...,brand_bust_size,brand_cup_size,category_weight,category_height,category_bust_size,category_cup_size,size_main_weight,size_main_height,size_main_bust_size,size_main_cup_size
0,1,0.368583,0.106667,0.364706,7.904861e-01,3.774598e-01,-1.241248e-01,-0.304541,-0.5,0.164854,...,0.161896,0.130963,0.063490,0.029358,0.051066,0.006728,0.576644,0.231175,0.426416,0.253125
1,0,0.010609,0.133333,0.282353,-1.480304e-01,1.491038e-15,9.601256e-01,3.225739,1.5,-0.149035,...,0.441871,0.704484,-0.020704,0.006889,-0.016700,-0.006287,0.723741,0.234654,0.569793,0.304305
2,0,0.116359,0.186667,0.364706,3.212278e-01,2.281141e+00,3.128626e+00,2.048979,2.5,1.212187,...,0.082089,-0.006059,0.009259,-0.020859,0.092215,0.012437,1.270719,0.273107,1.037903,0.616235
3,1,0.106092,0.106667,0.341176,5.250835e-15,3.774598e-01,9.601256e-01,0.283839,2.0,0.688766,...,-0.265243,-0.053276,0.009259,-0.020859,0.092215,0.012437,1.357631,0.260520,1.121602,0.593833
4,0,0.112936,0.053333,0.552941,-1.555805e+00,-8.123406e-01,-1.241248e-01,0.283839,0.0,-1.108209,...,-0.265243,-0.053276,-0.004253,-0.017865,0.004514,0.018947,0.108668,0.124218,0.057462,0.029988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47924,1,0.077687,0.000000,0.247059,-1.086547e+00,1.491038e-15,-1.208375e+00,-1.481301,-2.0,-0.890805,...,-0.073670,0.017738,-0.004253,-0.017865,0.004514,0.018947,-0.943375,-0.488109,-0.708921,-0.392179
47925,1,0.120465,0.106667,0.352941,1.729003e+00,1.491038e-15,-1.241248e-01,0.283839,2.5,1.195625,...,0.113955,0.047279,0.070045,0.027665,0.040939,0.003363,0.210663,0.110825,0.178598,0.077566
47926,2,0.278234,0.026667,0.317647,5.250835e-15,-1.288261e+00,-2.292626e+00,0.283839,-2.0,-0.793394,...,-0.322356,-0.229441,0.063490,0.029358,0.051066,0.006728,0.210663,0.110825,0.178598,0.077566
47927,1,0.122177,0.080000,0.352941,-1.480304e-01,-8.123406e-01,-1.241248e-01,-0.892921,-1.5,-0.676964,...,0.210352,0.097007,-0.020612,-0.009487,-0.007215,-0.010706,-0.449384,-0.096200,-0.376045,-0.207146


In [115]:
# Per-column summary of the transformed test set; the table below reports
# dtype, valid_count, nan_count and unique_count for each column.
describe_data(test_df_prep)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,int8,11940,0,3
price,float64,11940,0,443
usually_wear,float64,11940,0,24
age,float64,11940,0,62
height,float64,11940,0,22
weight,float64,11940,0,148
bust_size,float64,11940,0,12
cup_size,float64,11940,0,14
size_bias,float64,11940,0,34
item_weight,float32,11940,0,7683


## Profiling report

In [223]:
# from pandas_profiling import ProfileReport

# profile = ProfileReport(test_df, minimal=True)
# profile.to_notebook_iframe()

## OrdinalClassifier (adapted from Stack Overflow)

In [116]:
from sklearn import clone
from sklearn.linear_model import LogisticRegression


class OrdinalClassifier():
    """Ordinal classifier assembled from cumulative binary classifiers.

    For the sorted class values V1 < ... < VK, one clone of ``clf`` is fitted
    per threshold to estimate Pr(y > Vi) for i = 1..K-1. Per-class
    probabilities are then recovered as differences of consecutive estimates.

    Parameters
    ----------
    clf : estimator
        Unfitted binary classifier exposing ``fit`` and ``predict_proba``.
        It is cloned once per threshold and never fitted itself.

    Attributes
    ----------
    clfs : dict
        Maps threshold index ``i`` to the classifier fitted for Pr(y > Vi).
    unique_class : numpy.ndarray
        Sorted distinct labels seen by ``fit`` (set by ``fit``).
    """

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}

    def fit(self, X, y):
        """Fit one binary classifier per threshold between adjacent classes.

        Parameters
        ----------
        X : array-like
            Training features, passed unchanged to every cloned classifier.
        y : array-like
            Ordinal labels; their order is taken from ``>`` comparisons.

        Returns
        -------
        OrdinalClassifier
            ``self``, to allow call chaining.
        """
        y = np.asarray(y)
        # np.unique already returns the distinct values in sorted order.
        self.unique_class = np.unique(y)
        # Start from a clean slate so a refit with fewer classes cannot leave
        # classifiers from an earlier fit behind.
        self.clfs = {}
        # K classes need K-1 thresholds. This also covers the two-class case,
        # which would otherwise leave the model untrained.
        for i in range(self.unique_class.shape[0] - 1):
            binary_y = (y > self.unique_class[i]).astype(np.uint8)
            clf = clone(self.clf)
            clf.fit(X, binary_y)
            self.clfs[i] = clf
        return self

    def predict_proba(self, X):
        """Return per-class probabilities, one column per entry of ``unique_class``.

        The threshold models are fitted independently, so a difference of two
        estimates is not guaranteed to be non-negative; values are returned
        as computed, without clipping or renormalisation.
        """
        n_classes = self.unique_class.shape[0]
        if n_classes == 1:
            # Only one label was seen during fit: it is predicted with certainty.
            return np.ones((np.shape(X)[0], 1))
        # Column 1 of each binary model's output is Pr(y > V_k).
        exceed = {k: self.clfs[k].predict_proba(X)[:, 1] for k in self.clfs}
        predicted = []
        for i in range(n_classes):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - exceed[i])
            elif i in exceed:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                predicted.append(exceed[i - 1] - exceed[i])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(exceed[i - 1])
        return np.vstack(predicted).T

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        The argmax column index is mapped back through ``unique_class`` so the
        result is expressed in the original label values, not in positions.
        """
        return self.unique_class[np.argmax(self.predict_proba(X), axis=1)]


# Split both prepared frames into features and the 'fit' target.
X_train = train_df_prep.drop(columns='fit')
y_train = train_df_prep['fit']
X_test = test_df_prep.drop(columns='fit')
y_test = test_df_prep['fit']

# Ordinal model with a logistic-regression base estimator.
base_estimator = LogisticRegression(max_iter=2000)
model = OrdinalClassifier(base_estimator)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

evaluate_model(y_test, y_pred)

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.685595,0.404086,0.334269,0.275133,0.560648,9,11898,33


## Multinomial Logistic Regression with sklearn

In [117]:
from sklearn.linear_model import LogisticRegression

# Features are every column except the 'fit' target.
train_features = train_df_prep.drop(columns='fit')
train_target = train_df_prep['fit']
test_features = test_df_prep.drop(columns='fit')

# Plain scikit-learn logistic regression on the three-class target.
model = LogisticRegression(max_iter=2000)
model.fit(train_features, train_target)
y_pred = model.predict(test_features)

evaluate_model(test_df_prep['fit'], y_pred)

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.685511,0.364967,0.333703,0.273786,0.56,6,11905,29


## Auto ML with PyCaret (Incorrect Metrics)

In [None]:
from pycaret.classification import *
import warnings

# Hand PyCaret the already-split, already-transformed frames. preprocess=False
# is passed so that PyCaret is asked not to add preprocessing of its own.
setup_kwargs = dict(
    data=train_df_prep,
    test_data=test_df_prep,
    target='fit',
    preprocess=False,
    session_id=0,
)

# Silence every warning for the duration of the setup call only.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    setup(**setup_kwargs)


Unnamed: 0,Description,Value
0,Session id,0
1,Target,fit
2,Target type,Multiclass
3,Original data shape,"(59827, 649)"
4,Transformed data shape,"(59827, 649)"
5,Transformed train set shape,"(47929, 649)"
6,Transformed test set shape,"(11898, 649)"
7,Numeric features,648


In [None]:
# Compare the candidate models listed in the table below; the value returned
# by compare_models() (presumably the top-ranked model) is kept in best_model.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6981,0.6676,0.6981,0.6577,0.6252,0.1582,0.2125,13.683
ridge,Ridge Classifier,0.6965,0.0,0.6965,0.6552,0.6188,0.1442,0.2011,0.26
rf,Random Forest Classifier,0.693,0.6697,0.693,0.6711,0.5976,0.0894,0.1595,7.189
lda,Linear Discriminant Analysis,0.6921,0.6654,0.6921,0.6463,0.6377,0.1922,0.2245,3.918
svm,SVM - Linear Kernel,0.692,0.0,0.692,0.6429,0.6032,0.1111,0.1712,1.751
gbc,Gradient Boosting Classifier,0.6886,0.6495,0.6886,0.7034,0.5724,0.0373,0.1124,27.369
ada,Ada Boost Classifier,0.6854,0.609,0.6854,0.6519,0.5695,0.0305,0.0864,2.476
et,Extra Trees Classifier,0.6848,0.6618,0.6848,0.6371,0.6284,0.1629,0.1938,10.424
dummy,Dummy Classifier,0.6827,0.5,0.6827,0.4661,0.554,0.0,0.0,0.032
knn,K Neighbors Classifier,0.6407,0.5858,0.6407,0.5806,0.5939,0.1053,0.115,13.877


[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not

In [None]:
# Build PyCaret's 'lr' model with cross-validation disabled.
# NOTE: this rebinds `model`, replacing the estimator assigned in earlier cells.
model = create_model('lr', cross_validation=False)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Test,0.6988,0.6784,0.6988,0.6519,0.6261,0.1541,0.2057


## Logistic Regression using PyTorch

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim

# Separate the feature matrix from the 'fit' target once and reuse it.
train_features = train_df_prep.drop(columns='fit')

input_dim = train_features.shape[1]
output_dim = 3

# float32 features and int64 class indices for the training loop.
inputs = torch.tensor(train_features.values, dtype=torch.float32)
labels = torch.tensor(train_df_prep['fit'].values, dtype=torch.long)


# Model definition
class LogisticRegression(nn.Module):
    """Logistic regression expressed as a single linear layer.

    ``forward`` returns the raw output of the linear layer; no softmax is
    applied inside the module.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    output_dim : int
        Number of output scores per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Affine map from input_dim features to output_dim scores.
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output."""
        logits = self.linear(x)
        return logits

# Hyper-parameters
num_epochs = 100
learning_rate = 0.01
lamda = 1  # weight of the weight-norm penalty added to the loss

# Seed torch's RNG so the random weight initialisation, and therefore the
# reported metrics, are repeatable across runs.
torch.manual_seed(0)

# Model, loss function and optimizer
model = LogisticRegression(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
# Use the learning_rate defined above instead of a second hard-coded value.
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Train the model: every epoch is one gradient step over the full training set.
for epoch in range(num_epochs):
    # Forward pass: cross-entropy plus the norm of the weight matrix
    # (not squared), scaled by lamda.
    outputs = model(inputs)
    loss = criterion(outputs, labels) + lamda * torch.norm(model.linear.weight)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Test the model: the predicted class is the index of the largest score.
test_inputs = torch.tensor(test_df_prep.drop('fit', axis=1).values,
                           dtype=torch.float32)
with torch.no_grad():
    predicted = model(test_inputs).argmax(dim=1)
y_pred = predicted.numpy()

evaluate_model(test_df_prep['fit'], y_pred)


Epoch [10/100], Loss: 2.0073
Epoch [20/100], Loss: 1.8807
Epoch [30/100], Loss: 1.7599
Epoch [40/100], Loss: 1.6436
Epoch [50/100], Loss: 1.5312
Epoch [60/100], Loss: 1.4224
Epoch [70/100], Loss: 1.3169
Epoch [80/100], Loss: 1.2148
Epoch [90/100], Loss: 1.1166
Epoch [100/100], Loss: 1.0236


Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.686851,0.22895,0.333333,0.271453,0.559343,0,11940,0
