## Initialize

In [312]:
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

import utils
import preprocess

importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split
from preprocess import *

# df = fetch_train_data(path='../data/train_data_all_filled.json')
df = fetch_train_data()

train_df, test_df = train_test_split(df, test_size=0.2)

prep = Preprocessor()
train_df = prep.cleanse(train_df, is_train=True)
train_df.dropna(subset=['fit'], inplace=True)

test_df = prep.cleanse(test_df)
test_df.dropna(subset=['fit'], inplace=True)

describe_data(test_df)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,category,11898,0,3
item_name,object,11898,0,3451
brand,object,11868,30,466
category,object,11898,0,68
size,object,11898,0,124
size_main,object,10981,917,59
size_suffix,object,1621,10277,5
size_scheme,object,11861,37,4
price,float64,11898,0,443
rented_for,object,10476,1422,8


## Transform data

In [263]:
prep.pipeline = [
    DropColumns([
        'user_name', 'review', 'review_summary', 'rating', 'size'
    ]),
    OneHotEncoder([
        'size_scheme', 'size_main', 'size_suffix', 'brand', 'category',
        'rented_for', 'body_type', 'item_name'
    ],
                  name='one_hot'),
    OrdinalEncoder(['fit', 'cup_size']),
    StandardScaler(['age', 'weight', 'height', 'bust_size', 'cup_size']),
    MinMaxScaler(['price', 'usually_wear']),
    SelectOutputColumns(
        'one_hot'
    ),  # append the output of 'one_hot' to the input of the next transformer
    MeanImputer(['age', 'weight', 'height', 'bust_size', 'cup_size']),
    MedianImputer(['usually_wear']),
]

train_df_prep, test_df_prep = train_df.copy(), test_df.copy()
train_df_prep = prep.fit_transform(train_df_prep)
test_df_prep = prep.transform(test_df_prep)

describe_data(train_df_prep)['nan_count'].sum()


<class 'preprocess.DropColumns'>
<class 'preprocess.OneHotEncoder'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.StandardScaler'>
<class 'preprocess.MinMaxScaler'>
<class 'preprocess.SelectOutputColumns'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.MedianImputer'>


0

In [270]:
print(describe_data(test_df_prep)['unique_count'].to_string())

fit                                                                            3
price                                                                        443
usually_wear                                                                  24
age                                                                           62
height                                                                        22
weight                                                                       148
bust_size                                                                     12
cup_size                                                                      14
size_scheme_number                                                             2
size_scheme_letter                                                             2
size_scheme_mixed                                                              2
size_scheme_onesize                                                            2
size_main_10                

## Profiling report

In [223]:
# from pandas_profiling import ProfileReport

# profile = ProfileReport(test_df, minimal=True)
# profile.to_notebook_iframe()

## Auto ML with PyCaret
Last modified: Jan 7

In [210]:
from pycaret.classification import *
import warnings

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    setup(
        data=train_df_prep,
        test_data=test_df_prep,
        target='fit',
        preprocess=False,
        session_id=0,
    )


Unnamed: 0,Description,Value
0,Session id,0
1,Target,fit
2,Target type,Multiclass
3,Original data shape,"(59827, 655)"
4,Transformed data shape,"(59827, 655)"
5,Transformed train set shape,"(47929, 655)"
6,Transformed test set shape,"(11898, 655)"
7,Numeric features,654
8,Rows with missing values,80.1%


In [211]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.6827,0.5,0.6827,0.4661,0.554,0.0,0.0,0.028


[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not

In [17]:
best_model

## Logistic Regression
Last modified: Jan 3

-   NaN dropped
-   Oversampled data

## Ordinal Regression with statsmodels

In [215]:
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

model = OrderedModel(
    train_df_prep['fit'],
    train_df_prep.drop('fit', axis=1),
    distr='logit',
    
)
model.fit()

model.summary()

ValueError: There should not be a constant in the model

## OrdinalClassifier copied from StackOverflow

In [304]:
from sklearn import clone
from sklearn.linear_model import LogisticRegression


class OrdinalClassifier():

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}

    def fit(self, X, y):
        self.unique_class = np.sort(np.unique(y))
        if self.unique_class.shape[0] > 2:
            for i in range(self.unique_class.shape[0] - 1):
                # for each k - 1 ordinal value we fit a binary classification problem
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf

    def predict_proba(self, X):
        clfs_predict = {k: self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i, y in enumerate(self.unique_class):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[i][:, 1])
            elif i in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                predicted.append(clfs_predict[i - 1][:, 1] -
                                 clfs_predict[i][:, 1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(clfs_predict[i - 1][:, 1])
        return np.vstack(predicted).T

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)


model = OrdinalClassifier(LogisticRegression())
model.fit(train_df_prep.drop('fit', axis=1), train_df_prep['fit'])
y_pred = model.predict(test_df_prep.drop('fit', axis=1))

evaluate_model(test_df_prep['fit'], y_pred)

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.618087,0.373367,0.359983,0.354995,0.573198,1055,9868,975


## Multinomial Logistic Regression with sklearn

In [302]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(train_df_prep.drop('fit', axis=1), train_df_prep['fit'])
y_pred = model.predict(test_df_prep.drop('fit', axis=1))

evaluate_model(test_df_prep['fit'], y_pred)

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.6059,0.372036,0.361644,0.358883,0.569806,1225,9579,1094


## Random Numbers

In [313]:
y_pred = np.random.randint(0, 3, size=test_df_prep.shape[0])
evaluate_model(test_df_prep['fit'], y_pred)

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.338712,0.338097,0.341459,0.297653,0.379527,3976,3981,3941


## Logistic Regression using PyTorch

In [314]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Logistic Regression
class LogisticRegression(nn.Module):

    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return F.log_softmax(self.linear(x), dim=1)

    def predict(self, x):
        with torch.no_grad():
            return torch.argmax(self.forward(x), dim=1)

    def predict_proba(self, x):
        with torch.no_grad():
            return torch.exp(self.forward(x))


X_train = torch.tensor(train_df_prep.drop('fit', axis=1).values,
                       dtype=torch.float)
y_train = torch.tensor(train_df_prep['fit'].values, dtype=torch.long)
X_test = torch.tensor(test_df_prep.drop('fit', axis=1).values,
                      dtype=torch.float)
y_test = torch.tensor(test_df_prep['fit'].values, dtype=torch.long)

model = LogisticRegression(X_train.shape[1], 3)
optimizer = optim.SGD(model.parameters(), lr=0.01)

num_epochs, lamda = 100, 1
for epoch in range(1, num_epochs + 1):
    optimizer.zero_grad()
    out = model(X_train)
    loss = F.nll_loss(out, y_train) + lamda * model.linear.weight.norm(2)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}: train loss: {loss.item()}')

y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.685746,0.228582,0.333333,0.271194,0.55791,0,11898,0
