## Initialize

In [4]:
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

import utils
import preprocess

importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split, random_split_aggr
from preprocess import *

df = fetch_train_data(path='../data/train_data_all_filled.json')
train_df, test_df = train_test_split(df, test_size=0.2)

# Preprocess train data
prep = Preprocessor()
train_df = prep.cleanse(train_df, is_train=True)
train_df.dropna(subset=['fit'], inplace=True)

prep.pipeline = [
    ##
    DropColumns(cols=['user_name', 'review', 'review_summary', 'rating']),
    HandleSizeMapping(),  # handle size mapping
    OrdinalEncoder(cols=['fit', 'item_name', 'cup_size']),  # (necessary)
    MeanImputer(
        cols=['weight', 'height', 'bust_size', 'cup_size']),  # (necessary)
    ComputeItemVectors(),  # compute item vectors
    ##
    DropColumns(cols=['size_scheme', 'size']),
    OneHotEncoder(cols=['size_suffix', 'rented_for', 'body_type']),
    StandardScaler(cols=[
        'weight', 'height', 'bust_size', 'cup_size', 'item_weight',
        'item_height', 'item_bust_size', 'item_cup_size'
    ]),
    MinMaxScaler(cols=['age', 'price', 'usually_wear']),
    ConstantScaler(
        cols=[
            'age', 'price', 'usually_wear', 'weight', 'height', 'bust_size',
            'cup_size', 'item_weight', 'item_height', 'item_bust_size',
            'item_cup_size'
        ],
        value=1e-4
    ),  # Multiplying by 1e-4 to downscale the effect of these features
    TargetEncoder(cols=['brand', 'category', 'size_main'],
                  target_cols=['weight', 'height', 'bust_size', 'cup_size'],
                  name='target_encoder'),
    DropColumns(cols=['brand', 'category', 'size_main']),
    SelectOutputColumns(
        target='target_encoder'
    ),  # append the output of 'target_encoder' to the input of the next transformer
    MeanImputer(cols=['age', 'weight', 'height', 'bust_size', 'cup_size']),
    MedianImputer(cols=['price', 'usually_wear']),
    OneHotEncoder(cols=['item_name']),
    AugmentData(target_cols=['weight', 'height', 'bust_size', 'cup_size'],
                ratio_small=0.2,
                ratio_large=0.15),
]

train_df_prep = train_df.copy()
train_df_prep = prep.fit_transform(train_df_prep)

X_train = train_df_prep.drop(columns=['fit']).to_numpy(dtype=np.float16)
y_train = train_df_prep['fit'].to_numpy(dtype=np.float16)

# Preprocess test data
test_df = prep.cleanse(test_df)
test_df.dropna(subset=['fit'], inplace=True)

test_df_prep = test_df.copy()
test_df_prep = prep.transform(test_df_prep)

X_test = test_df_prep.drop(columns=['fit']).to_numpy(dtype=np.float16)
y_test = test_df_prep['fit'].to_numpy(dtype=np.float16)

<class 'preprocess.DropColumns'>
<class 'preprocess.HandleSizeMapping'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.ComputeItemVectors'>
Optimizing weights and thresholds, round 1
Iteration 0: loss = 14.699899673461914
Iteration 100: loss = 7.130328178405762
Iteration 200: loss = 4.596743583679199
Iteration 300: loss = 3.9746382236480713
Optimizing item vectors, round 1
Iteration 0: loss = 3.9704017639160156
Iteration 100: loss = 3.9702088832855225
Iteration 200: loss = 3.970015287399292
Iteration 300: loss = 3.969823122024536
Optimizing weights and thresholds, round 2
Iteration 0: loss = 3.9698212146759033
Iteration 100: loss = 3.7763686180114746
Iteration 200: loss = 3.607821226119995
Iteration 300: loss = 3.4580116271972656
Optimizing item vectors, round 2
Iteration 0: loss = 3.456601142883301
Iteration 100: loss = 3.4565646648406982
Iteration 200: loss = 3.456528425216675
Iteration 300: loss = 3.4564919471740723
Optimizing weights and thr

In [5]:
X_train, X_train.shape, y_train, y_train.shape

(array([[1.061e-05, 7.987e-06, 2.587e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [2.933e-05, 2.682e-06, 3.481e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [1.323e-05, 1.597e-05, 5.728e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        ...,
        [2.599e-05, 5.305e-06, 3.260e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [1.919e-05, 2.664e-05, 3.612e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [1.752e-05, 1.067e-05, 3.147e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00]], dtype=float16),
 (73545, 4135),
 array([1., 2., 1., ..., 2., 2., 2.], dtype=float16),
 (73545,))

In [6]:
X_test, X_test.shape, y_test, y_test.shape

(array([[5.722e-06, 0.000e+00, 2.587e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [1.323e-05, 2.682e-06, 4.381e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [1.150e-05, 1.866e-05, 4.154e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        ...,
        [4.470e-06, 5.305e-06, 3.034e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [2.182e-05, 2.134e-05, 2.921e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [1.734e-05, 5.305e-06, 2.021e-05, ..., 0.000e+00, 0.000e+00,
         0.000e+00]], dtype=float16),
 (17553, 4135),
 array([1., 1., 1., ..., 1., 1., 1.], dtype=float16),
 (17553,))

In [7]:
from sklearn.linear_model import LogisticRegression

num_samples = X_train.shape[0]

model = LogisticRegression(
    penalty='l2',
    C=1,
    solver='sag',
    max_iter=20,
    multi_class='multinomial',
    verbose=1,
    n_jobs=-1,
)

# model.fit(X_train[num_samples // 5:], y_train[num_samples // 5:])
# model.fit(X_train, y_train)
random_split_aggr(model, X_train, y_train, X_test, y_test)

(array([0., 1., 2.], dtype=float16), array([11382, 16311, 13231]))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


Epoch 1, change: 1.00000000
Epoch 2, change: 0.42916303
Epoch 3, change: 0.21372357
Epoch 4, change: 0.15203898
Epoch 5, change: 0.10946079
Epoch 6, change: 0.08948606
Epoch 7, change: 0.05997324
Epoch 8, change: 0.04547160
Epoch 9, change: 0.03795874
Epoch 10, change: 0.03446740
Epoch 11, change: 0.02761528
Epoch 12, change: 0.01753339
Epoch 13, change: 0.02758795
Epoch 14, change: 0.02024626
Epoch 15, change: 0.01348880
Epoch 16, change: 0.00828954
Epoch 17, change: 0.00691025
Epoch 18, change: 0.00572338
Epoch 19, change: 0.00496604
max_iter reached after 51 secondsEpoch 20, change: 0.00426411

(array([0., 1., 2.], dtype=float16), array([11382, 16311, 13231]))


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   51.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


Epoch 1, change: 1.00000000
Epoch 2, change: 0.36676316
Epoch 3, change: 0.23528819
Epoch 4, change: 0.13489953
Epoch 5, change: 0.09846922
Epoch 6, change: 0.09314891
Epoch 7, change: 0.07150113
Epoch 8, change: 0.04654496
Epoch 9, change: 0.03272169
Epoch 10, change: 0.03172053
Epoch 11, change: 0.02656257
Epoch 12, change: 0.02443648
Epoch 13, change: 0.01872265
Epoch 14, change: 0.01522396
Epoch 15, change: 0.01244179
Epoch 16, change: 0.00915024
Epoch 17, change: 0.00799067
Epoch 18, change: 0.00599163
Epoch 19, change: 0.00496753
max_iter reached after 51 secondsEpoch 20, change: 0.00431940

(array([0., 1., 2.], dtype=float16), array([11382, 16310, 13231]))


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   51.6s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


Epoch 1, change: 1.00000000
Epoch 2, change: 0.36657185
Epoch 3, change: 0.21528719
Epoch 4, change: 0.14754980
Epoch 5, change: 0.11004162
Epoch 6, change: 0.07662492
Epoch 7, change: 0.07428241
Epoch 8, change: 0.04648536
Epoch 9, change: 0.03331655
Epoch 10, change: 0.03086124
Epoch 11, change: 0.02244124
Epoch 12, change: 0.01706062
Epoch 13, change: 0.01364512
Epoch 14, change: 0.01170810
Epoch 15, change: 0.00999285
Epoch 16, change: 0.00835102
Epoch 17, change: 0.00742854
Epoch 18, change: 0.00576953
Epoch 19, change: 0.00501298
max_iter reached after 51 secondsEpoch 20, change: 0.00436611



[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   51.4s finished


Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.450066,0.348522,0.353792,0.338536,0.486688,3767,9071,4715


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
# import models
# importlib.reload(models)
# from models import LogisticClassifier

clf = LogisticRegression(max_iter=1000)
cv_results = cross_validate(clf,
                            X_train,
                            y_train,
                            cv=5,
                            scoring='f1_macro',
                            return_train_score=True,
                            n_jobs=-1)
cv_results

# clf = LogisticClassifier()
# random_split_aggr(clf, X_train, y_train, X_test, y_test)
# random_split_aggr(clf, item_name_train, y_train, item_name_test, y_test)

KeyboardInterrupt: 

# Legacy

## Profiling report

In [None]:
# from pandas_profiling import ProfileReport

# profile = ProfileReport(test_df, minimal=True)
# profile.to_notebook_iframe()

## OrdinalClassifier copied from StackOverflow

In [None]:
from sklearn import clone
from sklearn.linear_model import LogisticRegression


class OrdinalClassifier():

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}

    def fit(self, X, y):
        self.unique_class = np.sort(np.unique(y))
        if self.unique_class.shape[0] > 2:
            for i in range(self.unique_class.shape[0] - 1):
                # for each k - 1 ordinal value we fit a binary classification problem
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf

    def predict_proba(self, X):
        clfs_predict = {k: self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i, y in enumerate(self.unique_class):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[i][:, 1])
            elif i in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                predicted.append(clfs_predict[i - 1][:, 1] -
                                 clfs_predict[i][:, 1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(clfs_predict[i - 1][:, 1])
        return np.vstack(predicted).T

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)


model = OrdinalClassifier(LogisticRegression(max_iter=2000))
model.fit(train_df_prep.drop('fit', axis=1), train_df_prep['fit'])
y_pred = model.predict(test_df_prep.drop('fit', axis=1))

evaluate_model(test_df_prep['fit'], y_pred)

## Multinomial Logistic Regression with sklearn

## Auto ML with PyCaret (Incorrect Metrics)

In [None]:
from pycaret.classification import *
import warnings

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    setup(
        data=train_df_prep,
        test_data=test_df_prep,
        target='fit',
        preprocess=False,
        session_id=0,
    )


In [None]:
best_model = compare_models()

In [None]:
model = create_model('lr', cross_validation=False)

## Logistic Regression using PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

input_dim = train_df_prep.drop('fit', axis=1).shape[1]
output_dim = 3
inputs = torch.tensor(train_df_prep.drop('fit', axis=1).values,
                      dtype=torch.float32)
labels = torch.tensor(train_df_prep['fit'].values, dtype=torch.long)


# Define the model
class LogisticRegression(nn.Module):

    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

num_epochs = 100
learning_rate = 0.01
lamda = 1

# Define the loss function and the optimizer
model = LogisticRegression(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train the model
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(inputs)
    loss = criterion(outputs, labels) + lamda * torch.norm(model.linear.weight)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Test the model
with torch.no_grad():
    predicted = model(
        torch.tensor(test_df_prep.drop('fit', axis=1).values,
                     dtype=torch.float32))
    _, predicted = torch.max(predicted.data, 1)
    y_pred = predicted.numpy()

evaluate_model(test_df_prep['fit'], y_pred)
