## Initialize

In [1]:
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

import utils
import preprocess

importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split, random_split_aggr
from preprocess import *

# Load the training data from the pre-filled JSON dump.
df = fetch_train_data(path='../data/train_data_all_filled.json')
# df = fetch_train_data()

# test_size=0 is passed, so no hold-out rows are requested here.
# NOTE(review): presumably this leaves test_df empty -- confirm against
# utils.train_test_split.
train_df, test_df = train_test_split(df, test_size=0)

# The same Preprocessor instance is reused below to hold the transform pipeline.
prep = Preprocessor()
train_df = prep.cleanse(train_df, is_train=True)
# test_df = prep.cleanse(test_df)

# Drop rows without a 'fit' label. Reassigning (instead of inplace=True) keeps
# the frame returned by prep.cleanse untouched and makes the cell safe to re-run.
train_df = train_df.dropna(subset=['fit'])
# test_df = test_df.dropna(subset=['fit'])

# describe_data(test_df)

## Transform data

In [2]:
# Ordered list of transformer instances applied by prep.fit_transform.
# The order matters: later steps reference columns produced or encoded by
# earlier ones (e.g. the item_* columns scaled below, and the named
# 'target_encoder' output selected further down).
prep.pipeline = [
    ## Steps needed before item vectors can be computed
    DropColumns(cols=['user_name', 'review', 'review_summary', 'rating']),
    HandleSizeMapping(),  # handle size mapping
    OrdinalEncoder(cols=['fit', 'item_name', 'cup_size']),  # (necessary)
    MeanImputer(
        cols=['weight', 'height', 'bust_size', 'cup_size']),  # (necessary)
    ComputeItemVectors(),  # compute item vectors
    ## Remaining encoding / scaling / imputation steps
    DropColumns(cols=['size_scheme', 'size']),
    OneHotEncoder(cols=['size_suffix', 'rented_for', 'body_type']),
    StandardScaler(cols=[
        'weight', 'height', 'bust_size', 'cup_size', 'item_weight',
        'item_height', 'item_bust_size', 'item_cup_size'
    ]),
    TargetEncoder(cols=['brand', 'category', 'size_main'],
                  target_cols=['weight', 'height', 'bust_size', 'cup_size'],
                  name='target_encoder'),
    DropColumns(cols=['brand', 'category', 'size_main']),
    MinMaxScaler(cols=['age', 'price', 'usually_wear']),
    SelectOutputColumns(
        target='target_encoder'
    ),  # append the output of 'target_encoder' to the input of the next transformer
    MeanImputer(cols=['age', 'weight', 'height', 'bust_size', 'cup_size']),
    MedianImputer(cols=['price', 'usually_wear']),
    OneHotEncoder(cols=['item_name']),
    # Instantiated like every other step; the original listed the bare class.
    # NOTE(review): assumes AugmentData() needs no constructor arguments, and
    # a later cell also calls utils.data_augmentation explicitly -- confirm the
    # training data is not augmented twice.
    AugmentData()
]

# Work on copies so train_df / test_df stay available in their cleansed form.
train_df_prep = train_df.copy()
test_df_prep = test_df.copy()

# Only the training frame is fitted and transformed; the test transform is
# currently disabled, so test_df_prep remains an untransformed copy.
train_df_prep = prep.fit_transform(train_df_prep)
# test_df_prep = prep.transform(test_df_prep)

# train_df_prep = prep.compute_item_vectors(train_df_prep, is_train=True)
# test_df_prep = prep.compute_item_vectors(test_df_prep)

# describe_data(train_df_prep)['nan_count'].sum()
# describe_data(train_df_prep)
train_df_prep

<class 'preprocess.DropColumns'>
<class 'preprocess.HandleSizeMapping'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.ComputeItemVectors'>
Optimizing weights and thresholds, round 1
Iteration 0: loss = 14.775798797607422
Iteration 100: loss = 7.162885665893555
Iteration 200: loss = 4.61268949508667
Iteration 300: loss = 3.9812064170837402
Optimizing item vectors, round 1
Iteration 0: loss = 3.976886510848999
Iteration 100: loss = 3.9766769409179688
Iteration 200: loss = 3.976468086242676
Iteration 300: loss = 3.976259708404541
Optimizing weights and thresholds, round 2
Iteration 0: loss = 3.976257801055908
Iteration 100: loss = 3.7788496017456055
Iteration 200: loss = 3.60685658454895
Iteration 300: loss = 3.4542858600616455
Optimizing item vectors, round 2
Iteration 0: loss = 3.4528517723083496
Iteration 100: loss = 3.4528138637542725
Iteration 200: loss = 3.4527764320373535
Iteration 300: loss = 3.4527387619018555
Optimizing weights and thres

Unnamed: 0,fit,price,usually_wear,age,height,weight,bust_size,cup_size,size_bias,item_weight,...,item_name_4147,item_name_4148,item_name_4149,item_name_4150,item_name_4151,item_name_4152,item_name_4153,item_name_4154,item_name_4155,item_name_4156
0,1,0.106092,0.080000,0.258427,7.979926e-01,0.000000,-1.119150e-01,-8.849729e-01,0.0,-0.237090,...,0,0,0,0,0,0,0,0,0,0
1,2,0.293292,0.026667,0.348315,5.349337e-15,-0.850021,-1.190431e+00,1.443648e+00,-2.0,-1.204525,...,0,0,0,0,0,0,0,0,0,0
2,1,0.132444,0.160000,0.573034,5.349337e-15,0.384106,9.666011e-01,1.443648e+00,2.0,0.604990,...,0,0,0,0,0,0,0,0,0,0
3,1,0.368583,0.106667,0.393258,7.979926e-01,0.384106,-1.119150e-01,-3.028176e-01,0.0,0.081798,...,0,0,0,0,0,0,0,0,0,0
4,2,0.149555,0.106667,0.314607,-1.581297e-01,0.858771,-1.119150e-01,3.190114e+00,-1.5,0.985914,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87403,1,0.140999,0.080000,0.361303,5.349337e-15,0.000000,-3.831659e-15,2.585289e-16,0.5,0.326856,...,0,0,0,0,0,0,0,0,0,0
87404,0,0.032854,0.106667,0.404494,5.349337e-15,0.384106,-1.119150e-01,2.793376e-01,-1.0,-1.055584,...,0,0,0,0,0,0,0,0,0,0
87405,2,0.071869,0.186667,0.483146,1.276054e+00,0.000000,9.666011e-01,-3.028176e-01,0.5,2.155005,...,0,0,0,0,0,0,0,0,0,0
87406,0,0.048597,0.026667,0.292135,-1.581297e-01,-0.802555,-1.119150e-01,-8.849729e-01,-1.0,-1.146586,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Summarise the transformed training frame column by column; the recorded
# output lists dtype, valid_count, nan_count and unique_count for each column.
describe_data(train_df_prep)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,int8,87408,0,3
price,float64,87408,0,471
usually_wear,float64,87408,0,50
age,float64,87408,0,79
height,float64,87408,0,25
...,...,...,...,...
item_name_4152,uint8,87408,0,2
item_name_4153,uint8,87408,0,2
item_name_4154,uint8,87408,0,2
item_name_4155,uint8,87408,0,2


In [5]:
from utils import data_augmentation

# Augment the transformed training frame based on the four listed columns.
# NOTE(review): ratio_small / ratio_large are opaque from this notebook --
# presumably per-class resampling ratios; confirm in utils.data_augmentation.
train_df_prep_aug = data_augmentation(
    train_df_prep,
    ['cup_size', 'bust_size', 'weight', 'height'],
    ratio_small=3.6,
    ratio_large=2.7,
)

# Feature matrix: every column except the 'fit' label, as a NumPy array.
X_train = train_df_prep_aug.drop(columns=['fit']).values
# Label vector: the 'fit' column of the same augmented frame.
y_train = train_df_prep_aug['fit'].values
# X_test, y_test = test_df_prep.drop(
#     columns=['fit']).values, test_df_prep['fit'].values

In [None]:
# Sanity check: display the feature matrix and label vector with their shapes.
X_train, X_train.shape, y_train, y_train.shape

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
# import models
# importlib.reload(models)
# from models import LogisticClassifier

# Baseline: scikit-learn logistic regression evaluated with 5-fold
# cross-validation, scored by macro-averaged F1 on both the training and the
# validation folds, using all available cores (n_jobs=-1).
# The recorded run of this cell ended in a KeyboardInterrupt, so expect it to
# be slow on the full one-hot-encoded feature matrix.
# NOTE(review): X_train comes from the augmented frame; if data_augmentation
# duplicates or perturbs existing rows, variants of one sample can fall into
# both the training and validation folds and inflate the score -- confirm.
clf = LogisticRegression(max_iter=1000)
cv_results = cross_validate(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
cv_results

# clf = LogisticClassifier()
# random_split_aggr(clf, X_train, y_train, X_test, y_test)
# random_split_aggr(clf, item_name_train, y_train, item_name_test, y_test)

KeyboardInterrupt: 

## Profiling report

In [None]:
# from pandas_profiling import ProfileReport

# profile = ProfileReport(test_df, minimal=True)
# profile.to_notebook_iframe()

## OrdinalClassifier copied from StackOverflow

In [None]:
from sklearn import clone
from sklearn.linear_model import LogisticRegression


class OrdinalClassifier():
    """Ordinal classifier assembled from k - 1 binary "is y above class i" models.

    For the sorted class values V1 < ... < Vk, one clone of the base estimator
    is trained per threshold to estimate Pr(y > Vi). Class probabilities are
    then recovered as differences of consecutive threshold probabilities.

    Attributes set by this class:
        clf: the unfitted base estimator; it must support ``clone``, ``fit``
            and ``predict_proba``.
        clfs: dict mapping threshold index i to the fitted clone for
            Pr(y > unique_class[i]).
        unique_class: sorted array of the class values seen in ``fit``.
    """

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}

    def fit(self, X, y):
        """Fit one binary classifier per threshold between consecutive classes.

        Args:
            X: feature matrix accepted by the base estimator.
            y: ordinal labels; must support ``y > scalar`` followed by
                ``.astype`` (NumPy array or pandas Series).

        Returns:
            self, so calls can be chained.
        """
        # np.unique already returns the values sorted ascending.
        self.unique_class = np.unique(y)
        # Start from a clean slate so a refit with fewer classes cannot keep
        # stale threshold models from a previous fit.
        self.clfs = {}
        # Two classes need exactly one threshold model, so the binary case is
        # handled by the same loop; a single class trains nothing.
        for i in range(self.unique_class.shape[0] - 1):
            # for each k - 1 ordinal value we fit a binary classification problem
            binary_y = (y > self.unique_class[i]).astype(np.uint8)
            clf = clone(self.clf)
            clf.fit(X, binary_y)
            self.clfs[i] = clf
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of class probabilities.

        Columns follow the order of ``unique_class``. Because the threshold
        models are trained independently, a middle-class entry (a difference
        of two estimates) is not guaranteed to be non-negative.
        """
        n_classes = self.unique_class.shape[0]
        if n_classes < 2:
            # Only one class was seen during fit: it is predicted with certainty.
            return np.ones((len(X), n_classes))

        # Pr(y > V_i) for every threshold i, taken from the positive column.
        above = {k: model.predict_proba(X)[:, 1]
                 for k, model in self.clfs.items()}
        predicted = []
        for i in range(n_classes):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - above[i])
            elif i in above:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                predicted.append(above[i - 1] - above[i])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(above[i - 1])
        return np.vstack(predicted).T

    def predict(self, X):
        """Return the most probable class label for each row of X.

        The argmax column index is mapped back through ``unique_class`` so the
        result contains the original label values, not positions. For labels
        that are exactly 0..k-1 the two coincide.
        """
        return self.unique_class[np.argmax(self.predict_proba(X), axis=1)]


# Ordinal wrapper around scikit-learn's logistic regression, fitted on the
# transformed (non-augmented) training frame with 'fit' as the label.
model = OrdinalClassifier(LogisticRegression(max_iter=2000))
model.fit(train_df_prep.drop(columns='fit'), train_df_prep['fit'])

# NOTE(review): in the visible cells test_df_prep is only ever assigned as
# test_df.copy() (the prep.transform call is commented out) and the split uses
# test_size=0 -- confirm a transformed, non-empty test frame exists before
# trusting this evaluation.
y_pred = model.predict(test_df_prep.drop(columns='fit'))

evaluate_model(test_df_prep['fit'], y_pred)

## Multinomial Logistic Regression with sklearn

## Auto ML with PyCaret (Incorrect Metrics)

In [None]:
from pycaret.classification import *
import warnings

# Initialise the PyCaret experiment with all warnings silenced; the previous
# warning filters are restored when the context manager exits.
# preprocess=False is passed because the frames were already prepared by the
# notebook's own pipeline; session_id=0 is fixed for repeatability.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    setup(data=train_df_prep,
          test_data=test_df_prep,
          target='fit',
          preprocess=False,
          session_id=0)


In [None]:
# Let PyCaret compare its candidate models on the experiment configured by
# setup() and keep what it returns as best_model. The section heading flags
# the metrics reported in this section as incorrect.
best_model = compare_models()

In [None]:
# Build PyCaret's 'lr' model with cross-validation turned off.
# NOTE(review): 'lr' is presumably PyCaret's logistic-regression identifier -- confirm.
# This rebinds `model`, replacing any object an earlier cell stored under that name.
model = create_model('lr', cross_validation=False)

## Logistic Regression using PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Convert the transformed training frame into tensors: float32 features
# (every column except 'fit') and int64 class labels.
inputs = torch.tensor(train_df_prep.drop(columns='fit').values,
                      dtype=torch.float32)
labels = torch.tensor(train_df_prep['fit'].values, dtype=torch.long)

# Model dimensions: one input per feature column and one output per 'fit'
# class (the describe_data output above reports three unique values).
input_dim = inputs.shape[1]
output_dim = 3


# Define the model
class LogisticRegression(nn.Module):
    """Multinomial logistic regression expressed as a single linear layer.

    The forward pass returns raw class scores (logits); no softmax is applied
    inside the module.

    Note: this name shadows the scikit-learn ``LogisticRegression`` imported
    by earlier cells, so those cells must re-import it after this one has run.

    Args:
        input_dim: number of input features per sample.
        output_dim: number of classes (one logit each).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Only learnable component: weight (output_dim x input_dim) plus bias.
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Map a batch of feature rows to per-class logits."""
        logits = self.linear(x)
        return logits

# Hyper-parameters
num_epochs = 100
learning_rate = 0.01
lamda = 1  # weight of the penalty on the norm of the linear layer's weights

# Fix the seed so the random weight initialisation (and therefore the whole
# run) is repeatable across kernel restarts.
torch.manual_seed(0)

# Define the loss function and the optimizer
model = LogisticRegression(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
# Use the learning_rate defined above rather than a second hard-coded 0.01.
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Train the model: every epoch is one full-batch gradient step over `inputs`.
for epoch in range(num_epochs):
    # Forward pass: cross-entropy on the logits plus the weight-norm penalty
    # (the bias is not regularised).
    outputs = model(inputs)
    loss = criterion(outputs, labels) + lamda * torch.norm(model.linear.weight)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Test the model
# NOTE(review): in the visible cells test_df_prep is only ever assigned as
# test_df.copy() (prep.transform is commented out) and the split uses
# test_size=0 -- confirm a transformed, non-empty test frame exists before
# trusting this evaluation.
with torch.no_grad():
    test_inputs = torch.tensor(test_df_prep.drop('fit', axis=1).values,
                               dtype=torch.float32)
    # The predicted class is the index of the largest logit in each row.
    predicted = torch.argmax(model(test_inputs), dim=1)
    y_pred = predicted.numpy()

evaluate_model(test_df_prep['fit'], y_pred)
