## Initialize

In [2]:
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

import utils
import preprocess

importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split
from preprocess import *

# df = fetch_train_data(path='../data/train_data_all_filled.json')
df = fetch_train_data()

train_df, test_df = train_test_split(df, test_size=0.2)

prep = Preprocessor()
train_df = prep.cleanse(train_df, is_train=True)
train_df.dropna(subset=['fit'], inplace=True)

test_df = prep.cleanse(test_df)
test_df.dropna(subset=['fit'], inplace=True)

describe_data(test_df)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,category,11898,0,3
item_name,object,11898,0,3451
brand,object,11868,30,466
category,object,11898,0,68
size,object,11898,0,124
size_main,object,10981,917,59
size_suffix,object,1621,10277,5
size_scheme,object,11861,37,4
price,float64,11898,0,443
rented_for,object,10476,1422,8


## Transform data

In [20]:
import utils
import preprocess

importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split
from preprocess import *

prep.pipeline = [
    DropColumns(
        cols=['user_name', 'review', 'review_summary', 'rating', 'item_name']),
    OneHotEncoder(cols=[
        'size_scheme', 'size_main', 'size_suffix', 'rented_for', 'body_type'
    ],
                  name='one_hot'),
    OrdinalEncoder(cols=['fit', 'cup_size']),
    StandardScaler(cols=['age', 'weight', 'height', 'bust_size', 'cup_size']),
    TargetEncoder(cols=['brand', 'category', 'size'],
                  target_cols=['weight', 'height', 'bust_size', 'cup_size'],
                  name='target_encoder'),
    DropColumns(cols=['brand', 'category', 'size']),
    MinMaxScaler(cols=['price', 'usually_wear']),
    SelectOutputColumns(
        target='target_encoder'
    ),  # append the output of 'one_hot' to the input of the next transformer
    MeanImputer(cols=['age', 'weight', 'height', 'bust_size', 'cup_size']),
    MedianImputer(cols=['usually_wear']),
]

train_df_prep, test_df_prep = train_df.copy(), test_df.copy()
train_df_prep = prep.fit_transform(train_df_prep)
test_df_prep = prep.transform(test_df_prep)

# describe_data(train_df_prep)['nan_count'].sum()
# describe_data(train_df_prep)
train_df_prep

<class 'preprocess.DropColumns'>
<class 'preprocess.OneHotEncoder'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.StandardScaler'>
<class 'preprocess.TargetEncoder'>
<class 'preprocess.DropColumns'>
<class 'preprocess.MinMaxScaler'>
<class 'preprocess.SelectOutputColumns'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.MedianImputer'>


Unnamed: 0,fit,price,usually_wear,age,height,weight,bust_size,cup_size,size_scheme_letter,size_scheme_mixed,...,brand_bust_size,brand_cup_size,category_weight,category_height,category_bust_size,category_cup_size,size_weight,size_height,size_bust_size,size_cup_size
3,1,0.368583,0.106667,2.731799e-01,6.464283e-01,3.332572e-01,-1.126084e-01,-2.762857e-01,0,0,...,0.181210,0.146587,0.075546,0.037430,0.058704,0.007734,0.690043,0.278009,0.490092,0.317671
5,0,0.010609,0.133333,-4.386530e-01,-1.210534e-01,-1.055326e-15,8.710446e-01,2.926453e+00,1,0,...,0.496320,0.791294,-0.023801,0.007974,-0.018539,-0.006979,0.885292,0.278415,0.646420,0.347851
6,0,0.116359,0.186667,2.731799e-01,2.626874e-01,2.014006e+00,2.838351e+00,1.858873e+00,0,0,...,0.088353,-0.006521,0.010687,-0.025118,0.102848,0.013871,1.579515,0.323446,1.197758,0.654034
7,1,0.106092,0.106667,6.979911e-02,1.938408e-15,3.332572e-01,8.710446e-01,2.575040e-01,1,0,...,-0.280423,-0.056325,0.010687,-0.025118,0.102848,0.013871,1.774315,0.302545,1.270694,0.677960
8,0,0.112936,0.053333,1.900227e+00,-1.272276e+00,-7.172110e-01,-1.126084e-01,2.575040e-01,1,0,...,-0.280423,-0.056325,-0.004851,-0.021670,0.004945,0.020754,0.126421,0.144661,0.065058,0.041271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70208,1,0.077687,0.000000,-7.437242e-01,-8.885351e-01,-1.055326e-15,-1.096261e+00,-1.343865e+00,1,0,...,-0.080091,0.019284,-0.004851,-0.021670,0.004945,0.020754,-1.009830,-0.584478,-0.781861,-0.436907
70209,1,0.120465,0.106667,1.714895e-01,1.413910e+00,-1.055326e-15,-1.126084e-01,2.575040e-01,0,0,...,0.127512,0.052904,0.079112,0.039352,0.045613,0.003747,0.552901,0.816980,0.338233,0.146298
70210,2,0.278234,0.026667,-1.335817e-01,1.938408e-15,-1.137398e+00,-2.079914e+00,2.575040e-01,0,0,...,-0.344056,-0.244886,0.075546,0.037430,0.058704,0.007734,0.193297,0.112480,0.200321,0.062735
70211,1,0.122177,0.080000,1.714895e-01,-1.210534e-01,-7.172110e-01,-1.126084e-01,-8.100755e-01,1,0,...,0.228071,0.105179,-0.023179,-0.010901,-0.007881,-0.011695,-0.486124,-0.112149,-0.405260,-0.224546


## Profiling report

In [223]:
# from pandas_profiling import ProfileReport

# profile = ProfileReport(test_df, minimal=True)
# profile.to_notebook_iframe()

## OrdinalClassifier copied from StackOverflow

In [15]:
from sklearn import clone
from sklearn.linear_model import LogisticRegression


class OrdinalClassifier():

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}

    def fit(self, X, y):
        self.unique_class = np.sort(np.unique(y))
        if self.unique_class.shape[0] > 2:
            for i in range(self.unique_class.shape[0] - 1):
                # for each k - 1 ordinal value we fit a binary classification problem
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf

    def predict_proba(self, X):
        clfs_predict = {k: self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i, y in enumerate(self.unique_class):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[i][:, 1])
            elif i in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                predicted.append(clfs_predict[i - 1][:, 1] -
                                 clfs_predict[i][:, 1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(clfs_predict[i - 1][:, 1])
        return np.vstack(predicted).T

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)


model = OrdinalClassifier(LogisticRegression(max_iter=1000))
model.fit(train_df_prep.drop('fit', axis=1), train_df_prep['fit'])
y_pred = model.predict(test_df_prep.drop('fit', axis=1))

evaluate_model(test_df_prep['fit'], y_pred)

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.



Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.679526,0.428966,0.369357,0.343656,0.58854,664,11134,100


## Multinomial Logistic Regression with sklearn

In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(train_df_prep.drop('fit', axis=1), train_df_prep['fit'])
y_pred = model.predict(test_df_prep.drop('fit', axis=1))

evaluate_model(test_df_prep['fit'], y_pred)

Feature names must be in the same order as they were in fit.



Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.682804,0.466981,0.376742,0.351055,0.592942,773,11080,45


## Auto ML with PyCaret (Incorrect Metrics)

In [None]:
from pycaret.classification import *
import warnings

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    setup(
        data=train_df_prep,
        test_data=test_df_prep,
        target='fit',
        preprocess=False,
        session_id=0,
    )


Unnamed: 0,Description,Value
0,Session id,0
1,Target,fit
2,Target type,Multiclass
3,Original data shape,"(59827, 649)"
4,Transformed data shape,"(59827, 649)"
5,Transformed train set shape,"(47929, 649)"
6,Transformed test set shape,"(11898, 649)"
7,Numeric features,648


In [None]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6981,0.6676,0.6981,0.6577,0.6252,0.1582,0.2125,13.683
ridge,Ridge Classifier,0.6965,0.0,0.6965,0.6552,0.6188,0.1442,0.2011,0.26
rf,Random Forest Classifier,0.693,0.6697,0.693,0.6711,0.5976,0.0894,0.1595,7.189
lda,Linear Discriminant Analysis,0.6921,0.6654,0.6921,0.6463,0.6377,0.1922,0.2245,3.918
svm,SVM - Linear Kernel,0.692,0.0,0.692,0.6429,0.6032,0.1111,0.1712,1.751
gbc,Gradient Boosting Classifier,0.6886,0.6495,0.6886,0.7034,0.5724,0.0373,0.1124,27.369
ada,Ada Boost Classifier,0.6854,0.609,0.6854,0.6519,0.5695,0.0305,0.0864,2.476
et,Extra Trees Classifier,0.6848,0.6618,0.6848,0.6371,0.6284,0.1629,0.1938,10.424
dummy,Dummy Classifier,0.6827,0.5,0.6827,0.4661,0.554,0.0,0.0,0.032
knn,K Neighbors Classifier,0.6407,0.5858,0.6407,0.5806,0.5939,0.1053,0.115,13.877


[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not

In [None]:
model = create_model('lr', cross_validation=False)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Test,0.6988,0.6784,0.6988,0.6519,0.6261,0.1541,0.2057


## Logistic Regression using PyTorch

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

input_dim = train_df_prep.drop('fit', axis=1).shape[1]
output_dim = 3
inputs = torch.tensor(train_df_prep.drop('fit', axis=1).values,
                      dtype=torch.float32)
labels = torch.tensor(train_df_prep['fit'].values, dtype=torch.long)


# Define the model
class LogisticRegression(nn.Module):

    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

num_epochs = 100
learning_rate = 0.01
lamda = 1

# Define the loss function and the optimizer
model = LogisticRegression(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train the model
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(inputs)
    loss = criterion(outputs, labels) + lamda * torch.norm(model.linear.weight)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Test the model
with torch.no_grad():
    predicted = model(
        torch.tensor(test_df_prep.drop('fit', axis=1).values,
                     dtype=torch.float32))
    _, predicted = torch.max(predicted.data, 1)
    y_pred = predicted.numpy()

evaluate_model(test_df_prep['fit'], y_pred)


NameError: name 'train_df_prep' is not defined

## Ordinal Regression with statsmodels (Error)

In [None]:
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

model = OrderedModel(
    train_df_prep['fit'],
    train_df_prep.drop('fit', axis=1),
    distr='logit',
    
)
model.fit()

model.summary()

ValueError: There should not be a constant in the model

## Logistic Regression written by ChatGPT (Error)

In [8]:
import numpy as np


class MulticlassLogisticRegression:

    def __init__(self, batch_size=32, learning_rate=0.01, lambda_=0.01):
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.lambda_ = lambda_

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        # Initialize weight and bias
        self.w = np.zeros((X.shape[1], y.max() + 1))
        self.b = np.zeros(y.max() + 1)

        # Loop over the training data in mini-batches
        num_batches = len(X) // self.batch_size
        for i in range(num_batches):
            start = i * self.batch_size
            end = (i + 1) * self.batch_size
            X_batch = X[start:end]
            y_batch = y[start:end]

            # Compute predicted probabilities
            z = np.dot(X_batch, self.w) + self.b
            probs = self.sigmoid(z)

            # Compute the cost function
            cost = (-y_batch * np.log(probs) -
                    (1 - y_batch) * np.log(1 - probs)
                    ).mean() + self.lambda_ * np.sum(self.w**2)

            # Compute gradients of weight and bias
            dw = (X_batch *
                  (probs - y_batch)).mean(axis=0) + 2 * self.lambda_ * self.w
            db = (probs - y_batch).mean()

            # Update weight and bias
            self.w = self.w - self.learning_rate * dw
            self.b = self.b - self.learning_rate * db

            # Print cost every 10 mini-batches
            if i % 10 == 0:
                print(f"Cost at iteration {i}: {cost}")

    def predict(self, X):
        z = np.dot(X, self.w) + self.b
        probs = self.sigmoid(z)
        return probs.argmax(axis=1)


model = MulticlassLogisticRegression()
model.fit(inputs, labels)
y_pred = model.predict(test_df_prep.drop('fit', axis=1).values)

evaluate_model(test_df_prep['fit'], y_pred)

ValueError: operands could not be broadcast together with shapes (32,) (32,3) 