## Initialize

In [21]:
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

import utils
import preprocess

importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split, random_split_aggr
from preprocess import *

# Load the filled training dataset and split it with test_size=0.2.
df = fetch_train_data(path='../data/train_data_all_filled.json')
train_df, test_df = train_test_split(df, test_size=0.2)

# Cleanse the training split in training mode, then discard every row that
# has no `fit` label so the target is always defined downstream.
prep = Preprocessor()
train_df = prep.cleanse(train_df, is_train=True).dropna(subset=['fit'])

# Ordered list of preprocessing steps assigned to the Preprocessor.
# NOTE(review): every transformer here comes from the project's `preprocess`
# module via the star-import (the cell output prints e.g.
# `preprocess.OrdinalEncoder`), not from scikit-learn, even where names match.
# The steps appear to run in list order (the printed class names follow this
# order), and later steps name columns that earlier ones keep or drop, so do
# not reorder without checking preprocess.py.
prep.pipeline = [
    ## Stage 1: drop text/unused columns, map sizes, encode, impute, item vectors
    DropColumns(cols=['user_name', 'review', 'review_summary', 'rating']),
    HandleSizeMapping(),  # handle size mapping
    OrdinalEncoder(cols=['fit', 'item_name', 'cup_size']),  # (necessary)
    MeanImputer(
        cols=['weight', 'height', 'bust_size', 'cup_size']),  # (necessary)
    # compute item vectors; the recorded output shows an iterative optimisation
    # ("Optimizing weights and thresholds" / "Optimizing item vectors") here.
    ComputeItemVectors(),  # compute item vectors
    ## Stage 2: drop raw size columns, then encode / scale / impute / augment
    DropColumns(cols=['size_scheme', 'size']),
    OneHotEncoder(cols=['size_suffix', 'rented_for', 'body_type']),
    # presumably the `item_*` columns are produced by ComputeItemVectors — confirm
    StandardScaler(cols=[
        'weight', 'height', 'bust_size', 'cup_size', 'item_weight',
        'item_height', 'item_bust_size', 'item_cup_size'
    ]),
    # Named step so that SelectOutputColumns below can refer to it by name.
    TargetEncoder(cols=['brand', 'category', 'size_main'],
                  target_cols=['weight', 'height', 'bust_size', 'cup_size'],
                  name='target_encoder'),
    DropColumns(cols=['brand', 'category', 'size_main']),
    MinMaxScaler(cols=['age', 'price', 'usually_wear']),
    SelectOutputColumns(
        target='target_encoder'
    ),  # append the output of 'target_encoder' to the input of the next transformer
    MeanImputer(cols=['age', 'weight', 'height', 'bust_size', 'cup_size']),
    MedianImputer(cols=['price', 'usually_wear']),
    OneHotEncoder(cols=['item_name']),
    AugmentData(target_cols=['weight', 'height', 'bust_size', 'cup_size']),
]

def _to_features_and_target(frame):
    """Split a preprocessed frame into (features, target) float16 arrays.

    The `fit` column is the target; every other column is a feature.
    """
    features = frame.drop(columns=['fit']).to_numpy(dtype=np.float16)
    target = frame['fit'].to_numpy(dtype=np.float16)
    return features, target


# Fit the pipeline on a copy of the cleansed training frame so that
# `train_df` itself is left untouched.
train_df_prep = prep.fit_transform(train_df.copy())
X_train, y_train = _to_features_and_target(train_df_prep)

# Preprocess test data: cleanse, drop unlabelled rows, then apply the
# already-fitted pipeline (transform only, no refit).
test_df = prep.cleanse(test_df).dropna(subset=['fit'])
test_df_prep = prep.transform(test_df.copy())
X_test, y_test = _to_features_and_target(test_df_prep)

<class 'preprocess.DropColumns'>
<class 'preprocess.HandleSizeMapping'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.ComputeItemVectors'>
Optimizing weights and thresholds, round 1
Iteration 0: loss = 14.699899673461914
Iteration 100: loss = 7.130328178405762
Iteration 200: loss = 4.596743583679199
Iteration 300: loss = 3.9746382236480713
Optimizing item vectors, round 1
Iteration 0: loss = 3.9704017639160156
Iteration 100: loss = 3.9702160358428955
Iteration 200: loss = 3.9700305461883545
Iteration 300: loss = 3.9698455333709717
Optimizing weights and thresholds, round 2
Iteration 0: loss = 3.969844102859497
Iteration 100: loss = 3.7763874530792236
Iteration 200: loss = 3.6078364849090576
Iteration 300: loss = 3.458024024963379
Optimizing item vectors, round 2
Iteration 0: loss = 3.456613779067993
Iteration 100: loss = 3.4565789699554443
Iteration 200: loss = 3.456544876098633
Iteration 300: loss = 3.456510543823242
Optimizing weights and thr

In [17]:
# Display the training feature matrix and label vector with their shapes.
X_train, X_train.shape, y_train, y_train.shape

(array([[0.1061 , 0.08   , 0.2585 , ..., 0.     , 0.     , 0.     ],
        [0.2932 , 0.02667, 0.3484 , ..., 0.     , 0.     , 0.     ],
        [0.1324 , 0.16   , 0.573  , ..., 0.     , 0.     , 0.     ],
        ...,
        [0.0828 , 0.1067 , 0.2472 , ..., 0.     , 0.     , 0.     ],
        [0.144  , 0.1067 , 0.3484 , ..., 0.     , 0.     , 0.     ],
        [0.2783 , 0.02667, 0.3484 , ..., 0.     , 0.     , 0.     ]],
       dtype=float16),
 (135132, 4135),
 array([1., 2., 1., ..., 2., 2., 2.], dtype=float16),
 (135132,))

In [22]:
# Display the test feature matrix and label vector with their shapes.
X_test, X_test.shape, y_test, y_test.shape

(array([[0.05716, 0.     , 0.2585 , ..., 0.     , 0.     , 0.     ],
        [0.1324 , 0.02667, 0.4382 , ..., 0.     , 0.     , 0.     ],
        [0.11536, 0.1866 , 0.4158 , ..., 0.     , 0.     , 0.     ],
        ...,
        [0.0445 , 0.05334, 0.3035 , ..., 0.     , 0.     , 0.     ],
        [0.218  , 0.2134 , 0.2922 , ..., 0.     , 0.     , 0.     ],
        [0.1735 , 0.05334, 0.2023 , ..., 0.     , 0.     , 0.     ]],
       dtype=float16),
 (17553, 4135),
 array([1., 1., 1., ..., 1., 1., 1.], dtype=float16),
 (17553,))

In [18]:
from sklearn.linear_model import LogisticRegression

# Only the rows after the first fifth of the training set are used for fitting.
num_samples = X_train.shape[0]
fit_start = num_samples // 5

# L2-penalised multinomial logistic regression solved with L-BFGS.
# NOTE(review): the recorded output of this cell shows the solver stopping at
# the iteration limit, so the coefficients below are not converged.
model = LogisticRegression(
    C=0.01,
    penalty='l2',
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=100,
    n_jobs=-1,
    verbose=1,
)

model.fit(X_train[fit_start:], y_train[fit_start:])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
# Predict on the held-out split, then report the project's evaluation metrics.
test_predictions = model.predict(X_test)
evaluate_model(y_test, test_predictions)

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.611292,0.373559,0.357781,0.355174,0.578665,928,14121,2504


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
# import models
# importlib.reload(models)
# from models import LogisticClassifier

# 5-fold cross-validation of a default logistic regression, scored by
# macro-F1 on both the training and validation folds, using all CPU cores.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt.
clf = LogisticRegression(max_iter=1000)
cv_results = cross_validate(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
cv_results

# clf = LogisticClassifier()
# random_split_aggr(clf, X_train, y_train, X_test, y_test)
# random_split_aggr(clf, item_name_train, y_train, item_name_test, y_test)

KeyboardInterrupt: 

# Legacy

## Profiling report

In [None]:
# from pandas_profiling import ProfileReport

# profile = ProfileReport(test_df, minimal=True)
# profile.to_notebook_iframe()

## OrdinalClassifier copied from StackOverflow

In [None]:
from sklearn import clone
from sklearn.linear_model import LogisticRegression


class OrdinalClassifier():
    """Ordinal classifier built from k-1 binary "is y > v_i?" classifiers.

    For sorted class values v_0 < v_1 < ... < v_{k-1}, classifier ``i`` is a
    clone of ``clf`` trained on the binary target ``y > v_i``. Class
    probabilities are recovered as differences of consecutive exceedance
    probabilities.

    Args:
        clf: Unfitted estimator exposing ``fit`` and ``predict_proba``; it is
            cloned once per threshold and never fitted itself.

    Attributes:
        clfs: dict mapping threshold index ``i`` to the fitted binary clone.
        unique_class: sorted array of the class values seen in ``fit``.
    """

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}

    def fit(self, X, y):
        """Fit one binary classifier per threshold between consecutive classes.

        Args:
            X: Feature matrix accepted by the wrapped estimator.
            y: Ordinal labels; any values that can be ordered with ``>``.

        Returns:
            self, so calls can be chained.
        """
        y = np.asarray(y)
        # np.unique already returns the distinct values in ascending order.
        self.unique_class = np.unique(y)
        # Start from a clean slate so a refit never keeps stale classifiers.
        self.clfs = {}
        # k classes need k - 1 thresholds; this also covers the binary case
        # (one threshold) and fits nothing when fewer than two classes exist.
        for i in range(self.unique_class.shape[0] - 1):
            binary_y = (y > self.unique_class[i]).astype(np.uint8)
            clf = clone(self.clf)
            clf.fit(X, binary_y)
            self.clfs[i] = clf
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of per-class scores.

        Columns follow the order of ``self.unique_class``. Because the binary
        classifiers are trained independently, a middle column can be slightly
        negative when consecutive exceedance probabilities are not monotone.
        """
        n_classes = self.unique_class.shape[0]
        if n_classes < 2:
            # Nothing was fitted: the single known class gets probability 1.
            return np.ones((np.shape(X)[0], n_classes))

        # exceed[i] = Pr(y > v_i) for every sample.
        exceed = [
            self.clfs[i].predict_proba(X)[:, 1] for i in range(n_classes - 1)
        ]
        # Pr(v_0) = 1 - Pr(y > v_0)
        columns = [1 - exceed[0]]
        # Pr(v_i) = Pr(y > v_{i-1}) - Pr(y > v_i) for the middle classes
        columns.extend(exceed[i - 1] - exceed[i]
                       for i in range(1, n_classes - 1))
        # Pr(v_{k-1}) = Pr(y > v_{k-2})
        columns.append(exceed[-1])
        return np.column_stack(columns)

    def predict(self, X):
        """Return the most likely class *label* for each sample.

        The argmax column index is mapped back through ``unique_class`` so the
        result is correct even when labels are not exactly 0..k-1.
        """
        return self.unique_class[np.argmax(self.predict_proba(X), axis=1)]


# Wrap a logistic regression in the ordinal scheme, fit it on the
# preprocessed training frame and score it on the preprocessed test frame.
model = OrdinalClassifier(LogisticRegression(max_iter=2000))
ordinal_train_features = train_df_prep.drop(columns=['fit'])
ordinal_test_features = test_df_prep.drop(columns=['fit'])
model.fit(ordinal_train_features, train_df_prep['fit'])
y_pred = model.predict(ordinal_test_features)

evaluate_model(test_df_prep['fit'], y_pred)

## Multinomial Logistic Regression with sklearn

## Auto ML with PyCaret (Incorrect Metrics)

In [None]:
from pycaret.classification import *
import warnings

# Initialise the PyCaret experiment on the already-preprocessed frames.
# All warnings raised during setup are silenced; the filter is restored when
# the context manager exits.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    setup(
        target='fit',
        data=train_df_prep,
        test_data=test_df_prep,
        session_id=0,
        preprocess=False,
    )


In [None]:
# Call `compare_models` with default arguments and keep its return value.
# presumably this is PyCaret's function, in scope via the star-import of
# `pycaret.classification` — confirm.
best_model = compare_models()

In [None]:
# Build the model identified by 'lr' with cross-validation switched off.
# NOTE: this rebinds the notebook-wide name `model`.
model = create_model('lr', cross_validation=False)

## Logistic Regression using PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Drop the label column once and reuse the result for both the input width
# and the float32 feature tensor.
torch_train_features = train_df_prep.drop(columns=['fit'])

input_dim = torch_train_features.shape[1]
output_dim = 3
inputs = torch.tensor(torch_train_features.to_numpy(), dtype=torch.float32)
# Labels are cast to int64 before being passed to the loss.
labels = torch.tensor(train_df_prep['fit'].to_numpy(), dtype=torch.long)


# Define the model
class LogisticRegression(nn.Module):
    """Single linear layer that maps features to one raw score per class.

    No softmax is applied in ``forward``; the output is the linear layer's
    result as-is.

    NOTE(review): this name shadows the scikit-learn ``LogisticRegression``
    imported in earlier cells; re-running those cells after this one will
    pick up this class instead.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim,
                                out_features=output_dim)

    def forward(self, x):
        logits = self.linear(x)
        return logits

# Hyperparameters for the full-batch training loop below.
num_epochs = 100
learning_rate = 0.01
lamda = 1  # weight of the (unsquared) L2-norm penalty on the linear weights

# Seed the RNG so the random weight initialisation is reproducible.
torch.manual_seed(0)

# Define the model, the loss function and the optimizer.
# The optimizer now takes its step size from `learning_rate`; previously the
# value was hard-coded and editing the constant above had no effect.
model = LogisticRegression(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Train the model: every epoch is one gradient step on the whole training set.
for epoch in range(num_epochs):
    # Forward pass: cross-entropy plus the norm penalty on the weight matrix.
    outputs = model(inputs)
    loss = criterion(outputs, labels) + lamda * torch.norm(model.linear.weight)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Test the model: the predicted class is the index of the largest score.
test_inputs = torch.tensor(test_df_prep.drop('fit', axis=1).values,
                           dtype=torch.float32)
with torch.no_grad():
    # Inside no_grad the output carries no graph, so `.data` is unnecessary.
    _, predicted = torch.max(model(test_inputs), 1)
    y_pred = predicted.numpy()

evaluate_model(test_df_prep['fit'], y_pred)
