# Test Script

## Data input & Preprocess

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split
from preprocess import *

PATH = '../data/train_data_all_filled.json'
#df = pd.read_json(PATH)

# Using orginal data (No BERT predict labels)
df = fetch_train_data()

In [2]:
train_df, test_df = train_test_split(df, test_size=0.2)

prep = Preprocessor()
train_df = prep.cleanse(train_df, is_train=True)
train_df.dropna(subset=['fit'], inplace=True)

test_df = prep.cleanse(test_df)
test_df.dropna(subset=['fit'], inplace=True)

describe_data(test_df)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,category,11940,0,3
item_name,object,11898,42,3451
brand,object,11910,30,505
category,object,11898,42,68
size,object,11940,0,124
size_main,object,10981,959,52
size_suffix,object,1621,10319,5
size_scheme,object,11861,79,4
price,float64,11898,42,443
rented_for,object,10476,1464,8


In [3]:
train_df = prep.handle_size_mapping(train_df, is_train=True)
test_df = prep.handle_size_mapping(test_df, is_train=False)

In [4]:
prep.pipeline = [
    DropColumns(
        cols=['user_name', 'review', 'review_summary', 'rating', 'item_name']),
    OneHotEncoder(cols=[
        'size_scheme', 'size_main', 'size_suffix', 'rented_for', 'body_type'
    ],
                  name='one_hot'),
    OrdinalEncoder(cols=['fit', 'cup_size']),
    StandardScaler(cols=['age', 'weight', 'height', 'bust_size', 'cup_size']),
    TargetEncoder(cols=['brand', 'category', 'size'],
                  target_cols=['weight', 'height', 'bust_size', 'cup_size'],
                  name='target_encoder'),
    DropColumns(cols=['brand', 'category', 'size']),
    MinMaxScaler(cols=['price', 'usually_wear']),
    SelectOutputColumns(
        target='target_encoder'
    ),  # append the output of 'one_hot' to the input of the next transformer
    MeanImputer(cols=['age', 'weight', 'height', 'bust_size', 'cup_size', 'price']),
    MedianImputer(cols=['usually_wear']),
]

train_df_prep, test_df_prep = train_df.copy(), test_df.copy()
train_df_prep = prep.fit_transform(train_df_prep)
test_df_prep = prep.transform(test_df_prep)

# describe_data(train_df_prep)['nan_count'].sum()
describe_data(train_df_prep)

<class 'preprocess.DropColumns'>
<class 'preprocess.OneHotEncoder'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.StandardScaler'>
<class 'preprocess.TargetEncoder'>
<class 'preprocess.DropColumns'>
<class 'preprocess.MinMaxScaler'>
<class 'preprocess.SelectOutputColumns'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.MedianImputer'>


Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,int8,47929,0,3
price,float64,47929,0,464
usually_wear,float64,47929,0,48
age,float64,47929,0,75
height,float64,47929,0,25
...,...,...,...,...
category_cup_size,float64,47929,0,69
size_weight,float64,47929,0,134
size_height,float64,47929,0,126
size_bust_size,float64,47929,0,115


In [5]:
from utils import data_augmentation

train_df_prep_aug = data_augmentation(train_df_prep, ['cup_size', 'bust_size', 'weight'], large_ratio=2.7, small_ratio=3.6)
train_df_prep_aug['fit'].value_counts()

1    32722
0    32540
2    30092
Name: fit, dtype: int64

In [6]:
x_train = train_df_prep_aug.drop('fit', axis=1)
x_val = test_df_prep.drop('fit', axis=1)
y_train = train_df_prep_aug['fit']
y_val = test_df_prep['fit']

## Model Evaluation

### Self designed Logistic classifier

In [7]:
# with data augmentation
from models import LogisticClassifier
from utils import evaluate_model

clf = LogisticClassifier(max_iter=1000, learning_rate=0.1, random_state=10, alpha=0.5)
#w0 = np.random.randn(x_train.shape[1] + 1, 3)
clf.fit(x_train, y_train, verbose=False)


print('==self implemented clf==')

y_pred_clf = clf.predict(x_val)
print(f'val score: {np.mean(y_pred_clf == y_val)}')
print(evaluate_model(y_val, y_pred_clf))

y_pred_clf = clf.predict(x_train)
print(f'Train score: {np.mean(y_pred_clf == y_train)}')
print(evaluate_model(y_train, y_pred_clf))

100%|██████████| 1000/1000 [00:44<00:00, 22.41it/s]


==self implemented clf==
val score: 0.4541038525963149
        accuracy  precision    recall       f1  f1_weighted  #small  \
result  0.454104   0.351215  0.353623  0.34277      0.48542    2148   

        #true2size  #large  
result        6324    3468  
Train score: 0.7126811670197369
        accuracy  precision   recall        f1  f1_weighted  #small  \
result  0.712681   0.709523  0.71477  0.708273     0.707623   34190   

        #true2size  #large  
result       26286   34878  


In [8]:
# gradient descent has almost ended
clf.grad

array([[ 4.85722573e-17,  2.08166817e-17,  2.71050543e-19],
       [-8.67361738e-18,  1.30104261e-18, -3.46944695e-18],
       [ 6.93889390e-18,  3.46944695e-18,  3.46944695e-18],
       [ 1.08420217e-19,  3.03576608e-18,  1.73472348e-18],
       [ 3.90312782e-18,  3.90312782e-18, -1.62630326e-19],
       [-1.24900090e-16, -1.73472348e-18, -9.71445147e-17],
       [-9.71445147e-17,  5.42101086e-19, -1.11022302e-16],
       [ 1.11022302e-16,  2.42861287e-17,  1.11022302e-16],
       [ 1.56125113e-17,  6.93889390e-18,  2.08166817e-17],
       [ 2.42861287e-17, -3.46944695e-17,  3.90312782e-18],
       [-1.56125113e-17, -3.46944695e-18,  3.90312782e-18],
       [-4.87890978e-19,  2.71050543e-19,  6.09863722e-20],
       [ 6.77626358e-20, -1.35525272e-19,  6.09863722e-20],
       [-1.56125113e-17,  1.56125113e-17, -3.46944695e-18],
       [-1.38777878e-17, -1.21430643e-17, -1.95156391e-18],
       [-6.07153217e-18, -6.07153217e-18,  4.33680869e-18],
       [ 7.80625564e-18,  3.46944695e-18

In [9]:
# Compare data augmentation to SMOTE algorithm
from imblearn.over_sampling import SMOTE

x_train_noaug = train_df_prep.drop('fit', axis=1)
y_train_noaug = train_df_prep['fit']
sm = SMOTE()
x_res, y_res = sm.fit_resample(x_train_noaug, y_train_noaug)

clf_sm = LogisticClassifier(max_iter=100, learning_rate=0.25, random_state=4, alpha=0.5)
#w0 = np.random.randn(x_train.shape[1] + 1, 3)
clf_sm.fit(x_res, y_res, verbose=False)


print('==self implemented clf SMOTE==')
print(f'train score: {clf_sm.score()}')


y_pred_clf_sm = clf_sm.predict(x_val)
print(f'val score: {np.mean(y_pred_clf_sm == y_val)}')
print(evaluate_model(y_val, y_pred_clf_sm))

y_pred_clf_sm = clf_sm.predict(x_res)
print(f'train score: {np.mean(y_pred_clf_sm == y_res)}')
print(evaluate_model(y_res, y_pred_clf_sm))

100%|██████████| 100/100 [00:04<00:00, 21.33it/s]


==self implemented clf SMOTE==
train score: 0.42294684513986514
val score: 0.39782244556113905
        accuracy  precision    recall        f1  f1_weighted  #small  \
result  0.397822   0.376557  0.399602  0.346463     0.436883    4526   

        #true2size  #large  
result        4506    2908  
train score: 0.42294684513986514
        accuracy  precision    recall        f1  f1_weighted  #small  \
result  0.422947   0.423165  0.422947  0.421543     0.421543   37094   

        #true2size  #large  
result       33356   27716  


In [42]:
# Using split train pattern
from utils import random_split_aggr

random_split_aggr(model=LogisticClassifier(max_iter=100, learning_rate=0.25, random_state=42),
                    X_train=x_train_noaug.to_numpy(), X_test=x_val.to_numpy(),
                    y_train=y_train_noaug.to_numpy(), y_test=y_val.to_numpy(),
                    fit_args=dict())

  3%|▎         | 3/100 [00:00<00:03, 28.31it/s]

(array([0, 1, 2], dtype=int8), array([32540,  6545, 30092], dtype=int64))


100%|██████████| 100/100 [00:04<00:00, 24.75it/s]
  3%|▎         | 3/100 [00:00<00:03, 24.35it/s]

(array([0, 1, 2], dtype=int8), array([32540,  6545, 30092], dtype=int64))


100%|██████████| 100/100 [00:04<00:00, 21.18it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.77it/s]

(array([0, 1, 2], dtype=int8), array([32540,  6544, 30092], dtype=int64))


100%|██████████| 100/100 [00:04<00:00, 22.34it/s]
  2%|▏         | 2/100 [00:00<00:05, 18.82it/s]

(array([0, 1, 2], dtype=int8), array([32540,  6544, 30092], dtype=int64))


100%|██████████| 100/100 [00:04<00:00, 21.31it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.80it/s]

(array([0, 1, 2], dtype=int8), array([32540,  6544, 30092], dtype=int64))


100%|██████████| 100/100 [00:04<00:00, 21.83it/s]


Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.226382,0.344435,0.333682,0.224975,0.22042,3890,1445,6605


### Logistic Regression from sklearn 

In [10]:
# with data augmentation
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(multi_class='multinomial')
logreg.fit(x_train, y_train)

y_pred_log = logreg.predict(x_val.to_numpy())
print('Accuracy of logistic regression classifier on val set: {:.2f}'.format(logreg.score(x_val, y_val)))
print(evaluate_model(y_val, y_pred_log))

Accuracy of logistic regression classifier on val set: 0.63
        accuracy  precision    recall        f1  f1_weighted  #small  \
result   0.63124   0.426201  0.388897  0.393328     0.596232     794   

        #true2size  #large  
result        9662    1484  


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# with SMOTE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(multi_class='multinomial')
logreg.fit(x_res, y_res)

y_pred_log = logreg.predict(x_val.to_numpy())
print('Accuracy of logistic regression classifier on val set: {:.2f}'.format(logreg.score(x_val, y_val)))
print(evaluate_model(y_val, y_pred_log))

Accuracy of logistic regression classifier on val set: 0.54
        accuracy  precision    recall        f1  f1_weighted  #small  \
result  0.536348   0.411312  0.426228  0.414433     0.554923    2087   

        #true2size  #large  
result        7093    2760  


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
# with split train pattern
from utils import random_split_aggr

random_split_aggr(model=LogisticRegression(max_iter=2000),
                    X_train=x_train_noaug.to_numpy(), X_test=x_val.to_numpy(),
                    y_train=y_train_noaug.to_numpy(), y_test=y_val.to_numpy())

(array([0, 1, 2], dtype=int8), array([32540,  6545, 30092], dtype=int64))
(array([0, 1, 2], dtype=int8), array([32540,  6545, 30092], dtype=int64))
(array([0, 1, 2], dtype=int8), array([32540,  6544, 30092], dtype=int64))
(array([0, 1, 2], dtype=int8), array([32540,  6544, 30092], dtype=int64))
(array([0, 1, 2], dtype=int8), array([32540,  6544, 30092], dtype=int64))


Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.190452,0.384446,0.383726,0.193134,0.105963,4359,150,7431


### Extra Trees Classifier

In [12]:
# with data augmentation
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(n_estimators=100, random_state=0)
etc.fit(x_train, y_train)

y_pred_etc = etc.predict(x_val)
print('Accuracy of Extra Trees Classifier on val set: {:.2f}'.format(etc.score(x_val, y_val)))

from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred_etc))

Accuracy of Extra Trees Classifier on val set: 0.67
              precision    recall  f1-score   support

           0       0.36      0.18      0.24      1710
           1       0.72      0.91      0.80      8201
           2       0.34      0.11      0.17      2029

    accuracy                           0.67     11940
   macro avg       0.47      0.40      0.41     11940
weighted avg       0.60      0.67      0.61     11940



In [13]:
# with SMOTE
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(n_estimators=100, random_state=0)
etc.fit(x_res, y_res)

y_pred_etc = etc.predict(x_val)
print('Accuracy of Extra Trees Classifier on val set: {:.2f}'.format(etc.score(x_val, y_val)))

from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred_etc))

Accuracy of Extra Trees Classifier on val set: 0.65
              precision    recall  f1-score   support

           0       0.34      0.26      0.30      1710
           1       0.73      0.85      0.79      8201
           2       0.34      0.17      0.22      2029

    accuracy                           0.65     11940
   macro avg       0.47      0.43      0.44     11940
weighted avg       0.61      0.65      0.62     11940



### Self implemented ordinal classifier

In [49]:
# with data augmentation
from models import OrdinalClassifier

ordclf = OrdinalClassifier(learning_rate=0.1, max_iter=200)
ordclf.fit(x_train, y_train)

from sklearn.metrics import classification_report
print(classification_report(y_val, ordclf.predict(x_val)))

              precision    recall  f1-score   support

           0       0.14      0.61      0.22      1710
           1       0.66      0.15      0.25      8201
           2       0.18      0.22      0.20      2029

    accuracy                           0.23     11940
   macro avg       0.33      0.33      0.22     11940
weighted avg       0.50      0.23      0.23     11940



In [50]:
ordclf.clf_low.grad

array([ 8.98109461e-02,  1.05513564e-02,  1.04462099e-02, -4.25319645e-02,
        3.33203527e-02, -5.36796564e-02,  5.16733926e-02, -3.18877183e-02,
        6.07622037e-03,  6.21975006e-02,  2.72011427e-02,  2.82359140e-04,
        1.62821501e-04, -2.83623260e-03,  3.64001423e-02,  8.94496587e-03,
        7.37023136e-03,  1.24939562e-02,  1.52363775e-03,  3.21384203e-03,
        5.70883545e-03,  2.45649901e-03, -1.26814072e-03,  2.03212676e-03,
        3.95702187e-03,  7.42779265e-04,  8.26117141e-04,  7.72973473e-04,
        2.35754642e-03,  1.06251497e-03,  7.40311368e-04,  1.19620043e-03,
        1.26051712e-03,  7.47957315e-04,  2.70767323e-04,  3.42156326e-04,
        6.84124203e-04,  4.47633461e-04,  1.24972896e-04,  1.55916681e-04,
        2.76983943e-04,  1.52356970e-04,  9.40250759e-06, -5.37172478e-05,
        9.05575124e-06,  3.49787552e-04,  2.20129386e-04,  3.67405986e-04,
        1.35050269e-05,  6.82744308e-05,  3.73802326e-06,  1.07882738e-04,
        1.14681477e-04,  

In [51]:
# with split train pattern
random_split_aggr(model=OrdinalClassifier(learning_rate=0.1),
                    X_train=x_train_noaug.to_numpy(), X_test=x_val.to_numpy(),
                    y_train=y_train_noaug.to_numpy(), y_test=y_val.to_numpy(),
                    fit_args=dict())

(array([0, 1, 2], dtype=int8), array([32540,  6545, 30092], dtype=int64))
(array([0, 1, 2], dtype=int8), array([32540,  6545, 30092], dtype=int64))
(array([0, 1, 2], dtype=int8), array([32540,  6544, 30092], dtype=int64))
(array([0, 1, 2], dtype=int8), array([32540,  6544, 30092], dtype=int64))
(array([0, 1, 2], dtype=int8), array([32540,  6544, 30092], dtype=int64))


Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.251256,0.328245,0.334387,0.233264,0.252401,2360,2022,7558
