# Test Script

## Data input & Preprocess

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split
from preprocess import *

PATH = '../data/train_data_all_filled.json'
#df = pd.read_json(PATH)

# Using orginal data (No BERT predict labels)
df = fetch_train_data()

In [2]:
train_df, test_df = train_test_split(df, test_size=0.2)

prep = Preprocessor()
train_df = prep.cleanse(train_df, is_train=True)
train_df.dropna(subset=['fit'], inplace=True)

test_df = prep.cleanse(test_df)
test_df.dropna(subset=['fit'], inplace=True)

describe_data(test_df)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,category,11940,0,3
item_name,object,11898,42,3451
brand,object,11910,30,505
category,object,11898,42,68
size,object,11940,0,124
size_main,object,10981,959,52
size_suffix,object,1621,10319,5
size_scheme,object,11861,79,4
price,float64,11898,42,443
rented_for,object,10476,1464,8


In [3]:
train_df = prep.handle_size_mapping(train_df, is_train=True)
test_df = prep.handle_size_mapping(test_df, is_train=False)

In [4]:
prep.pipeline = [
    DropColumns(
        cols=['user_name', 'review', 'review_summary', 'rating', 'item_name']),
    OneHotEncoder(cols=[
        'size_scheme', 'size_main', 'size_suffix', 'rented_for', 'body_type'
    ],
                  name='one_hot'),
    OrdinalEncoder(cols=['fit', 'cup_size']),
    StandardScaler(cols=['age', 'weight', 'height', 'bust_size', 'cup_size']),
    TargetEncoder(cols=['brand', 'category', 'size'],
                  target_cols=['weight', 'height', 'bust_size', 'cup_size'],
                  name='target_encoder'),
    DropColumns(cols=['brand', 'category', 'size']),
    MinMaxScaler(cols=['price', 'usually_wear']),
    SelectOutputColumns(
        target='target_encoder'
    ),  # append the output of 'one_hot' to the input of the next transformer
    MeanImputer(cols=['age', 'weight', 'height', 'bust_size', 'cup_size', 'price']),
    MedianImputer(cols=['usually_wear']),
]

train_df_prep, test_df_prep = train_df.copy(), test_df.copy()
train_df_prep = prep.fit_transform(train_df_prep)
test_df_prep = prep.transform(test_df_prep)

# describe_data(train_df_prep)['nan_count'].sum()
describe_data(train_df_prep)

<class 'preprocess.DropColumns'>
<class 'preprocess.OneHotEncoder'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.StandardScaler'>
<class 'preprocess.TargetEncoder'>
<class 'preprocess.DropColumns'>
<class 'preprocess.MinMaxScaler'>
<class 'preprocess.SelectOutputColumns'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.MedianImputer'>


Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,int8,47929,0,3
price,float64,47929,0,464
usually_wear,float64,47929,0,48
age,float64,47929,0,75
height,float64,47929,0,25
...,...,...,...,...
category_cup_size,float64,47929,0,69
size_weight,float64,47929,0,134
size_height,float64,47929,0,126
size_bust_size,float64,47929,0,115


In [5]:
from utils import data_augmentation

x_train_noaug = train_df_prep.drop('fit', axis=1)
y_train_noaug = train_df_prep['fit']

train_df_prep_aug = data_augmentation(train_df_prep, ['cup_size', 'bust_size', 'weight', 'height'], large_ratio=2.7, small_ratio=3.6)
train_df_prep_aug['fit'].value_counts()

1    32722
0    32540
2    30092
Name: fit, dtype: int64

In [6]:
x_train_noaug['cup_size']

3       -2.762857e-01
5        2.926453e+00
6        1.858873e+00
7        2.575040e-01
8        2.575040e-01
             ...     
70208   -1.343865e+00
70209    2.575040e-01
70210    2.575040e-01
70211   -8.100755e-01
70212    5.385629e-17
Name: cup_size, Length: 47929, dtype: float64

In [7]:
x_train = train_df_prep_aug.drop('fit', axis=1)
x_val = test_df_prep.drop('fit', axis=1)
y_train = train_df_prep_aug['fit']
y_val = test_df_prep['fit']

## Model Evaluation

### Self designed Logistic classifier

In [8]:
# with data augmentation
from models import LogisticClassifier
from utils import evaluate_model

clf = LogisticClassifier(max_iter=1000, learning_rate=0.1, random_state=10, alpha=0.5)
#w0 = np.random.randn(x_train.shape[1] + 1, 3)
clf.fit(x_train, y_train, verbose=False)


print('==self implemented clf==')

y_pred_clf = clf.predict(x_val)
print(f'val score: {np.mean(y_pred_clf == y_val)}')
print(evaluate_model(y_val, y_pred_clf))

y_pred_clf = clf.predict(x_train)
print(f'Train score: {np.mean(y_pred_clf == y_train)}')
print(evaluate_model(y_train, y_pred_clf))

100%|██████████| 1000/1000 [00:45<00:00, 22.21it/s]


==self implemented clf==
val score: 0.4886934673366834
        accuracy  precision    recall        f1  f1_weighted  #small  \
result  0.488693   0.354471  0.357644  0.352428     0.509886    2078   

        #true2size  #large  
result        7064    2798  
Train score: 0.7340961050401661
        accuracy  precision    recall        f1  f1_weighted  #small  \
result  0.734096   0.731257  0.735502  0.732124     0.731487   33825   

        #true2size  #large  
result       28999   32530  


In [9]:
# gradient descent has almost ended
clf.grad

array([[ 6.24500451e-17,  6.24500451e-17, -1.56125113e-17],
       [-7.80625564e-18,  4.33680869e-18, -1.95156391e-18],
       [ 8.67361738e-18,  8.67361738e-18,  1.08420217e-18],
       [ 4.33680869e-19,  4.33680869e-18,  1.08420217e-18],
       [ 9.71445147e-17,  1.73472348e-17,  1.11022302e-16],
       [-9.71445147e-17, -1.73472348e-17, -1.11022302e-16],
       [-1.38777878e-16, -6.93889390e-18, -1.24900090e-16],
       [ 8.32667268e-17, -6.93889390e-18,  1.24900090e-16],
       [ 1.21430643e-17,  1.21430643e-17,  3.46944695e-17],
       [ 2.77555756e-17, -4.85722573e-17,  3.90312782e-18],
       [-6.93889390e-18, -1.73472348e-18,  3.46944695e-18],
       [-4.87890978e-19,  2.43945489e-19,  6.77626358e-20],
       [ 6.09863722e-20, -1.21972744e-19,  6.77626358e-20],
       [-1.21430643e-17,  1.21430643e-17, -6.07153217e-18],
       [-1.56125113e-17, -1.56125113e-17, -4.33680869e-18],
       [-8.67361738e-18, -4.33680869e-18,  2.16840434e-18],
       [ 4.33680869e-18,  6.07153217e-18

In [10]:
# Compare data augmentation to SMOTE algorithm
from imblearn.over_sampling import SMOTE

sm = SMOTE()
x_res, y_res = sm.fit_resample(x_train_noaug, y_train_noaug)

clf_sm = LogisticClassifier(max_iter=100, learning_rate=0.25, random_state=4, alpha=0.5)
#w0 = np.random.randn(x_train.shape[1] + 1, 3)
clf_sm.fit(x_res, y_res, verbose=False)


print('==self implemented clf SMOTE==')
print(f'train score: {clf_sm.score()}')


y_pred_clf_sm = clf_sm.predict(x_val)
print(f'val score: {np.mean(y_pred_clf_sm == y_val)}')
print(evaluate_model(y_val, y_pred_clf_sm))

y_pred_clf_sm = clf_sm.predict(x_res)
print(f'train score: {np.mean(y_pred_clf_sm == y_res)}')
print(evaluate_model(y_res, y_pred_clf_sm))

100%|██████████| 100/100 [00:04<00:00, 20.59it/s]


==self implemented clf SMOTE==
train score: 0.42294684513986514
val score: 0.39782244556113905
        accuracy  precision    recall        f1  f1_weighted  #small  \
result  0.397822   0.376557  0.399602  0.346463     0.436883    4526   

        #true2size  #large  
result        4506    2908  
train score: 0.42294684513986514
        accuracy  precision    recall        f1  f1_weighted  #small  \
result  0.422947   0.423165  0.422947  0.421543     0.421543   37094   

        #true2size  #large  
result       33356   27716  


In [11]:
# Using split train pattern
from utils import random_split_aggr

random_split_aggr(model=LogisticClassifier(max_iter=100, learning_rate=0.25, random_state=42),
                    X_train=x_train_noaug.to_numpy(), X_test=x_val.to_numpy(),
                    y_train=y_train_noaug.to_numpy(), y_test=y_val.to_numpy(),
                    fit_args=dict())

 10%|█         | 10/100 [00:00<00:00, 96.57it/s]

(array([0, 1, 2], dtype=int8), array([7074, 6545, 8133], dtype=int64))


100%|██████████| 100/100 [00:01<00:00, 94.75it/s]
 10%|█         | 10/100 [00:00<00:00, 96.49it/s]

(array([0, 1, 2], dtype=int8), array([7074, 6545, 8133], dtype=int64))


100%|██████████| 100/100 [00:01<00:00, 94.69it/s]
  9%|▉         | 9/100 [00:00<00:01, 88.69it/s]

(array([0, 1, 2], dtype=int8), array([7074, 6544, 8133], dtype=int64))


100%|██████████| 100/100 [00:01<00:00, 93.26it/s]
 10%|█         | 10/100 [00:00<00:00, 90.69it/s]

(array([0, 1, 2], dtype=int8), array([7074, 6544, 8133], dtype=int64))


100%|██████████| 100/100 [00:01<00:00, 94.18it/s]
 10%|█         | 10/100 [00:00<00:00, 93.13it/s]

(array([0, 1, 2], dtype=int8), array([7074, 6544, 8133], dtype=int64))


100%|██████████| 100/100 [00:01<00:00, 94.51it/s]


Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.308124,0.339589,0.346689,0.284087,0.339296,3233,3250,5457


### Logistic Regression from sklearn 

In [12]:
# with data augmentation
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(multi_class='multinomial')
logreg.fit(x_train, y_train)

y_pred_log = logreg.predict(x_val.to_numpy())
print('Accuracy of logistic regression classifier on val set: {:.2f}'.format(logreg.score(x_val, y_val)))
print(evaluate_model(y_val, y_pred_log))

Accuracy of logistic regression classifier on val set: 0.64
        accuracy  precision    recall        f1  f1_weighted  #small  \
result  0.643049   0.435069  0.391161  0.395445     0.600735     811   

        #true2size  #large  
result        9938    1191  


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# with SMOTE

logreg = LogisticRegression(multi_class='multinomial')
logreg.fit(x_res, y_res)

y_pred_log = logreg.predict(x_val.to_numpy())
print('Accuracy of logistic regression classifier on val set: {:.2f}'.format(logreg.score(x_val, y_val)))
print(evaluate_model(y_val, y_pred_log))

Accuracy of logistic regression classifier on val set: 0.54
        accuracy  precision    recall        f1  f1_weighted  #small  \
result  0.536348   0.411312  0.426228  0.414433     0.554923    2087   

        #true2size  #large  
result        7093    2760  


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# with split train pattern
random_split_aggr(model=LogisticRegression(max_iter=2000),
                    X_train=x_train_noaug.to_numpy(), X_test=x_val.to_numpy(),
                    y_train=y_train_noaug.to_numpy(), y_test=y_val.to_numpy())

(array([0, 1, 2], dtype=int8), array([7074, 6545, 8133], dtype=int64))
(array([0, 1, 2], dtype=int8), array([7074, 6545, 8133], dtype=int64))
(array([0, 1, 2], dtype=int8), array([7074, 6544, 8133], dtype=int64))
(array([0, 1, 2], dtype=int8), array([7074, 6544, 8133], dtype=int64))
(array([0, 1, 2], dtype=int8), array([7074, 6544, 8133], dtype=int64))


Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.354606,0.414443,0.447408,0.345094,0.374789,3284,2804,5852


### Extra Trees Classifier

In [15]:
# with data augmentation
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(n_estimators=100, random_state=0)
etc.fit(x_train, y_train)

y_pred_etc = etc.predict(x_val)
print('Accuracy of Extra Trees Classifier on val set: {:.2f}'.format(etc.score(x_val, y_val)))

from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred_etc))

Accuracy of Extra Trees Classifier on val set: 0.68
              precision    recall  f1-score   support

           0       0.39      0.18      0.25      1710
           1       0.72      0.92      0.81      8201
           2       0.36      0.11      0.17      2029

    accuracy                           0.68     11940
   macro avg       0.49      0.40      0.41     11940
weighted avg       0.61      0.68      0.62     11940



In [16]:
# with SMOTE
etc = ExtraTreesClassifier(n_estimators=100, random_state=0)
etc.fit(x_res, y_res)

y_pred_etc = etc.predict(x_val)
print('Accuracy of Extra Trees Classifier on val set: {:.2f}'.format(etc.score(x_val, y_val)))

from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred_etc))

Accuracy of Extra Trees Classifier on val set: 0.65
              precision    recall  f1-score   support

           0       0.34      0.26      0.30      1710
           1       0.73      0.85      0.79      8201
           2       0.34      0.17      0.22      2029

    accuracy                           0.65     11940
   macro avg       0.47      0.43      0.44     11940
weighted avg       0.61      0.65      0.62     11940



### Self implemented ordinal classifier

In [17]:
# with data augmentation
from models import OrdinalClassifier

ordclf = OrdinalClassifier(learning_rate=0.1, max_iter=200)
ordclf.fit(x_train, y_train)

from sklearn.metrics import classification_report
print(classification_report(y_val, ordclf.predict(x_val)))

              precision    recall  f1-score   support

           0       0.14      0.77      0.24      1710
           1       0.64      0.13      0.22      8201
           2       0.20      0.08      0.12      2029

    accuracy                           0.21     11940
   macro avg       0.33      0.33      0.19     11940
weighted avg       0.49      0.21      0.20     11940



In [18]:
ordclf.clf_low.grad

array([ 3.99360322e-02,  4.55244217e-03,  4.43506972e-03, -2.04974877e-02,
       -2.59579438e-02, -2.05136599e-03,  2.77149227e-03,  3.76588356e-02,
       -3.89235973e-02,  2.12639320e-02,  1.83942268e-02,  2.21307497e-04,
        6.64292744e-05,  7.57311823e-03, -7.35775090e-04,  5.98340932e-03,
        8.48450828e-03,  2.05092626e-03,  7.09816216e-03,  1.74177037e-03,
        4.54216199e-03, -1.78124785e-03,  4.92384300e-03, -3.09456136e-04,
        1.05115824e-03,  1.23842084e-04,  8.47139141e-05,  1.72454021e-04,
        5.72505764e-04,  5.22363888e-04,  4.54350406e-04,  5.96023435e-04,
        4.05384395e-04, -1.46112977e-04,  1.07827488e-04,  2.41055482e-04,
        1.77228423e-04,  3.22814542e-04,  1.38212677e-04,  6.39046153e-05,
        2.39580040e-04,  1.76569574e-04,  9.77864814e-05,  9.39893839e-05,
        1.08279751e-04, -2.44911625e-05,  5.13599015e-05,  2.05270170e-04,
       -7.99543145e-05,  4.51297099e-05,  6.18303299e-05, -3.44017127e-05,
        1.30957699e-04,  

In [19]:
# with split train pattern
random_split_aggr(model=OrdinalClassifier(learning_rate=0.1),
                    X_train=x_train_noaug.to_numpy(), X_test=x_val.to_numpy(),
                    y_train=y_train_noaug.to_numpy(), y_test=y_val.to_numpy(),
                    fit_args=dict())

(array([0, 1, 2], dtype=int8), array([7074, 6545, 8133], dtype=int64))
(array([0, 1, 2], dtype=int8), array([7074, 6545, 8133], dtype=int64))
(array([0, 1, 2], dtype=int8), array([7074, 6544, 8133], dtype=int64))
(array([0, 1, 2], dtype=int8), array([7074, 6544, 8133], dtype=int64))
(array([0, 1, 2], dtype=int8), array([7074, 6544, 8133], dtype=int64))


Unnamed: 0,accuracy,precision,recall,f1,f1_weighted,#small,#true2size,#large
result,0.227219,0.331203,0.331697,0.226161,0.224032,4684,1582,5674
