## Initialize

In [231]:
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

import utils
import preprocess

importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model, train_test_split
from preprocess import *

df = fetch_train_data(path='../data/train_data_all_filled.json')
# df = fetch_train_data()

train_df, test_df = train_test_split(df, test_size=0.2)

prep = Preprocessor()
train_df = prep.cleanse(train_df, is_train=True)
train_df.dropna(subset=['fit'], inplace=True)

test_df = prep.cleanse(test_df)
test_df.dropna(subset=['fit'], inplace=True)

describe_data(test_df)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,category,17511,0,3
item_name,object,17511,0,3604
brand,object,17476,35,476
category,object,17511,0,68
size,object,17511,0,137
size_main,object,16194,1317,59
size_suffix,object,2387,15124,5
size_scheme,object,17437,74,4
price,float64,17511,0,454
rented_for,object,16064,1447,8


## Transform data

In [232]:
prep.pipeline = [
    DropColumns([
        'user_name', 'review', 'review_summary', 'rating', 'size', 'item_name'
    ]),
    OneHotEncoder([
        'size_scheme', 'size_main', 'size_suffix', 'brand', 'category',
        'rented_for', 'body_type'
    ],
                  name='one_hot'),
    OrdinalEncoder(['fit', 'cup_size']),
    StandardScaler(['age', 'weight', 'height', 'bust_size', 'cup_size']),
    MinMaxScaler(['price', 'usually_wear']),
    SelectOutputColumns(
        'one_hot'
    ),  # append the output of 'one_hot' to the input of the next transformer
    MeanImputer(['age', 'weight', 'height', 'bust_size', 'cup_size']),
    MedianImputer(['usually_wear']),
]

train_df_prep, test_df_prep = train_df.copy(), test_df.copy()
train_df_prep = prep.fit_transform(train_df_prep)
test_df_prep = prep.transform(test_df_prep)

describe_data(train_df_prep)['nan_count'].sum()


<class 'preprocess.DropColumns'>
<class 'preprocess.OneHotEncoder'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.StandardScaler'>
<class 'preprocess.MinMaxScaler'>
<class 'preprocess.SelectOutputColumns'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.MedianImputer'>


0

## Profiling report

In [223]:
# from pandas_profiling import ProfileReport

# profile = ProfileReport(test_df, minimal=True)
# profile.to_notebook_iframe()

## Auto ML with PyCaret
Last modified: Jan 7

In [210]:
from pycaret.classification import *
import warnings

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    setup(
        data=train_df_prep,
        test_data=test_df_prep,
        target='fit',
        preprocess=False,
        session_id=0,
    )


Unnamed: 0,Description,Value
0,Session id,0
1,Target,fit
2,Target type,Multiclass
3,Original data shape,"(59827, 655)"
4,Transformed data shape,"(59827, 655)"
5,Transformed train set shape,"(47929, 655)"
6,Transformed test set shape,"(11898, 655)"
7,Numeric features,654
8,Rows with missing values,80.1%


In [211]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.6827,0.5,0.6827,0.4661,0.554,0.0,0.0,0.028


[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not

In [17]:
best_model

## Logistic Regression
Last modified: Jan 3

-   NaN dropped
-   Oversampled data

## Ordinal Regression with statsmodels

In [215]:
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

model = OrderedModel(
    train_df_prep['fit'],
    train_df_prep.drop('fit', axis=1),
    distr='logit',
    
)
model.fit()

model.summary()

ValueError: There should not be a constant in the model