## Initialize

In [190]:
# Standard library and scientific stack.
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

# Put ../src (resolved against the current working directory) on the import
# path so the local modules imported below can be found.
# NOTE(review): presumably `utils` and `preprocess` live in ../src -- confirm.
sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Figure typography: serif text in Times New Roman, with mathtext rendered
# using the 'cm' font set.
mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

import utils
import preprocess

# Reload the local modules so that edits made to them on disk are picked up
# when this cell is re-run, without restarting the kernel.
importlib.reload(utils)
importlib.reload(preprocess)

# These from-imports come after the reload so the names bind to the reloaded
# module objects. The star import supplies Preprocessor and the transformer
# classes used by the cells below.
from utils import fetch_train_data, describe_data, evaluate_model, train_test_split
from preprocess import *

# Load the raw data and hold out 20% of it as a test split.
df = fetch_train_data()

train_df, test_df = train_test_split(df, test_size=0.2)

prep = Preprocessor()

# Training split: cleanse in training mode, renumber the rows, then discard
# rows whose target ('fit') is missing.
train_df = prep.cleanse(train_df, is_train=True)
train_df.reset_index(drop=True, inplace=True)
train_df.dropna(subset=['fit'], inplace=True)

# Test split: recode the textual fit labels as '1'/'2'/'3' before cleansing.
# The result is assigned back to the frame rather than calling
# replace(inplace=True) on the selected column: that form is a chained
# assignment, which emits a FutureWarning on pandas 2.x and leaves `test_df`
# untouched under Copy-on-Write, so the labels would silently stay unmapped.
# NOTE(review): the training split gets no explicit recoding here --
# presumably cleanse(is_train=True) takes care of it; confirm.
test_df = test_df.assign(fit=test_df['fit'].replace({
    'Small': '1',
    'True to Size': '2',
    'Large': '3'
}))
test_df = prep.cleanse(test_df)
test_df.reset_index(drop=True, inplace=True)
test_df.dropna(subset=['fit'], inplace=True)

## Transform data

In [199]:
# Column groups consumed by the individual pipeline stages.
cols_to_drop = [
    'user_name', 'review', 'review_summary', 'rating', 'size', 'item_name'
]
cols_one_hot = [
    'size_scheme', 'size_main', 'size_suffix', 'brand', 'category',
    'rented_for', 'body_type'
]
cols_ordinal = ['fit', 'cup_size']
cols_standardize = ['age', 'weight', 'height', 'bust_size', 'cup_size']
cols_minmax = ['price', 'usually_wear']
# Same columns as the standardized group, but a separate list object so the
# two transformers never share a mutable argument.
cols_mean_impute = list(cols_standardize)
cols_median_impute = ['usually_wear']

# Stages are listed in the order they are handed to the Preprocessor.
prep.pipeline = [
    DropColumns(cols_to_drop),
    OneHotEncoder(cols_one_hot, name='one_hot'),
    OrdinalEncoder(cols_ordinal),
    StandardScaler(cols_standardize),
    MinMaxScaler(cols_minmax),
    # append the output of 'one_hot' to the input of the next transformer
    SelectOutputColumns('one_hot'),
    MeanImputer(cols_mean_impute),
    MedianImputer(cols_median_impute),
]

# Work on copies so the cleansed frames stay available to later cells;
# the pipeline is fitted on the training split only and then applied to test.
train_df_prep = prep.fit_transform(train_df.copy())
test_df_prep = prep.transform(test_df.copy())

# Sanity check: sum of the 'nan_count' column reported for the transformed
# test frame.
describe_data(test_df_prep)['nan_count'].sum()


<class 'preprocess.DropColumns'>
<class 'preprocess.OneHotEncoder'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.StandardScaler'>
<class 'preprocess.MinMaxScaler'>
<class 'preprocess.SelectOutputColumns'>
<class 'preprocess.MeanImputer'>
<class 'preprocess.MedianImputer'>


0

## Profiling report

In [8]:
# Optional profiling report for the cleansed training frame.
# Currently disabled: every statement in this cell is commented out, so
# running the cell does nothing.
# from pandas_profiling import ProfileReport

# profile = ProfileReport(train_df, minimal=True)
# profile.to_notebook_iframe()

## Auto ML with PyCaret
Last modified: Jan 7

In [13]:
from pycaret.classification import *
import warnings

# Feature roles handed to PyCaret's setup().
numeric_features = [
    'price', 'usually_wear', 'age', 'height', 'weight', 'bust_size'
]
categorical_features = [
    'category', 'brand', 'item_name', 'size', 'rented_for', 'body_type'
]
# Ordinal feature -> its categories, listed in rank order.
ordinal_features = {
    'cup_size':
    ['AA', 'A', 'B', 'C', 'D', 'D+', 'DD', 'DDD/E', 'F', 'G', 'H', 'I', 'J']
}

# Columns kept out of the experiment. They must be removed from BOTH frames:
# PyCaret requires `data` and `test_data` to have matching columns, and the
# original cell only dropped them from the training frame.
excluded_columns = ['user_name', 'review', 'review_summary', 'rating']

# Silence library warnings for the duration of setup() only; the previous
# warning filters are restored when the block exits.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    setup(
        data=train_df.drop(columns=excluded_columns),
        # errors='ignore' because cleanse() ran without is_train=True on the
        # test split and may already have removed some of these columns.
        # NOTE(review): confirm against Preprocessor.cleanse.
        test_data=test_df.drop(columns=excluded_columns, errors='ignore'),
        target='fit',
        numeric_features=numeric_features,
        categorical_features=categorical_features,
        ordinal_features=ordinal_features,
        preprocess=False,
        session_id=0,  # fixed seed for reproducible runs
    )


Unnamed: 0,Description,Value
0,Session id,0
1,Target,fit
2,Target type,Multiclass
3,Target mapping,"1: 0, 2: 1, 3: 2"
4,Original data shape,"(59827, 14)"
5,Transformed data shape,"(110064, 27)"
6,Transformed train set shape,"(98166, 27)"
7,Transformed test set shape,"(11898, 27)"
8,Ordinal features,1
9,Numeric features,6


In [14]:
# Run PyCaret's model comparison with its default settings (compare_models
# comes from the star import of pycaret.classification) and keep the returned
# object in `best_model` for inspection in the next cell.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.6906,0.6772,0.6906,0.6544,0.6472,0.2073,0.2327,4.867
rf,Random Forest Classifier,0.6803,0.6232,0.6803,0.6233,0.5963,0.0857,0.1262,6.437
gbc,Gradient Boosting Classifier,0.5703,0.5934,0.5703,0.5867,0.5745,0.1424,0.1444,27.781
lda,Linear Discriminant Analysis,0.513,0.688,0.513,0.6427,0.5318,0.2338,0.2628,1.308
lr,Logistic Regression,0.5076,0.6902,0.5076,0.6421,0.5257,0.2312,0.2616,9.479
lightgbm,Light Gradient Boosting Machine,0.5037,0.5247,0.5037,0.5404,0.5087,0.0476,0.0497,2.527
dt,Decision Tree Classifier,0.4843,0.5531,0.4843,0.5697,0.5086,0.108,0.1147,1.721
svm,SVM - Linear Kernel,0.4545,0.0,0.4545,0.5208,0.3862,0.1263,0.1802,4.772
qda,Quadratic Discriminant Analysis,0.4401,0.6083,0.4401,0.5828,0.4607,0.1405,0.1615,1.32
ada,Ada Boost Classifier,0.4373,0.5727,0.4373,0.583,0.4514,0.1425,0.166,3.215


In [17]:
# Show the repr of the object returned by compare_models() above.
best_model

## Logistic Regression
Last modified: Jan 3

-   NaN dropped
-   Oversampled data

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted
0,0.583762,0.578526,0.583762,0.57629,0.57629
0,0.401826,0.404246,0.481876,0.361264,0.443726
