In [144]:
# Standard library and numerical stack.
import os
import sys
import importlib
import numpy as np
import pandas as pd
import scipy

# Make ../src importable from this notebook (presumably where the local
# `utils` and `preprocess` modules live -- confirm against the repo layout).
sys.path.append(os.path.abspath('../src'))

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: serif text in Times New Roman, Computer Modern for math text.
mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

# Project-local modules.
import utils
import preprocess

# Reload so that edits to the local modules are picked up without restarting
# the kernel.
importlib.reload(utils)
importlib.reload(preprocess)

# NOTE: `train_test_split` is the project's own helper from `utils`, not
# sklearn.model_selection.train_test_split.
from utils import fetch_train_data, describe_data, evaluate_model, train_test_split
from preprocess import Preprocessor

# Load the raw training data and hold out 20% of it for evaluation.
df = fetch_train_data()

# NOTE(review): `train_test_split` is the helper from `utils`; whether it
# shuffles or accepts a seed is not visible here -- confirm the split is
# reproducible across kernel restarts.
train_df, test_df = train_test_split(df, test_size=0.2)

train_df = Preprocessor().cleanse(train_df, is_train=True)
# Drop unlabeled rows first and renumber afterwards, so the resulting index is
# contiguous (0..n-1) and lines up with positional arrays built from this frame.
train_df = train_df.dropna(subset=['fit']).reset_index(drop=True)

# Recode the textual fit labels before cleansing the held-out split.
# Assign the result back instead of calling `replace(..., inplace=True)` on the
# selected column: the chained in-place form does not update `test_df` under
# pandas Copy-on-Write and warns on recent pandas versions.
test_df = test_df.assign(fit=test_df['fit'].replace({
    'Small': '1',
    'True to Size': '2',
    'Large': '3'
}))
test_df = Preprocessor().cleanse(test_df)
test_df = test_df.dropna(subset=['fit']).reset_index(drop=True)

desc_df = describe_data(test_df)
desc_df

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,category,11898,0,3
brand,category,11868,30,466
item_name,category,11898,0,3405
category,category,11898,0,68
size,category,10996,902,123
price,float64,11898,0,443
rented_for,category,10476,1422,8
usually_wear,float64,11848,50,24
age,float64,11553,345,61
height,float64,7869,4029,21


In [122]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# Numeric columns: fill missing values with the column median, then
# standardise to zero mean / unit variance.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: one-hot encode with at most 400 output columns per
# input feature (rarer levels are pooled into an "infrequent" bucket, which
# also receives categories unseen during fit when such a bucket exists).
# NOTE(review): the encoder output should not contain NaN, so the trailing
# mean imputer looks like a no-op -- confirm before removing it.
cat_pipeline = Pipeline(steps=[
    (
        'one_hot',
        OneHotEncoder(handle_unknown='infrequent_if_exist',
                      max_categories=400),
    ),
    ('imputer', SimpleImputer(strategy='mean')),
])


def get_user_data(df: pd.DataFrame, pipe: "ColumnTransformer | None" = None):
    """Build the user-feature matrix (age, body measurements, body type).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns 'age', 'height', 'weight', 'body_type',
        'bust_size' and 'cup_size'. 'cup_size' must be categorical, because
        its category codes are used as the numeric value.
    pipe : fitted transformer, optional
        Object exposing ``transform``. When omitted, a new ``ColumnTransformer``
        is built from ``num_pipeline`` / ``cat_pipeline`` and fitted on ``df``.

    Returns
    -------
    (features, pipe) when ``pipe`` is None (the transformer was fitted here),
    otherwise just ``features`` as produced by ``pipe.transform``.
    """
    user_data = df[[
        'age', 'height', 'weight', 'body_type', 'bust_size', 'cup_size'
    ]].copy()

    # Fill missing body types BEFORE casting to str: once cast, NaN has already
    # become the literal string 'nan' and a later fillna() can no longer see it.
    # Going through object dtype also avoids the "new category" error that a
    # direct fillna on a categorical column would raise.
    body_type = user_data['body_type'].astype(object)
    user_data['body_type'] = body_type.where(body_type.notna(),
                                             'infrequent').astype(str)

    # Category codes serve as the numeric cup size; pandas encodes a missing
    # category as -1, which is mapped back to NaN so the imputer can handle it.
    # Results are assigned back to the frame rather than mutated through a
    # chained `inplace=True` call, which is a no-op under Copy-on-Write.
    # NOTE(review): codes are only meaningful as magnitudes if the categories
    # are ordered by size -- confirm in Preprocessor.
    cup_codes = user_data['cup_size'].cat.codes.astype('float64')
    user_data['cup_size'] = cup_codes.where(cup_codes >= 0)

    if pipe is None:
        pipe = ColumnTransformer([
            ('num', num_pipeline,
             ['age', 'height', 'weight', 'bust_size', 'cup_size']),
            ('cat', cat_pipeline, ['body_type']),
        ])
        return pipe.fit_transform(user_data), pipe

    return pipe.transform(user_data)


# Fit the user-feature transformer on the training split; `pipe` is kept so
# the same fitted transformer can be applied to other splits later.
user_data, pipe = get_user_data(train_df, pipe=None)
pd.DataFrame(data=user_data).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,47929.0,47929.0,47929.0,47929.0,47929.0,47929.0,47929.0,47929.0,47929.0,47929.0,47929.0,47929.0,47929.0
mean,9.487937e-17,-1.587154e-15,2.244045e-15,1.497315e-16,-1.128175e-16,0.021928,0.165265,0.053162,0.201444,0.085898,0.075633,0.057502,0.339168
std,1.00001,1.00001,1.00001,1.00001,1.00001,0.146451,0.371424,0.224359,0.401083,0.280216,0.264412,0.232801,0.473432
min,-2.921737,-5.248166,-3.234057,-3.351189,-2.002336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7514496,-0.5668836,-0.5299837,-0.1020476,-0.8334317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.1313675,-0.09875534,-0.1504647,-0.1020476,-0.2489796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.5920616,0.3693729,0.4188139,0.9809996,0.3354725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,5.862759,5.518784,8.483594,7.479283,5.01109,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [173]:
def get_Xy(df: pd.DataFrame):
    """Split a cleansed frame into model features and the encoded target.

    Returns a copy of the thirteen feature columns, with every categorical
    column converted to plain ``object`` dtype, together with the integer
    category codes of the 'fit' column. The input frame is not modified.
    """
    feature_names = [
        'brand', 'category', 'item_name', 'size', 'price', 'rented_for',
        'usually_wear', 'age', 'height', 'weight', 'body_type', 'bust_size',
        'cup_size'
    ]
    features = df[feature_names].copy()
    target = df['fit'].cat.codes

    # Categorical -> object, with missing entries normalised to NaN.
    categorical = features.select_dtypes(include=['category']).columns
    as_object = features[categorical].astype('object', copy=False)
    features[categorical] = as_object
    features[categorical] = features[categorical].fillna(np.nan)
    return features, target


X_train, y_train = get_Xy(train_df)
X_test, y_test = get_Xy(test_df)

# Route float columns through the numeric pipeline and object columns through
# the categorical one; column lists are derived from the training features.
numeric_features = X_train.select_dtypes(include=['float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_features),
    ('cat', cat_pipeline, categorical_features),
])

# Fit on the training split only, then apply the fitted transform to test.
X_train = full_pipeline.fit_transform(X_train)
X_test = full_pipeline.transform(X_test)

# Shared oversampler and classifier used by the experiments below.
smote = SMOTE(random_state=0)
log_reg = LogisticRegression(max_iter=1000)


def fit_model_with_user_vector(n_clusters):
    """Fit the classifier on item features plus a clustered "user vector".

    Users are clustered with KMeans on the module-level ``user_data``; every
    row is then represented by the centroid of its cluster, and that centroid
    is appended to the item features. The module-level ``log_reg`` is refit on
    the SMOTE-resampled training data and scored on train and test.

    Reads the globals ``user_data``, ``pipe``, ``X_train``, ``X_test``,
    ``y_train``, ``y_test``, ``test_df``, ``smote`` and ``log_reg``; refitting
    ``log_reg`` is the only global it modifies.

    Parameters
    ----------
    n_clusters : int
        Number of KMeans clusters.

    Returns
    -------
    pd.DataFrame
        Two rows of ``evaluate_model`` output, indexed
        ``'train (n=<n_clusters>)'`` and ``'test (n=<n_clusters>)'``.

    Raises
    ------
    ValueError
        If ``user_data`` and ``X_train`` have different row counts, i.e. they
        were built from different training frames (stale notebook state).
    """
    from sklearn.cluster import KMeans

    # The user vectors are stacked next to X_train row by row, so both must
    # come from the same training frame. Fail early with a clear message if an
    # earlier cell was not re-run after the data was re-split or resampled.
    if user_data.shape[0] != X_train.shape[0]:
        raise ValueError(
            f'user_data has {user_data.shape[0]} rows but X_train has '
            f'{X_train.shape[0]}; re-run the cells that build them from the '
            f'current train_df')

    # Fixed seed so repeated runs give the same clusters (and metrics).
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    train_clusters = kmeans.fit_predict(user_data)
    # One indexing operation maps each row to its cluster centroid.
    X_train_uv = kmeans.cluster_centers_[train_clusters]

    test_clusters = kmeans.predict(get_user_data(test_df, pipe))
    X_test_uv = kmeans.cluster_centers_[test_clusters]

    X_train_ext = scipy.sparse.hstack([X_train, X_train_uv], format='csr')
    X_test_ext = scipy.sparse.hstack([X_test, X_test_uv], format='csr')
    y_train_ext = y_train.copy()

    X_train_ext, y_train_ext = smote.fit_resample(X_train_ext, y_train_ext)
    log_reg.fit(X_train_ext, y_train_ext)
    y_train_pred = log_reg.predict(X_train_ext)
    y_test_pred = log_reg.predict(X_test_ext)

    # NOTE(review): the "train" row is scored on the SMOTE-resampled set
    # (synthetic samples included), so it is not directly comparable with the
    # "test" row, which uses the original class distribution.
    res = pd.concat([
        evaluate_model(y_train_ext, y_train_pred),
        evaluate_model(y_test, y_test_pred)
    ])
    res.index = [f'train (n={n_clusters})', f'test (n={n_clusters})']
    return res


# Cluster counts to sweep: 100, 200, ..., 500.
n_list = [100 * i for i in range(1, 6)]

res = pd.concat([fit_model_with_user_vector(n) for n in n_list])

# Baseline without user cluster info: same oversampling and classifier, item
# features only. This rebinds X_train / y_train to the resampled data, so it
# must run after the sweep above.
X_train, y_train = smote.fit_resample(X_train, y_train)
log_reg.fit(X_train, y_train)
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

res = pd.concat([
    res,
    evaluate_model(y_train, y_train_pred, index='train (no user vector)'),
    evaluate_model(y_test, y_test_pred, index='test (no user vector)')
])
# Format every metric as a percentage. A per-column Series.map is used instead
# of DataFrame.applymap, which is deprecated in recent pandas releases.
res.sort_values('f1', ascending=False).apply(
    lambda col: col.map('{:.2%}'.format))

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted
train (n=500),60.10%,59.75%,60.10%,59.82%,59.82%
train (n=300),60.02%,59.66%,60.02%,59.74%,59.74%
train (n=200),59.99%,59.62%,59.99%,59.70%,59.70%
train (n=400),59.94%,59.59%,59.94%,59.67%,59.67%
train (n=100),59.90%,59.55%,59.90%,59.63%,59.63%
train (no user vector),59.83%,59.47%,59.83%,59.54%,59.54%
test (n=500),49.16%,46.04%,53.30%,45.24%,51.88%
test (n=200),48.87%,46.03%,53.34%,45.11%,51.59%
test (n=400),49.02%,45.86%,53.06%,45.09%,51.73%
test (n=300),48.84%,45.82%,53.03%,44.98%,51.56%
