## Data Preprocessing

In [58]:
import importlib

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = ['serif']
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['mathtext.fontset'] = 'cm'

import utils
import preprocess

importlib.reload(utils)
importlib.reload(preprocess)

from utils import fetch_train_data, describe_data, evaluate_model
from preprocess import Preprocessor

train_df = fetch_train_data()
train_df = Preprocessor().transform(train_df)
describe_data(train_df)

Unnamed: 0,dtype,valid_count,nan_count,unique_count
fit,category,59827,27749,3
item_name1,category,87402,174,487
item_name2,category,87576,0,4111
size,category,81218,6358,159
price,float64,87576,0,478
rented_for,category,80076,7500,8
usually_wear,float64,87013,563,50
age,float64,85416,2160,84
height,float64,55981,31595,24
weight,float64,70069,17507,173


## Logistic Regression
-   NaN dropped
-   Undersampled data

In [55]:
df = train_df.dropna()

# split train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop('fit', axis=1),
                                                    df['fit'],
                                                    test_size=0.2)

# feature engineering
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

num_cols = X_train.select_dtypes(include=['float64']).columns
cat_cols = X_train.select_dtypes(include=['category']).columns

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])

X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)

# under sampling
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
X_train_prepared, y_train = rus.fit_resample(X_train_prepared, y_train)

# train model
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(multi_class='multinomial', max_iter=100, C=0.01)
log_reg.fit(X_train_prepared, y_train)

# evaluate model
from sklearn.metrics import accuracy_score, f1_score

y_train_pred = log_reg.predict(X_train_prepared)
y_test_pred = log_reg.predict(X_test_prepared)

pd.concat([
    evaluate_model(y_train, y_train_pred),
    evaluate_model(y_test, y_test_pred)
])

Unnamed: 0,accuracy,precision,recall,f1,f1_weighted
0,0.466856,0.46492,0.466856,0.464957,0.464957
0,0.364726,0.376189,0.424672,0.320761,0.412335
