# Clothes Size Classification

## Preprocessing

In [14]:
import pandas as pd
import numpy as np

In [15]:
data = pd.read_csv('../data/raw_data.csv')
data

Unnamed: 0,weight,age,height,size
0,62,28.0,172.72,XL
1,59,36.0,167.64,L
2,61,34.0,165.10,M
3,65,27.0,175.26,L
4,62,45.0,172.72,M
...,...,...,...,...
119729,63,42.0,175.26,M
119730,45,29.0,154.94,S
119731,61,31.0,172.72,M
119732,74,31.0,167.64,XL


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119734 entries, 0 to 119733
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   weight  119734 non-null  int64  
 1   age     119477 non-null  float64
 2   height  119404 non-null  float64
 3   size    119734 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 3.7+ MB


In [17]:
# 1. NaN processing
data.isnull().sum()   # age: 257, height: 330

# Drop NaN
data.dropna(inplace=True)
data

Unnamed: 0,weight,age,height,size
0,62,28.0,172.72,XL
1,59,36.0,167.64,L
2,61,34.0,165.10,M
3,65,27.0,175.26,L
4,62,45.0,172.72,M
...,...,...,...,...
119729,63,42.0,175.26,M
119730,45,29.0,154.94,S
119731,61,31.0,172.72,M
119732,74,31.0,167.64,XL


In [18]:
# 2. age dtype: float -> int
data = data.astype({'age':'int'})
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119153 entries, 0 to 119733
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   weight  119153 non-null  int64  
 1   age     119153 non-null  int32  
 2   height  119153 non-null  float64
 3   size    119153 non-null  object 
dtypes: float64(1), int32(1), int64(1), object(1)
memory usage: 4.1+ MB


In [19]:
# 3. height: rounded to one decimal place
data['height'] = round(data['height'], 1)
data

Unnamed: 0,weight,age,height,size
0,62,28,172.7,XL
1,59,36,167.6,L
2,61,34,165.1,M
3,65,27,175.3,L
4,62,45,172.7,M
...,...,...,...,...
119729,63,42,175.3,M
119730,45,29,154.9,S
119731,61,31,172.7,M
119732,74,31,167.6,XL


### Change the Age Range

In [20]:
data.describe()        # Age ranges from 0 to 117 -> unreliable

Unnamed: 0,weight,age,height
count,119153.0,119153.0,119153.0
mean,61.756095,34.032714,165.808586
std,9.942877,8.148302,6.737398
min,22.0,0.0,137.2
25%,55.0,29.0,160.0
50%,61.0,32.0,165.1
75%,67.0,37.0,170.2
max,136.0,117.0,193.0


In [21]:
# Age range
data.age.value_counts().tail(20)

3      8
71     8
4      8
72     7
5      4
76     4
73     4
14     3
77     2
87     2
85     1
81     1
100    1
113    1
88     1
92     1
99     1
9      1
1      1
112    1
Name: age, dtype: int64

In [22]:
# Data is narrowed down with age between 20~60
data1 = data[(data.age >= 20) & (data.age <= 60)]
data1

Unnamed: 0,weight,age,height,size
0,62,28,172.7,XL
1,59,36,167.6,L
2,61,34,165.1,M
3,65,27,175.3,L
4,62,45,172.7,M
...,...,...,...,...
119729,63,42,175.3,M
119730,45,29,154.9,S
119731,61,31,172.7,M
119732,74,31,167.6,XL


In [23]:
# Export to csv
data1.to_csv('../data/data1.csv')
print('Export completed-------------------------------------')

Export completed-------------------------------------


### Remove Outliers

In [24]:
data1.describe()

Unnamed: 0,weight,age,height
count,117375.0,117375.0,117375.0
mean,61.772124,33.877504,165.814159
std,9.939778,7.454046,6.741833
min,22.0,20.0,137.2
25%,55.0,29.0,160.0
50%,61.0,32.0,165.1
75%,67.0,37.0,170.2
max,136.0,60.0,193.0


In [25]:
# Calculate IQR
q1 = data1.quantile(0.25)    # Return a series
q3 = data1.quantile(0.75)
IQR = q3 - q1
outlier_con = (data1 < q1 - 1.5 * IQR) | (data1 > q3 + 1.5 * IQR)
outlier_con = outlier_con.any(axis=1)   # When there's at least one True, it returns True
outlier_df = data1[outlier_con]    # Outliers
outlier_df

  outlier_con = (data1 < q1 - 1.5 * IQR) | (data1 > q3 + 1.5 * IQR)
  outlier_con = (data1 < q1 - 1.5 * IQR) | (data1 > q3 + 1.5 * IQR)


Unnamed: 0,weight,age,height,size
22,55,50,165.1,S
33,86,48,172.7,XXXL
40,58,52,162.6,XXXL
73,113,27,167.6,XXXL
74,88,30,162.6,XXXL
...,...,...,...,...
119685,87,29,165.1,XXXL
119699,63,50,160.0,XXXL
119704,70,55,165.1,XXXL
119710,77,51,165.1,XXXL


In [26]:
data2 = data1.drop(outlier_df.index, axis=0)
data2     # Data without outliers

Unnamed: 0,weight,age,height,size
0,62,28,172.7,XL
1,59,36,167.6,L
2,61,34,165.1,M
3,65,27,175.3,L
4,62,45,172.7,M
...,...,...,...,...
119729,63,42,175.3,M
119730,45,29,154.9,S
119731,61,31,172.7,M
119732,74,31,167.6,XL


In [27]:
# Export to excel
data2.to_csv('../data/data2.csv')
print('Export completed------------------------------------')

Export completed------------------------------------


### Remove Outliers by Size

In [28]:
# Define function to remove outlier
def remove_outliers(df):
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    IQR = q3 - q1
    condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
    condition = condition.any(axis=1)   # True: outliers
    outlier_index = df[condition].index
    result = df.drop(outlier_index, axis=0)
    return result

if __name__ == '__main__':      # Test the function
    remove_outliers(data2)

  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)


In [29]:
# Groupby & apply the function above
df = data2.groupby(['size']).apply(lambda x: remove_outliers(x))
data3 = df.set_index(df.index.get_level_values(1))      # Remove multi-index
data3

  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)
  condition = (df < q1 - 1.5 * IQR) | (df > q3 + 1.5 * IQR)


Unnamed: 0,weight,age,height,size
1,59,36,167.6,L
3,65,27,175.3,L
12,64,26,165.1,L
14,65,33,165.1,L
15,63,30,167.6,L
...,...,...,...,...
119692,76,30,167.6,XXXL
119707,72,44,162.6,XXXL
119720,58,35,167.6,XXXL
119721,62,40,160.0,XXXL


In [30]:
# Export to csv
data3.to_csv('../data/data3.csv')
print('Export completed-------------------------------------------')

Export completed-------------------------------------------


In [31]:
data3.describe()

Unnamed: 0,weight,age,height
count,104928.0,104928.0,104928.0
mean,60.843388,32.721523,165.716827
std,7.995893,5.913624,6.55987
min,41.0,20.0,144.8
25%,55.0,29.0,160.0
50%,61.0,32.0,165.1
75%,65.0,36.0,170.2
max,85.0,49.0,185.4


## Explore Data

In [32]:
# Check what sizes there are
data1['size'].unique()     # XXS ~ XXXL withoug XS

array(['XL', 'L', 'M', 'S', 'XXS', 'XXXL', 'XXL'], dtype=object)

In [33]:
# Check data balance
data1['size'].value_counts()   # Very small number in XXL

M       29191
S       21515
XXXL    20809
XL      18785
L       17249
XXS      9760
XXL        66
Name: size, dtype: int64

## Train/Test Split

In [34]:
from sklearn.model_selection import train_test_split

In [37]:
# Import data
data = pd.read_csv('../data/data3.csv', index_col=0)
data.tail()

Unnamed: 0,weight,age,height,size
119692,76,30,167.6,XXXL
119707,72,44,162.6,XXXL
119720,58,35,167.6,XXXL
119721,62,40,160.0,XXXL
119727,68,28,170.2,XXXL


In [38]:
X = data[['age', 'height', 'weight']]
y = data['size']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)
X_train.shape, y_test.shape

((73449, 3), (31479,))

## X data standardization

In [40]:
from sklearn.preprocessing import StandardScaler

In [41]:
# X_train: fit & transform
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_train_sc

array([[-0.80196453,  1.06632893,  0.89425032],
       [ 0.54752799, -0.09407571, -0.60708781],
       [-0.63327797, -0.09407571,  0.51891579],
       ...,
       [-0.80196453, -0.47578776, -0.35686479],
       [ 1.39096081,  1.84502152,  0.51891579],
       [ 1.05358768, -1.65146089, -0.85731083]])

In [42]:
# X_test: transform
X_test_sc = scaler.transform(X_test)
X_test_sc

array([[-0.4645914 ,  1.46330947,  0.51891579],
       [ 2.57176676,  2.22673357,  0.89425032],
       [ 0.21015486, -1.65146089,  0.01846974],
       ...,
       [-0.80196453, -0.47578776, -1.73309141],
       [-0.97065109,  1.46330947,  0.89425032],
       [-0.29590484, -0.47578776,  0.01846974]])

## Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
logi = LogisticRegression(solver='saga')    # multiclass solver needed
logi.fit(X_train_sc, y_train)

LogisticRegression(solver='saga')

In [45]:
# Accuracy
logi.score(X_test_sc, y_test)

0.5141522920041933

In [46]:
# Classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, logi.predict(X_test_sc), zero_division=0))   # Due to the warning, zero_division is set

              precision    recall  f1-score   support

           L       0.37      0.14      0.20      4809
           M       0.48      0.67      0.56      8262
           S       0.50      0.50      0.50      5787
          XL       0.43      0.47      0.45      5115
         XXL       0.00      0.00      0.00        10
         XXS       0.61      0.46      0.52      2722
        XXXL       0.73      0.73      0.73      4774

    accuracy                           0.51     31479
   macro avg       0.44      0.42      0.42     31479
weighted avg       0.51      0.51      0.50     31479



> **Accuracy: 50.4%, f1 score(weighted): 0.49**

The warning appears without zero_division, as there are some labels that appear in `y_test` but not in `y_pred`. The label can be found through processes as below, and it is `XXL`. References: https://stackoverflow.com/questions/43162506/undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi

In [47]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, logi.predict(X_test_sc))

array([[ 670, 2274,  157, 1513,    0,    2,  193],
       [ 540, 5515, 1466,  639,    0,   81,   21],
       [  21, 2176, 2881,    1,    0,  708,    0],
       [ 495, 1113,   55, 2388,    0,    0, 1064],
       [   0,    2,    0,    4,    0,    0,    4],
       [   0,  220, 1255,    0,    0, 1247,    0],
       [ 103,  139,    0, 1048,    0,    0, 3484]], dtype=int64)

In [48]:
logi.classes_    # The order of labels above

array(['L', 'M', 'S', 'XL', 'XXL', 'XXS', 'XXXL'], dtype=object)

In [49]:
# y_pred
pd.DataFrame(logi.predict(X_test_sc)).value_counts()

M       11439
S        5814
XL       5593
XXXL     4766
XXS      2038
L        1829
dtype: int64

In [50]:
set(y_test) - set(logi.predict(X_test_sc))

{'XXL'}

## Support Vector Machine

In [51]:
from sklearn.svm import SVC    # Support Vector Classification

* C: the bigger, the harder margin
* gamma: the bigger, the more overfitted

In [52]:
svm = SVC()
svm.fit(X_train_sc, y_train)   # Takes 3m

SVC()

In [53]:
# Explore trributes
svm.classes_

array(['L', 'M', 'S', 'XL', 'XXL', 'XXS', 'XXXL'], dtype=object)

In [54]:
svm.fit_status_    # Correctly fitted: 0

0

In [55]:
len(svm.support_)    # Indices of support vectors

66423

In [56]:
svm.support_vectors_

array([[-0.80196453,  1.06632893,  0.89425032],
       [-0.63327797,  0.68461688,  0.51891579],
       [ 0.04146829,  0.68461688,  0.01846974],
       ...,
       [ 0.71621455,  1.06632893,  1.39469636],
       [ 0.21015486, -0.47578776, -0.4819763 ],
       [ 1.05358768, -0.09407571,  1.14447334]])

In [57]:
svm.shape_fit_    # (dimensions of training vector X, features)

(73449, 3)

In [58]:
# Evaluation
svm.score(X_test_sc, y_test)

0.5141522920041933

### Grid Search for Hyperparameters

In [59]:
from sklearn.model_selection import GridSearchCV

#### Grid Search Model 1

In [60]:
svm = SVC()
params = {'kernel':['poly', 'rbf'], 'C':[0.1, 1], 'gamma':[0.5, 1]} 
clf = GridSearchCV(svm, params, cv=5)

In [61]:
clf.fit(X_train_sc, y_train)

KeyboardInterrupt: 

In [None]:
clf.cv_results_    # rbf showed the explicitly better result

In [None]:
clf.best_score_

In [None]:
clf.best_params_

In [None]:
clf.best_estimator_

In [None]:
# Evaluation
clf.score(X_test_sc, y_test)

#### Grid Search Model 2

In [None]:
svm = SVC()
params = {'kernel':['rbf'], 'C':[0.01, 0.05], 'gamma':[1, 5]} 
clf = GridSearchCV(svm, params, cv=5)

In [None]:
clf.fit(X_train_sc, y_train)

In [None]:
clf.cv_results_

In [None]:
clf.best_score_

In [None]:
clf.best_params_

In [None]:
clf.best_estimator_

In [None]:
# Evaluation
clf.score(X_test_sc, y_test)

> **Best parameter among above: `kernel` = rbf, `C` = 0.05, `gamma` = 5** <br>
> **Accuracy: 50.6%**

## K-nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Grid search for the number of neighbors
knn = KNeighborsClassifier()
params = {'n_neighbors': range(5, 52, 2)}
clf = GridSearchCV(knn, params, cv=5, n_jobs=-1)
clf.fit(X_train_sc, y_train)    # Scaling isn't need for KNN

In [None]:
# Fitting result
clf.cv_results_

> Best parameter=41

In [None]:
# Evaluation with best parameter setting
knn = KNeighborsClassifier(n_neighbors=41)
knn.fit(X_train_sc, y_train)
knn.score(X_test_sc, y_test)

In [None]:
print(classification_report(y_test, knn.predict(X_test_sc), zero_division=0))

> **Accuracy: 49.8%, f1-score(weighted): 0.49**

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [None]:
dt = DecisionTreeClassifier(max_depth=5, min_samples_split=10, min_samples_leaf=5)
dt.fit(X_train, y_train)    # doesn't need to standardize
dt.score(X_test, y_test)

In [None]:
# Plot tree
plt.figure(figsize=(50, 60))
plot_tree(dt, feature_names=['weight', 'age', 'height'], class_names=True)

In [None]:
# Evaluation
print(classification_report(y_test, dt.predict(X_test), zero_division=0))

> **Accuracy: 50%, f1-score(weighted): 0.49**

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
nb = GaussianNB()
nb.fit(X_train_sc, y_train)
nb.score(X_test_sc, y_test)

In [None]:
print(classification_report(y_test, nb.predict(X_test_sc)))

> **Accuracy: 47.9%, f1-score(weighted): 0.47**

## Ensemble Model 1)  Voting

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# All classification models used above
logi = LogisticRegression(solver='saga')
svm = SVC(kernel='rbf', C=0.05, gamma=5, probability=True)
knn = KNeighborsClassifier(n_neighbors=41)
dt = DecisionTreeClassifier(max_depth=5, min_samples_split=10, min_samples_leaf=5)
nb = GaussianNB()

vot = VotingClassifier(estimators=[('logi', logi), ('svm', svm), ('knn', knn), ('dt', dt), ('nb', nb)], voting='soft')

In [None]:
vot.fit(X_train_sc, y_train)
vot.score(X_test_sc, y_test)

## Ensemble Model 2) Bagging - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(max_depth=5, min_samples_split=10, min_samples_leaf=5, n_jobs=-1)
rf.fit(X_train, y_train)    # Decision tree doesn't need scaling
rf.score(X_test, y_test)

In [None]:
# Attributes of random forest classifier
print('Estimators:', rf.estimators_)
print('Base estimator:', rf.base_estimator_)
print('Features:', rf.n_features_)
print('Outputs:', rf.n_outputs_)
print('Feature importances:', rf.feature_importances_)  # The higher, the more important the feature. 

In [None]:
# Grid search
rf = RandomForestClassifier()
params = {'max_depth':[5, 7, 9, 11], 'min_samples_split':[10, 20, 30, 50], 'min_samples_leaf':[5, 10, 15]}
clf = GridSearchCV(rf, params, cv=5, n_jobs=-1)

In [None]:
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
clf.best_params_

In [None]:
print(classification_report(y_test, clf.predict(X_test), zero_division=0))

> **Accuracy: 50.5%, f1-score(weighted): 0.49**

## Ensemble Model 3) Boosting - XGboost

In [None]:
# Install Xgboost library
!pip install C:\Users\SK\anaconda3\Lib\site-packages\xgboost-1.4.2-cp38-cp38-win_amd64.whl

In [None]:
from xgboost import XGBClassifier

In [None]:
# Prepare validation set from train set
X_train_xgb, X_val_xgb, y_train_xgb, y_val_xgb = train_test_split(X_train, y_train, test_size=0.3, random_state=5)
X_train_xgb.shape, y_val_xgb.shape    # Standardization isn't needed

In [None]:
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, n_jobs=16, max_depth=10, objective='multi:softmax')
xgb.fit(X_train_xgb, y_train_xgb, eval_set=[(X_val_xgb, y_val_xgb)], eval_metric='mlogloss', early_stopping_rounds=100)

In [None]:
# Predict with test data
y_pred = xgb.predict(X_test)
xgb.score(X_test, y_test)

In [None]:
# Evaluation
import warnings
warnings.filterwarnings('ignore')   # To stop getting warning messages

print(classification_report(y_test, y_pred))

### Grid Search 1

In [None]:
xgb = XGBClassifier(n_jobs=16, objective='multi:softmax')
params = {'n_estimators':[500], 'learning_rate':[0.001, 0.01], 'max_depth':[5, 15]}

clf = GridSearchCV(xgb, params, cv=5, n_jobs=-1, scoring='accuracy')
clf.fit(X_train_xgb, y_train_xgb, eval_set=[(X_val_xgb, y_val_xgb)], eval_metric='mlogloss', early_stopping_rounds=100)

In [None]:
print(clf.best_params_)
print(clf.best_estimator_)

In [None]:
# Evaluation
clf.score(X_test, y_test)

### Grid Search 2

In [None]:
# Grid search 2
xgb = XGBClassifier(n_jobs=16, objective='multi:softmax')
params = {'n_estimators':[600], 'learning_rate':[0.001, 0.01], 'max_depth':[5, 10]}

clf = GridSearchCV(xgb, params, cv=5, n_jobs=-1, scoring='accuracy')
clf.fit(X_train_xgb, y_train_xgb, eval_set=[(X_val_xgb, y_val_xgb)], eval_metric='mlogloss', early_stopping_rounds=100)

In [None]:
clf.best_params_

In [None]:
clf.score(X_test, y_test)

## Linear Discriminant Analysis (LDA)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
lda = LDA(n_components=2)
lda.fit(X_train_sc, y_train)
lda.score(X_test_sc, y_test)

In [None]:
# For Tableau visualization
X_test_comp = lda.transform(X_test_sc)
X_test_comp    # LDA component1, component2

In [None]:
lda_df = pd.DataFrame(X_test_comp, columns=['Component 1', 'Component 2'])
lda_df['Prediction'] = lda.predict(X_test_sc)   # Add a column of target
lda_df

In [None]:
# Export to csv for Tableau
lda_df.to_csv('../data/data4_age_included.csv')
print('Export completed-------------------------------')

### Logistic Regression after LDA

In [None]:
# Transform X by LDA
X_train_comp = lda.transform(X_train_sc)

# Logistic Regression
logi = LogisticRegression()
logi.fit(X_train_comp, y_train)
logi.score(X_test_comp, y_test)   # Not much improved

## Quadratic Discriminant Analysis (QDA)

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

In [None]:
qda = QDA()
qda.fit(X_train_sc, y_train)
qda.score(X_test_sc,y_test)    # Not a big difference with LDA

## Conclusion

**1. Logistic Regression**: Accuracy: 50.4%, f1 score(weighted): 0.49<br>
**2. Support Vector Machine**: Accuracy: 50.6%<br>
**3. K-Nearest Neighbors**: Accuracy: 49.8%, f1-score(weighted): 0.49<br>
**4. Decision Tree**: Accuracy: 50%, f1-score(weighted): 0.49<br>
**5. Naive Bayes**: Accuracy: 47.9%, f1-score(weighted): 0.47<br>
**6. Ensemble Model 1) Voting**: Accuracy: 50.5%<br>
**7. Ensemble Model 2) Bagging - Random Forest**: Accuracy: 50.5%, f1-score(weighted): 0.49<br>
**8. Ensemble Model 3) Boosting - XGboost**: Accuracy: 50.6%<br>
**9. Linear Discriminant Analysis**: Accuracy: 49.9%<br>
**10. Quadratic Discriminant Analysis**: Accuracy: 50.4%<br>