# CLOTHES SIZE PREDICTION USING KNN
* The Dataset used is [Clothes-Size-Prediction](https://www.kaggle.com/datasets/tourist55/clothessizeprediction), uploaded by Sarvesh Dubey on Kaggle.

In [2]:
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset location can be confirmed before loading it.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

In [3]:
# IMPORTING LIBRARIES
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [4]:
# DATASET LOADING
# Read the clothes-size CSV into a DataFrame and show its (rows, columns) shape.
dataset_path = '../input/clothessizeprediction/final_test.csv'
df = pd.read_csv(dataset_path)
df.shape

In [5]:
# Count the missing values in each column before any cleaning.
df.isna().sum()

In [6]:
# Drop every row that has at least one missing value (df is modified in
# place), then show the new shape to see how many rows remain.
df.dropna(how='any', inplace=True)
df.shape

In [7]:
# Re-run the per-column missing-value count to verify the drop above.
df.isna().sum()

In [10]:
# Seaborn pair plot of df, with points coloured by the 'size' column.
sns.pairplot(data=df, hue='size', height=6)
plt.show()


In [9]:
# Matplotlib renamed its bundled 'seaborn' style sheet to 'seaborn-v0_8' in 3.6
# and removed the old name in 3.8 (where it raises OSError), so use whichever
# name this installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
# Bar chart of how many rows fall into each clothing size.
sns.countplot(x=df['size'])
plt.show()

In [11]:
# Preview the first rows of df.
df.head()

In [12]:
# Number of rows for each value of the 'size' column.
df['size'].value_counts()

In [13]:
# Outlier Removal Using Z-score
# For each size label, standardise age/height/weight within that size group.
# `sizes[i]` is the label whose z-scores are stored in `dfs[i]`.
# NOTE(review): these z-scores are computed per size group, so they depend on
# the label; they are suitable for flagging outliers but should not be used
# as model features.
feature_cols = ['age', 'height', 'weight']
dfs, sizes = [], []
for size_type in df['size'].unique():
    print('size type:', size_type)
    sizes.append(size_type)
    group = df.loc[df['size'] == size_type, feature_cols]
    group_mean, group_std = group.mean(), group.std()
    dfs.append((group - group_mean) / group_std)

In [14]:
# Removing Outliers
# The per-size z-scores depend on the label, so they cannot be computed for a
# new, unlabeled person and must not become the model's features. They are
# used here only as a mask: the original age/height/weight values are taken
# from `df`, and any value whose z-score is not strictly inside (-3, 3) is
# replaced by NaN (left for a later dropna to remove the row).
feature_cols = ['age', 'height', 'weight']
for i, size_type in enumerate(sizes):
    print(size_type)
    # dfs[i] shares its index with the rows of df that belong to this size, so
    # the index alone is enough to recover the raw measurements. Recomputing
    # the z-scores from those raw values keeps this cell safe to re-run.
    raw_values = df.loc[dfs[i].index, feature_cols]
    zscores = (raw_values - raw_values.mean()) / raw_values.std()
    dfs[i] = raw_values.where(zscores.abs() < 3)

In [15]:
# Add a 'size' column to every per-size frame. dfs and sizes were filled in
# lockstep, so the frame and label at the same position belong together.
for frame, label in zip(dfs, sizes):
    frame['size'] = label

In [16]:
# Stack the per-size frames back into one DataFrame.
new_df = pd.concat(dfs)

In [17]:
# Preview the combined frame.
new_df.head()

In [18]:
# Sanity check: list any 'age' values still below -3.
# Only the lower bound is inspected here; values above 3 are not checked.
new_df['age'][new_df['age']<-3]

In [19]:
# Same lower-bound check for 'height'.
new_df['height'][new_df['height']<-3]

In [20]:
# Same lower-bound check for 'weight'.
new_df['weight'][new_df['weight']<-3]

In [21]:
# Matplotlib renamed its bundled 'seaborn' style sheet to 'seaborn-v0_8' in 3.6
# and removed the old name in 3.8 (where it raises OSError), so use whichever
# name this installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
# Bar chart of rows per size in the combined, outlier-processed frame.
sns.countplot(x=new_df['size'])
plt.show()

In [23]:
# Seaborn pair plot of new_df, with points coloured by the 'size' column.
sns.pairplot(data=new_df, hue='size', height=6)
plt.show()

In [24]:
# Number of rows for each size in new_df.
new_df['size'].value_counts()

In [25]:
# Keep every size except 'XXL'. .copy() yields an independent frame, so the
# in-place edits made to new_df2 later do not touch new_df.
new_df2 = new_df[new_df['size'] != 'XXL'].copy()  # Removing XXL size due to its low row count

In [26]:
# Drop (in place) every row that contains a NaN, which includes the values
# blanked out by the outlier-removal step.
new_df2.dropna(how='any', inplace=True)

In [27]:
# Matplotlib renamed its bundled 'seaborn' style sheet to 'seaborn-v0_8' in 3.6
# and removed the old name in 3.8 (where it raises OSError), so use whichever
# name this installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
# Bar chart of rows per size after removing 'XXL' and the NaN rows.
sns.countplot(x=new_df2['size'])
plt.show()

In [29]:
# Seaborn pair plot of new_df2, with points coloured by the 'size' column.
sns.pairplot(data=new_df2, hue='size', height=6)
plt.show()

In [30]:
# Distinct size labels that remain in new_df2.
new_df2['size'].unique()

In [31]:
# Integer class id for each size label. The numbers are arbitrary identifiers
# for the classifier, not an ordering of the sizes.
size_code = {
    'XL': 0,
    'L': 1,
    'M': 2,
    'S': 3,
    'XXS': 4,
    'XXXL': 5,
}
# Assign the encoded column back to the frame instead of calling an inplace
# method on the `new_df2['size']` selection: that chained form warns in
# pandas 2.2 and no longer updates new_df2 once Copy-on-Write is the default.
# Labels missing from size_code are left unchanged, matching what
# Series.replace did, so re-running this cell is harmless.
new_df2['size'] = new_df2['size'].map(lambda label: size_code.get(label, label))

In [32]:
# Number of rows for each encoded size value.
new_df2['size'].value_counts()

In [33]:
# TRAIN TEST SPLIT
# Features x: every column except 'size'. Target y: the 'size' column.
x, y = new_df2.drop('size', axis=1), new_df2['size']

In [34]:
# Shapes of the feature frame and the target series.
x.shape,y.shape

In [35]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible between runs;
# stratify=y keeps the size proportions the same in the train and test parts.
x_train, x_test, y_train,y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y
)

In [36]:
# Shapes of the training and test feature sets.
x_train.shape, x_test.shape

In [37]:
# Shapes of the training and test targets.
y_train.shape, y_test.shape

In [38]:
# Class distribution of the training targets.
sns.countplot(x=y_train)
plt.show()

In [39]:
# Class distribution of the test targets, to compare against the training plot.
sns.countplot(x=y_test)
plt.show()

In [40]:
# KNN MODEL
# k-nearest-neighbours classifier: 7 neighbours, Manhattan distance, and
# neighbour votes weighted by distance. Fitted on the training split.
knn_settings = {
    'n_neighbors': 7,
    'metric': 'manhattan',
    'weights': 'distance',
}
KNN_model2 = KNeighborsClassifier(**knn_settings)
KNN_model2.fit(x_train, y_train)

In [41]:
# Mean accuracy on the held-out test split.
KNN_model2.score(x_test, y_test)

In [42]:
# Mean accuracy on the training split, for comparison with the test score.
# NOTE(review): the model uses weights='distance', so a training row is
# presumably its own zero-distance neighbour and this figure is likely close
# to 1.0 regardless of generalisation — confirm before reporting it.
KNN_model2.score(x_train, y_train)

In [43]:
# Predicted size codes for the test rows.
y_pred = KNN_model2.predict(x_test)

In [44]:
# Per-class classification report for the test split.
print(classification_report(y_test, y_pred))

In [45]:
# Predicted size codes for the training rows.
y_pred_train = KNN_model2.predict(x_train)

In [46]:
# Per-class classification report for the training split.
print(classification_report(y_train, y_pred_train))

In [47]:
# Saving it as File
# Persist the fitted classifier with joblib under a fixed file name in the
# current working directory.
from joblib import dump
model_path = 'Cloth-size-predictor'
dump(KNN_model2, model_path)

In [48]:
!ls