# Importing Packages and Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the product table and, in the same step, attach a `bin` column that
# buckets `price` into five quantile-based intervals.
df = pd.read_csv('products.csv').assign(
    bin=lambda frame: pd.qcut(frame['price'], q=5)
)

# Independent copy: later cells add encoded columns to `newdf` only.
newdf = df.copy()
newdf

Unnamed: 0,product_ID,product_type,product_name,size,colour,price,quantity,description,bin
0,0,Shirt,Oxford Cloth,XS,red,114,66,"A red coloured, XS sized, Oxford Cloth Shirt","(112.0, 115.0]"
1,1,Shirt,Oxford Cloth,S,red,114,53,"A red coloured, S sized, Oxford Cloth Shirt","(112.0, 115.0]"
2,2,Shirt,Oxford Cloth,M,red,114,54,"A red coloured, M sized, Oxford Cloth Shirt","(112.0, 115.0]"
3,3,Shirt,Oxford Cloth,L,red,114,69,"A red coloured, L sized, Oxford Cloth Shirt","(112.0, 115.0]"
4,4,Shirt,Oxford Cloth,XL,red,114,47,"A red coloured, XL sized, Oxford Cloth Shirt","(112.0, 115.0]"
...,...,...,...,...,...,...,...,...,...
1255,1255,Trousers,Tracksuit Bottoms,XS,violet,91,67,"A violet coloured, XS sized, Tracksuit Bottoms...","(89.999, 95.0]"
1256,1256,Trousers,Tracksuit Bottoms,S,violet,91,48,"A violet coloured, S sized, Tracksuit Bottoms ...","(89.999, 95.0]"
1257,1257,Trousers,Tracksuit Bottoms,M,violet,91,73,"A violet coloured, M sized, Tracksuit Bottoms ...","(89.999, 95.0]"
1258,1258,Trousers,Tracksuit Bottoms,L,violet,91,45,"A violet coloured, L sized, Tracksuit Bottoms ...","(89.999, 95.0]"


In [2]:
# Summarise the categorical columns: count, number of unique values,
# most frequent value and its frequency.
categorical_cols = ['product_type', 'product_name', 'size', 'colour']
df[categorical_cols].describe()

Unnamed: 0,product_type,product_name,size,colour
count,1260,1260,1260,1260
unique,3,35,5,7
top,Shirt,Denim,XS,red
freq,420,70,252,180


In [5]:
# label-encoding on variables: product type, product_name, size and colour

# Kept so existing references to this name still resolve; label_data below
# fits its own encoder per column and does not touch this instance.
label_encoder = LabelEncoder()

def label_data(df, var, var_new, dtype):
    """Add an integer-coded version of a column to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; gains (or overwrites) the column ``var_new``.
    var : str
        Name of the existing column to encode.
    var_new : str
        Name of the column that receives the integer codes.
    dtype : str or dtype
        Dtype the new column is cast to after encoding.

    Returns
    -------
    None
        The result is written to ``df[var_new]``.
    """
    # Use a fresh encoder on every call so the function is self-contained:
    # it neither requires a module-level encoder to exist nor overwrites the
    # fitted state of one that other code may be relying on.
    encoder = LabelEncoder()
    df[var_new] = encoder.fit_transform(df[var])
    df[var_new] = df[var_new].astype(dtype)


# Encode each categorical column into a sibling "<column>_cat" column.
# NOTE(review): the codes follow alphabetical order of the labels (the mapping
# printed later shows size as L=0, M=1, S=2, XL=3, XS=4), so they do not
# reflect garment-size order; confirm this is acceptable for distance-based
# models.
for col in ('product_type', 'product_name', 'size', 'colour'):
    label_data(newdf, col, f'{col}_cat', 'category')

In [6]:
# Inspect column dtypes and non-null counts after adding the *_cat columns.
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260 entries, 0 to 1259
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   product_ID        1260 non-null   int64   
 1   product_type      1260 non-null   object  
 2   product_name      1260 non-null   object  
 3   size              1260 non-null   object  
 4   colour            1260 non-null   object  
 5   price             1260 non-null   int64   
 6   quantity          1260 non-null   int64   
 7   description       1260 non-null   object  
 8   bin               1260 non-null   category
 9   product_type_cat  1260 non-null   category
 10  product_name_cat  1260 non-null   category
 11  size_cat          1260 non-null   category
 12  colour_cat        1260 non-null   category
dtypes: category(5), int64(3), object(5)
memory usage: 87.3+ KB


In [7]:
# Obtain pairs for labeled variables

def paired_label(df, var1, var2):
    """Collect the distinct (var1 value, var2 value) pairs found row by row.

    Parameters
    ----------
    df : mapping of column name to column values (e.g. a DataFrame)
        Source of the two columns.
    var1 : str
        Name of the first column (the original label in this notebook).
    var2 : str
        Name of the second column (the encoded label in this notebook).

    Returns
    -------
    set of tuple
        Every unique ``(df[var1][i], df[var2][i])`` combination. The set is
        also printed, prefixed with ``var1``.
    """
    originals = df[var1]
    encoded = df[var2]
    # Rows are walked in parallel; the set removes repeated combinations.
    var1_pairset = {pair for pair in zip(originals, encoded)}
    print(f'{var1}\'s pair: {var1_pairset}')
    return var1_pairset

# Build (and print) the original-label -> code mapping for every encoded
# column, in the same order the columns were encoded.
product_type_pair, product_name_pair, size_pair, colour_pair = (
    paired_label(newdf, col, f'{col}_cat')
    for col in ('product_type', 'product_name', 'size', 'colour')
)

product_type's pair: {('Shirt', 1), ('Trousers', 2), ('Jacket', 0)}
product_name's pair: {('Wool', 34), ('Pullover', 27), ('Trench Coat', 32), ('Cuban Collar', 10), ('Flannel', 14), ('High-Waisted', 16), ('Linen', 19), ('Puffer', 26), ('Cropped', 9), ('Oxford Cloth', 21), ('Pleated', 24), ('Polo', 25), ('Dress', 13), ('Cardigan', 2), ('Cords', 8), ('Camp Collared', 1), ('Leather', 18), ('Joggers', 17), ('Tracksuit Bottoms', 31), ('Henley', 15), ('Coach', 7), ('Peacoat', 23), ('Bomber', 0), ('Drawstring', 12), ('Shearling', 29), ('Relaxed Leg', 28), ('Chambray', 5), ('Casual Slim Fit', 4), ('Slim-Fit', 30), ('Denim', 11), ('Windbreaker', 33), ('Chinos', 6), ('Parka', 22), ('Cargo Pants', 3), ('Mandarin Collar', 20)}
size's pair: {('S', 2), ('XL', 3), ('M', 1), ('XS', 4), ('L', 0)}
colour's pair: {('red', 4), ('green', 1), ('indigo', 2), ('violet', 5), ('yellow', 6), ('blue', 0), ('orange', 3)}


In [8]:
# Preview the first rows, now including the encoded *_cat columns.
newdf.head()

Unnamed: 0,product_ID,product_type,product_name,size,colour,price,quantity,description,bin,product_type_cat,product_name_cat,size_cat,colour_cat
0,0,Shirt,Oxford Cloth,XS,red,114,66,"A red coloured, XS sized, Oxford Cloth Shirt","(112.0, 115.0]",1,21,4,4
1,1,Shirt,Oxford Cloth,S,red,114,53,"A red coloured, S sized, Oxford Cloth Shirt","(112.0, 115.0]",1,21,2,4
2,2,Shirt,Oxford Cloth,M,red,114,54,"A red coloured, M sized, Oxford Cloth Shirt","(112.0, 115.0]",1,21,1,4
3,3,Shirt,Oxford Cloth,L,red,114,69,"A red coloured, L sized, Oxford Cloth Shirt","(112.0, 115.0]",1,21,0,4
4,4,Shirt,Oxford Cloth,XL,red,114,47,"A red coloured, XL sized, Oxford Cloth Shirt","(112.0, 115.0]",1,21,3,4


In [9]:
# 80/20 train/test split.
# A fixed seed makes the split (and every score computed from it) identical
# on each Restart & Run All; without it the rows drawn change on every run.
SPLIT_SEED = 42

# Columns that are not model inputs: the target (`bin`), the value it was
# derived from (`price`), identifiers/free text, and the raw string columns
# that already have encoded *_cat counterparts.
non_feature_cols = ["price", "bin", "description", "product_ID", "product_type", "product_name", "size", "colour"]

train, test = train_test_split(newdf, test_size=0.20, random_state=SPLIT_SEED)
x_train, y_train = train.drop(columns=non_feature_cols), train[['bin']]
x_test, y_test = test.drop(columns=non_feature_cols), test[['bin']]

In [10]:
# Show the training features (quantity plus the four encoded columns).
x_train

Unnamed: 0,quantity,product_type_cat,product_name_cat,size_cat,colour_cat
175,77,1,1,4,4
263,68,1,25,0,1
1250,63,2,31,4,2
942,73,2,12,1,5
466,48,0,26,2,6
...,...,...,...,...,...
971,41,2,30,2,2
722,46,0,32,1,0
1132,72,2,3,1,6
381,43,1,15,2,5


In [11]:
# Show the training target: the price bin of each training row.
y_train

Unnamed: 0,bin
175,"(104.0, 112.0]"
263,"(115.0, 119.0]"
1250,"(89.999, 95.0]"
942,"(95.0, 104.0]"
466,"(104.0, 112.0]"
...,...
971,"(115.0, 119.0]"
722,"(104.0, 112.0]"
1132,"(104.0, 112.0]"
381,"(89.999, 95.0]"


#  Naïve Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

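# NOTE(review): Gaussian Naive Bayes experiment, currently disabled.
# The last line refers to `NB`, `scaler` and `X_test`, none of which are
# defined in the cells shown here; define them or drop that line before
# re-enabling.
#gnb = GaussianNB()
#y_pred = gnb.fit(x_train, y_train).predict(x_test)
#print(classification_report(y_test, y_pred))
#print(classification_report(y_test, NB.predict(scaler.transform(X_test))))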

# K-Nearest Neighbors Classifier

In [None]:
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as pl

# Sweep k for a K-Nearest Neighbours classifier and plot test accuracy.

# y_train / y_test are single-column DataFrames whose values are pandas
# Interval objects. scikit-learn classifiers expect a 1-D array of plain
# labels, so flatten the targets and use each interval's string form
# (e.g. "(104.0, 112.0]") as the class label.
y_train_labels = np.ravel(y_train).astype(str)
y_test_labels = np.ravel(y_test).astype(str)

k_values = range(1, 29)
scores = {}  # k -> accuracy on the held-out test set

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train_labels)
    y_pred = knn.predict(x_test)
    # Score once per k; the list used for plotting is derived from this dict.
    scores[k] = metrics.accuracy_score(y_test_labels, y_pred)

# Accuracies in ascending-k order, matching the x axis of the plot.
scores_list = [scores[k] for k in k_values]

pl.plot(k_values, scores_list)
pl.xlabel('Number of neighbours (k)')
pl.ylabel('Test accuracy')
pl.title('KNN test accuracy by k')
pl.show()