**Training and texting**

In [None]:
import pandas as pd
import numpy as np


df = pd.read_csv("https://raw.githubusercontent.com/futurexskill/projects/main/knn-classification/purchase_history.csv")


df.head()

Unnamed: 0,Customer ID,Gender,Age,Salary,Product ID,Price,Purchased
0,1,Female,49,61000,P01,2000,1
1,2,Male,36,30000,P02,2000,1
2,3,Female,26,81000,P02,3500,1
3,4,Female,32,74000,P01,7000,0
4,5,Female,42,56000,P01,5000,0


In [None]:
df.count()

Customer ID    1000
Gender         1000
Age            1000
Salary         1000
Product ID     1000
Price          1000
Purchased      1000
dtype: int64

In [None]:
len(df)

1000

In [None]:
gender_encoded = pd.get_dummies(df['Gender'])

In [None]:
gender_encoded

Unnamed: 0,Female,Male
0,True,False
1,False,True
2,True,False
3,True,False
4,True,False
...,...,...
995,False,True
996,False,True
997,True,False
998,False,True


In [None]:
gender_encoded = pd.get_dummies(df['Gender'],drop_first=True)

In [None]:
gender_encoded

Unnamed: 0,Male
0,False
1,True
2,False
3,False
4,False
...,...
995,True
996,True
997,False
998,True


In [None]:
df = pd.concat([df,gender_encoded],axis=1)

In [None]:
df

Unnamed: 0,Customer ID,Gender,Age,Salary,Product ID,Price,Purchased,Male
0,1,Female,49,61000,P01,2000,1,False
1,2,Male,36,30000,P02,2000,1,True
2,3,Female,26,81000,P02,3500,1,False
3,4,Female,32,74000,P01,7000,0,False
4,5,Female,42,56000,P01,5000,0,False
...,...,...,...,...,...,...,...,...
995,996,Male,21,73000,P02,5000,1,True
996,997,Male,62,54000,P01,5000,0,True
997,998,Female,41,20000,P03,2000,0,False
998,999,Male,22,29000,P03,5000,0,True


In [None]:
x = df[['Male','Age','Salary','Price']].to_numpy() #excluded customer ID and Product ID because no significant impact

In [None]:
x

array([[False, 49, 61000, 2000],
       [True, 36, 30000, 2000],
       [False, 26, 81000, 3500],
       ...,
       [False, 41, 20000, 2000],
       [True, 22, 29000, 5000],
       [False, 49, 36000, 3500]], dtype=object)

In [None]:
y = df['Purchased'].to_numpy()

In [None]:
y

array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train , x_test , y_train , y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [None]:
x_train

array([[True, 26, 28000, 5000],
       [True, 54, 21000, 7000],
       [False, 42, 22000, 5000],
       ...,
       [False, 59, 91000, 2000],
       [False, 31, 82000, 2000],
       [False, 20, 34000, 5000]], dtype=object)

In [None]:
x_test

array([[True, 54, 36000, 5000],
       [False, 54, 60000, 7000],
       [True, 25, 39000, 2000],
       [False, 39, 38000, 7000],
       [True, 28, 37000, 5000],
       [True, 52, 42000, 2000],
       [False, 43, 25000, 7000],
       [False, 40, 22000, 5000],
       [False, 18, 23000, 3500],
       [True, 62, 40000, 5000],
       [True, 53, 66000, 5000],
       [False, 50, 38000, 7000],
       [False, 49, 59000, 7000],
       [False, 56, 97000, 2000],
       [True, 41, 44000, 3500],
       [True, 37, 74000, 2000],
       [False, 51, 23000, 5000],
       [True, 54, 29000, 2000],
       [True, 30, 48000, 3500],
       [True, 56, 87000, 3500],
       [False, 61, 75000, 7000],
       [True, 52, 49000, 7000],
       [False, 38, 100000, 2000],
       [False, 40, 79000, 5000],
       [True, 39, 45000, 2000],
       [False, 61, 79000, 7000],
       [True, 38, 94000, 7000],
       [False, 24, 23000, 7000],
       [True, 59, 62000, 5000],
       [True, 61, 67000, 3500],
       [True, 40, 31000, 

In [None]:
y_train

array([0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,

In [None]:
y_test

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0])

In [None]:
len(x_train)

800

In [None]:
len(x_test)

200

In [None]:
len(y_train)

800

In [None]:
len(y_test)

200

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [None]:
x_train

array([[ 1.08347268, -1.09076887, -1.30749141,  0.30934614],
       [ 1.08347268,  0.97668947, -1.60377078,  1.39955279],
       [-0.92295821,  0.0906359 , -1.56144515,  0.30934614],
       ...,
       [-0.92295821,  1.34587846,  1.35902286, -1.32596384],
       [-0.92295821, -0.72157988,  0.97809225, -1.32596384],
       [-0.92295821, -1.53379565, -1.05353767,  0.30934614]])

In [None]:
x_test

array([[ 0.91370804,  0.84165121, -0.92674628,  0.29135523],
       [-1.0944415 ,  0.84165121,  0.09493083,  1.3460348 ],
       [ 0.91370804, -1.16805526, -0.79903664, -1.29066412],
       [-1.0944415 , -0.19785214, -0.84160652,  1.3460348 ],
       [ 0.91370804, -0.96015459, -0.8841764 ,  0.29135523],
       [ 0.91370804,  0.70305076, -0.671327  , -1.29066412],
       [-1.0944415 ,  0.07934876, -1.39501496,  1.3460348 ],
       [-1.0944415 , -0.12855191, -1.5227246 ,  0.29135523],
       [-1.0944415 , -1.65315683, -1.48015472, -0.49965444],
       [ 0.91370804,  1.396053  , -0.75646676,  0.29135523],
       [ 0.91370804,  0.77235099,  0.35035011,  0.29135523],
       [-1.0944415 ,  0.56445032, -0.84160652,  1.3460348 ],
       [-1.0944415 ,  0.49515009,  0.05236095,  1.3460348 ],
       [-1.0944415 ,  0.98025166,  1.67001638, -1.29066412],
       [ 0.91370804, -0.05925169, -0.58618724, -0.49965444],
       [ 0.91370804, -0.33645258,  0.69090915, -1.29066412],
       [-1.0944415 ,  0.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
k= 5
knn = KNeighborsClassifier(n_neighbors=k)

In [None]:
knn.fit(x_train,y_train)

In [None]:
y_pred = knn.predict(x_test)

In [None]:
y_pred

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0])

In [None]:
y_test

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.765

In [None]:
import pickle

In [None]:
with open('knn_model.pickle','wb') as f:
  pickle.dump(knn,f)

In [None]:
with open('scaler.pickle','wb') as f:
  pickle.dump(scaler,f)

In [None]:
!ls

drive  knn_model.pickle  sample_data  scaler.pickle


 **Predicting**

In [None]:
import pickle
with open('knn_model.pickle','rb') as f:
  knn_new = pickle.load(f)

In [None]:
with open('scaler.pickle','rb') as f:
  scaler_new = pickle.load(f)

In [None]:
import pandas as pd
new_df = pd.read_csv("https://raw.githubusercontent.com/futurexskill/projects/main/knn-classification/new_customers.csv")

In [None]:
new_df

Unnamed: 0,Gender,Age,Salary,Product ID,Price
0,Male,24,38652,3,4446
1,Female,47,64727,1,5188
2,Male,46,54648,1,6193
3,Male,31,20018,3,4779
4,Male,58,77731,2,6855
5,Male,28,55478,1,6153
6,Female,48,54211,2,7588
7,Male,31,60076,3,5238
8,Female,42,29290,1,5418
9,Male,50,21445,1,5598


In [None]:
len(new_df)

10

In [None]:
gender_encoded_new = pd.get_dummies(new_df['Gender'], drop_first=True)

In [None]:
gender_encoded_new

Unnamed: 0,Male
0,True
1,False
2,True
3,True
4,True
5,True
6,False
7,True
8,False
9,True


In [None]:
df_new_2 = pd.concat([new_df,gender_encoded_new],axis=1)

In [None]:
df_new_2

Unnamed: 0,Gender,Age,Salary,Product ID,Price,Male
0,Male,24,38652,3,4446,True
1,Female,47,64727,1,5188,False
2,Male,46,54648,1,6193,True
3,Male,31,20018,3,4779,True
4,Male,58,77731,2,6855,True
5,Male,28,55478,1,6153,True
6,Female,48,54211,2,7588,False
7,Male,31,60076,3,5238,True
8,Female,42,29290,1,5418,False
9,Male,50,21445,1,5598,True


In [None]:
x_new = df_new_2[['Male','Age','Salary','Price']].to_numpy()

In [None]:
x_new

array([[True, 24, 38652, 4446],
       [False, 47, 64727, 5188],
       [True, 46, 54648, 6193],
       [True, 31, 20018, 4779],
       [True, 58, 77731, 6855],
       [True, 28, 55478, 6153],
       [False, 48, 54211, 7588],
       [True, 31, 60076, 5238],
       [False, 42, 29290, 5418],
       [True, 50, 21445, 5598]], dtype=object)

In [None]:
x_new_scale2 = scaler_new.fit_transform(x_new)

In [None]:
x_new_scale2

array([[ 0.65465367, -1.54774434, -0.48875298, -1.42543997],
       [-1.52752523,  0.60971747,  0.93112246, -0.61159228],
       [ 0.65465367,  0.51591478,  0.38228547,  0.49072164],
       [ 0.65465367, -0.89112553, -1.50343981, -1.06019565],
       [ 0.65465367,  1.64154703,  1.63923599,  1.21682294],
       [ 0.65465367, -1.17253359,  0.42748189,  0.44684845],
       [-1.52752523,  0.70352015,  0.35848928,  2.02079916],
       [ 0.65465367, -0.89112553,  0.67785916, -0.55675079],
       [-1.52752523,  0.14070403, -0.99854681, -0.35932143],
       [ 0.65465367,  0.89112553, -1.42573465, -0.16189207]])

In [None]:
y_new_pred = knn_new.predict(x_new_scale2)

In [None]:
y_new_pred

array([0, 1, 1, 0, 0, 1, 0, 0, 0, 0])

In [None]:
df_new_2['will_purchase'] = y_new_pred

In [None]:
df_new_2

Unnamed: 0,Gender,Age,Salary,Product ID,Price,Male,will_purchase
0,Male,24,38652,3,4446,True,0
1,Female,47,64727,1,5188,False,1
2,Male,46,54648,1,6193,True,1
3,Male,31,20018,3,4779,True,0
4,Male,58,77731,2,6855,True,0
5,Male,28,55478,1,6153,True,1
6,Female,48,54211,2,7588,False,0
7,Male,31,60076,3,5238,True,0
8,Female,42,29290,1,5418,False,0
9,Male,50,21445,1,5598,True,0


In [62]:
demand = df_new_2['will_purchase'].value_counts()

In [63]:
print(f"the demand of the selected product is: {demand}")

the demand of the selected product is: will_purchase
0    7
1    3
Name: count, dtype: int64
