In [1]:
# Import modules
from pathlib import Path
from warnings import simplefilter

# Silence FutureWarning messages before the third-party libraries are imported
simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the CSV file into a pandas DataFrame
df_walmart = pd.read_csv(filepath_or_buffer="Resources/walmart.csv")

# Print the DataFrame length (number of rows)
print(df_walmart.shape[0])

# Preview the first rows of the DataFrame
df_walmart.head()

550068


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,7969


In [3]:
# Remove the identifier columns (User_ID, Product_ID) from the DataFrame
df_walmart = df_walmart.drop(["User_ID", "Product_ID"], axis="columns")

# Display the first rows of the trimmed DataFrame
df_walmart.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category,Purchase
0,F,0-17,10,A,2,0,3,8370
1,F,0-17,10,A,2,0,1,15200
2,F,0-17,10,A,2,0,12,1422
3,F,0-17,10,A,2,0,12,1057
4,M,55+,16,C,4+,0,8,7969


In [4]:
# Count the non-null entries in each column
df_walmart.count()

Gender                        550068
Age                           550068
Occupation                    550068
City_Category                 550068
Stay_In_Current_City_Years    550068
Marital_Status                550068
Product_Category              550068
Purchase                      550068
dtype: int64

In [5]:
# Count the missing (null) values in each column
df_walmart.isna().sum()

Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category              0
Purchase                      0
dtype: int64

In [6]:
# Inspect the dtype of each column
df_walmart.dtypes

Gender                        object
Age                           object
Occupation                     int64
City_Category                 object
Stay_In_Current_City_Years    object
Marital_Status                 int64
Product_Category               int64
Purchase                       int64
dtype: object

In [7]:
# List the distinct values of each column; every listing is a heading line,
# the array of unique values, and an 80-dash separator.
value_listings = [
    ("Gender values are: ", "Gender"),
    ("Age values are: ", "Age"),
    ("Occupation values are: ", "Occupation"),
    ("City Category values are: ", "City_Category"),
    ("Stay in City Years values are: ", "Stay_In_Current_City_Years"),
    ("Marital Status values are: ", "Marital_Status"),
    ("Product Category values are: ", "Product_Category"),
]

for heading, column in value_listings:
    print(heading)
    print(df_walmart[column].unique())
    print("-"*80)

Gender values are: 
['F' 'M']
--------------------------------------------------------------------------------
Age values are: 
['0-17' '55+' '26-35' '46-50' '51-55' '36-45' '18-25']
--------------------------------------------------------------------------------
Occupation values are: 
[10 16 15  7 20  9  1 12 17  0  3  4 11  8 19  2 18  5 14 13  6]
--------------------------------------------------------------------------------
City Category values are: 
['A' 'C' 'B']
--------------------------------------------------------------------------------
Stay in City Years values are: 
['2' '4+' '3' '1' '0']
--------------------------------------------------------------------------------
Marital Status values are: 
[0 1]
--------------------------------------------------------------------------------
Product Category values are: 
[ 3  1 12  8  5  4  2  6 14 11 13 15  7 16 18 10 17  9 20 19]
--------------------------------------------------------------------------------


In [8]:
# Encode the categorical columns as numbers:
#   * Gender: "F" -> 0, "M" -> 1
#   * Stay_In_Current_City_Years: "4+" becomes 4, then the column is cast to int
#   * Age, City_Category: one-hot encoded, dropping the first level of each
#
# The result is built as a single expression and assigned to df_walmart once.
# Nothing is written back column-by-column, so if any step raises (for example
# when this cell is re-run on a frame that is already encoded and no longer has
# the Age / City_Category columns), df_walmart is left exactly as it was
# instead of ending up with a half-converted, all-NaN Gender column.
df_walmart = pd.get_dummies(
    df_walmart.assign(
        Gender=df_walmart["Gender"].map({"F": 0, "M": 1}),
        Stay_In_Current_City_Years=(
            df_walmart["Stay_In_Current_City_Years"].replace({"4+": 4}).astype(int)
        ),
    ),
    columns=["Age", "City_Category"],
    drop_first=True,
)

In [9]:
# Preview the first rows of the DataFrame after the encoding step
df_walmart.head()

Unnamed: 0,Gender,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category,Purchase,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C
0,0,10,2,0,3,8370,False,False,False,False,False,False,False,False
1,0,10,2,0,1,15200,False,False,False,False,False,False,False,False
2,0,10,2,0,12,1422,False,False,False,False,False,False,False,False
3,0,10,2,0,12,1057,False,False,False,False,False,False,False,False
4,1,16,4,0,8,7969,False,False,False,False,False,True,False,True


In [10]:
# Separate the feature matrix (X) from the prediction target (y):
# every column except Product_Category is a feature.
X = df_walmart.drop("Product_Category", axis="columns")
y = df_walmart.loc[:, "Product_Category"]


In [11]:
# Preview the first five rows of the feature matrix
X.head()

Unnamed: 0,Gender,Occupation,Stay_In_Current_City_Years,Marital_Status,Purchase,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C
0,0,10,2,0,8370,False,False,False,False,False,False,False,False
1,0,10,2,0,15200,False,False,False,False,False,False,False,False
2,0,10,2,0,1422,False,False,False,False,False,False,False,False
3,0,10,2,0,1057,False,False,False,False,False,False,False,False
4,1,16,4,0,7969,False,False,False,False,False,True,False,True


In [12]:
# Preview the first five entries of the target variable
y.head()

0     3
1     1
2    12
3    12
4     8
Name: Product_Category, dtype: int64

In [13]:
# Split the data set: 20% of the rows are held out for testing, the split is
# stratified on y, and the random seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [14]:
# Standardize the features. The scaler is fitted on the training split only
# and the fitted scaler is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Transform the training data first, then the testing data
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [15]:
# Instantiate the KNeighborsClassifier model with n_neighbors = 4
# NOTE(review): the value 4 is hard-coded and no tuning of k is visible here — confirm the choice
knn = KNeighborsClassifier(n_neighbors=4)

In [16]:
# Fit the classifier on the scaled training features and their labels
knn.fit(X=X_train_scaled, y=y_train)

In [17]:
# Predict a product category for every row of the scaled testing data
y_pred = knn.predict(X=X_test_scaled)

In [18]:
# Print the classification report (per-class precision, recall, f1-score and
# support) comparing the true test labels with the model predictions
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.82      0.91      0.86     28076
           2       0.53      0.49      0.51      4773
           3       0.74      0.56      0.63      4043
           4       0.53      0.62      0.57      2351
           5       0.91      0.97      0.94     30187
           6       0.51      0.48      0.50      4093
           7       0.28      0.13      0.18       744
           8       0.78      0.84      0.81     22785
           9       0.00      0.00      0.00        82
          10       0.88      0.38      0.53      1025
          11       0.71      0.51      0.60      4857
          12       0.53      0.45      0.49       789
          13       0.60      0.71      0.65      1110
          14       0.56      0.03      0.06       304
          15       0.47      0.19      0.27      1258
          16       0.29      0.13      0.18      1966
          17       0.33      0.02      0.03       115
          18       0.53    

In [19]:
# Compute and print the overall accuracy of the test-set predictions.
# accuracy_score is imported with the other modules in the first cell, so the
# notebook's dependencies are all declared in one place.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.7884087479775301
