# Import required Libraries

In [39]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Dataset: Data Acquisition
Source of the dataset - https://www.kaggle.com/datasets/prasad22/retail-transactions-dataset

In [40]:
# Read the retail transactions CSV (path is relative to the working directory)
# into a DataFrame and display it.
dataset_path = 'Retail_Transactions_Dataset.csv'
df = pd.read_csv(dataset_path)
df

Unnamed: 0,Transaction_ID,Date,Customer_Name,Product,Total_Items,Total_Cost,Payment_Method,City,Store_Type,Discount_Applied,Customer_Category,Season,Promotion
0,1000000000,2022-01-21 06:27:29,Stacey Price,"['Ketchup', 'Shaving Cream', 'Light Bulbs']",3,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,
1,1000000001,2023-03-01 13:01:21,Michelle Carlson,"['Ice Cream', 'Milk', 'Olive Oil', 'Bread', 'P...",2,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One)
2,1000000002,2024-03-21 15:37:04,Lisa Graves,['Spinach'],6,41.49,Credit Card,Houston,Department Store,True,Professional,Winter,
3,1000000003,2020-10-31 09:59:47,Mrs. Patricia May,"['Tissues', 'Mustard']",1,39.34,Mobile Payment,Chicago,Pharmacy,True,Homemaker,Spring,
4,1000000004,2020-12-10 00:59:59,Susan Mitchell,['Dish Soap'],10,16.42,Debit Card,Houston,Specialty Store,False,Young Adult,Winter,Discount on Selected Items
...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,1000999995,2023-03-27 06:12:10,Lisa Gonzalez,"['Pickles', 'Carrots', 'Peanut Butter', 'Spong...",1,22.07,Debit Card,Los Angeles,Supermarket,False,Middle-Aged,Winter,
999996,1000999996,2022-05-19 05:13:58,Emily Graham,['Cereal'],8,80.25,Cash,Houston,Warehouse Club,True,Senior Citizen,Spring,Discount on Selected Items
999997,1000999997,2021-09-03 13:59:39,Cynthia Anderson,['Trash Bags'],3,60.74,Credit Card,Los Angeles,Convenience Store,False,Homemaker,Winter,
999998,1000999998,2023-10-17 05:50:40,Michael Rodriguez,"['Diapers', 'Coffee', 'Coffee', 'Mop']",3,23.48,Debit Card,San Francisco,Supermarket,True,Retiree,Winter,BOGO (Buy One Get One)


In [41]:
# Convert the Date column to datetime values, then preview the first rows.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
df['Date'].head()

0   2022-01-21 06:27:29
1   2023-03-01 13:01:21
2   2024-03-21 15:37:04
3   2020-10-31 09:59:47
4   2020-12-10 00:59:59
Name: Date, dtype: datetime64[ns]

In [42]:
# info() summarizes the dataset: it reports the number of non-null values per column,
# the column dtypes and the memory usage.
# It is good practice to start with this overview.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   Transaction_ID     1000000 non-null  int64         
 1   Date               1000000 non-null  datetime64[ns]
 2   Customer_Name      1000000 non-null  object        
 3   Product            1000000 non-null  object        
 4   Total_Items        1000000 non-null  int64         
 5   Total_Cost         1000000 non-null  float64       
 6   Payment_Method     1000000 non-null  object        
 7   City               1000000 non-null  object        
 8   Store_Type         1000000 non-null  object        
 9   Discount_Applied   1000000 non-null  bool          
 10  Customer_Category  1000000 non-null  object        
 11  Season             1000000 non-null  object        
 12  Promotion          1000000 non-null  object        
dtypes: bool(1), datetime64[ns](1

In [43]:
# Cast Total_Cost from float to integer and print the resulting dtype.
# NOTE(review): astype(int) drops the fractional part instead of rounding
# (the first row's 71.65 is displayed as 71 after this step), so the cents are
# lost -- confirm this is intended.
df['Total_Cost'] = df['Total_Cost'].astype(int)
print(df['Total_Cost'].dtype)

int64


In [44]:
# Show the column summary again to check the dtypes after casting Total_Cost.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   Transaction_ID     1000000 non-null  int64         
 1   Date               1000000 non-null  datetime64[ns]
 2   Customer_Name      1000000 non-null  object        
 3   Product            1000000 non-null  object        
 4   Total_Items        1000000 non-null  int64         
 5   Total_Cost         1000000 non-null  int64         
 6   Payment_Method     1000000 non-null  object        
 7   City               1000000 non-null  object        
 8   Store_Type         1000000 non-null  object        
 9   Discount_Applied   1000000 non-null  bool          
 10  Customer_Category  1000000 non-null  object        
 11  Season             1000000 non-null  object        
 12  Promotion          1000000 non-null  object        
dtypes: bool(1), datetime64[ns](1

In [45]:
# Count the missing (null) values in each column.
# NOTE(review): Promotion reports 0 nulls here, yet a 'Promotion_None' dummy column
# appears after one-hot encoding, so "no promotion" is apparently stored as the
# literal string 'None'. Some pandas versions may parse that string as NaN when
# reading the CSV -- confirm against the pandas version in use.
df.isnull().sum()

Transaction_ID       0
Date                 0
Customer_Name        0
Product              0
Total_Items          0
Total_Cost           0
Payment_Method       0
City                 0
Store_Type           0
Discount_Applied     0
Customer_Category    0
Season               0
Promotion            0
dtype: int64

In [46]:
# Dimensions of the DataFrame as (number of rows, number of columns).
df.shape

(1000000, 13)

In [47]:
# describe() gives summary statistics for the numeric columns of the DataFrame:
#   count - the number of non-empty values
#   mean  - the average value
#   std   - the standard deviation
#   min   - the minimum value
#   25%   - the 25th percentile
#   50%   - the 50th percentile (median)
#   75%   - the 75th percentile
#   max   - the maximum value
df.describe()

Unnamed: 0,Transaction_ID,Total_Items,Total_Cost
count,1000000.0,1000000.0,1000000.0
mean,1000500000.0,5.495941,51.960122
std,288675.3,2.871654,27.415688
min,1000000000.0,1.0,5.0
25%,1000250000.0,3.0,28.0
50%,1000500000.0,5.0,52.0
75%,1000750000.0,8.0,76.0
max,1001000000.0,10.0,100.0


# Feature Selection

In [48]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of every encoded column, so each category is represented by (levels - 1) dummies.
categorical_columns = [
    'Payment_Method',
    'City',
    'Store_Type',
    'Customer_Category',
    'Season',
    'Promotion',
    'Discount_Applied',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df.head(10)

Unnamed: 0,Transaction_ID,Date,Customer_Name,Product,Total_Items,Total_Cost,Payment_Method_Credit Card,Payment_Method_Debit Card,Payment_Method_Mobile Payment,City_Boston,...,Customer_Category_Senior Citizen,Customer_Category_Student,Customer_Category_Teenager,Customer_Category_Young Adult,Season_Spring,Season_Summer,Season_Winter,Promotion_Discount on Selected Items,Promotion_None,Discount_Applied_True
0,1000000000,2022-01-21 06:27:29,Stacey Price,"['Ketchup', 'Shaving Cream', 'Light Bulbs']",3,71,0,0,1,0,...,0,0,0,0,0,0,1,0,1,1
1,1000000001,2023-03-01 13:01:21,Michelle Carlson,"['Ice Cream', 'Milk', 'Olive Oil', 'Bread', 'P...",2,25,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1000000002,2024-03-21 15:37:04,Lisa Graves,['Spinach'],6,41,1,0,0,0,...,0,0,0,0,0,0,1,0,1,1
3,1000000003,2020-10-31 09:59:47,Mrs. Patricia May,"['Tissues', 'Mustard']",1,39,0,0,1,0,...,0,0,0,0,1,0,0,0,1,1
4,1000000004,2020-12-10 00:59:59,Susan Mitchell,['Dish Soap'],10,16,0,1,0,0,...,0,0,0,1,0,0,1,1,0,0
5,1000000005,2021-10-07 12:37:26,Joshua Frazier,"['Toothpaste', 'Chicken']",3,72,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
6,1000000006,2023-01-08 10:40:03,Victoria Garrett,"['Honey', 'BBQ Sauce', 'Soda', 'Olive Oil', 'G...",4,5,0,0,0,1,...,0,1,0,0,0,1,0,1,0,0
7,1000000007,2020-09-03 12:39:59,Sydney Waller,"['Syrup', 'Trash Cans', 'Pancake Mix', 'Water'...",5,21,0,1,0,0,...,0,0,0,1,0,0,1,1,0,0
8,1000000008,2021-04-05 06:32:18,Kimberly Morgan,['Insect Repellent'],4,55,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
9,1000000009,2021-07-08 10:08:59,Lori Conway,"['Soap', 'Baby Wipes', 'Soda']",7,31,0,0,1,1,...,0,0,0,1,0,0,1,0,1,1


In [50]:
# Step 2: Data Preprocessing
# Features are the two numeric transaction columns; the target is the
# "Young Adult" dummy column produced by the one-hot encoding.
feature_columns = ['Total_Items', 'Total_Cost']
target_column = 'Customer_Category_Young Adult'

X = df[feature_columns]
y = df[target_column]  # Target

In [51]:
# Split the data into training and test sets: 20% of the rows are held out for
# testing, and random_state=42 fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified although the positive class is a
# small minority (24872 of the 200000 test rows); consider stratify=y -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
# Standardize the features (KNN performs better with normalized data).
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [53]:
# Initialize the KNN model with k=4
# NOTE(review): the target is binary (classes 0 and 1), so an even k allows a
# 2-2 vote among the neighbors; an odd k would avoid such ties -- confirm
# whether k=4 is deliberate.
knn_model = KNeighborsClassifier(n_neighbors=4)

In [54]:
# Train the model on the standardized training features and their labels
knn_model.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=4)

In [55]:
# Make predictions for the standardized test features
y_pred = knn_model.predict(X_test_scaled)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [56]:
# Evaluate the model: compare the test labels with the predictions using
# overall accuracy, the confusion matrix and the per-class report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Each heading and its value go on their own line, in the same order as before.
print(f"Accuracy: {accuracy}")
print(
    "Confusion Matrix:",
    conf_matrix,
    "Classification Report:",
    class_report,
    sep="\n",
)

Accuracy: 0.872665
Confusion Matrix:
[[174429    699]
 [ 24768    104]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93    175128
           1       0.13      0.00      0.01     24872

    accuracy                           0.87    200000
   macro avg       0.50      0.50      0.47    200000
weighted avg       0.78      0.87      0.82    200000



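# Considering the confusion matrix above:

scikit-learn lays the matrix out as [[TN, FP], [FN, TP]]: rows are the actual class, columns are the predicted class.

True negative = 174429

False positive = 699

False negative = 24768

True positive = 104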