In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


In [3]:
import os

# Build the path from separate components so that os.path.join supplies every
# separator itself; the path is relative to the notebook's working directory.
relative_path = os.path.join("data", "processed", "sales_features.csv")

# Load the processed sales features into a dataframe
retail_df = pd.read_csv(relative_path)

# Preview the first five records
retail_df.head()


Unnamed: 0,Customer_ID,age_range,HolidayName,Month_name,jan_dummy,feb_dummy,mar_dummy,apr_dummy,may_dummy,jun_dummy,...,dummy_40_49,dummy_50_59,dummy_60_plus,gender_dummy,HolidayDummy,Max_Temp,Min_Temp,Mean_Temp,Total_Precip_mm,Product_Category
0,CUST015,40-49,0,1,1,0,0,0,0,0,...,1,0,0,0,0,1.8,-5.1,-1.7,0.1,Electronics
1,CUST065,50-59,0,12,0,0,0,0,0,0,...,0,1,0,1,0,3.1,0.3,1.7,0.1,Electronics
2,CUST072,20-29,0,5,0,0,0,0,1,0,...,0,0,0,0,0,19.5,10.0,14.8,0.0,Electronics
3,CUST074,Under 20,0,11,0,0,0,0,0,0,...,0,0,0,0,0,7.1,3.0,5.1,2.8,Beauty
4,CUST089,50-59,0,10,0,0,0,0,0,0,...,0,1,0,0,0,24.8,14.9,19.9,0.0,Electronics


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
retail_df.describe()

Unnamed: 0,Month_name,jan_dummy,feb_dummy,mar_dummy,apr_dummy,may_dummy,jun_dummy,jul_dummy,aug_dummy,sep_dummy,...,dummy_30_39,dummy_40_49,dummy_50_59,dummy_60_plus,gender_dummy,HolidayDummy,Max_Temp,Min_Temp,Mean_Temp,Total_Precip_mm
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,994.0,994.0,994.0,994.0
mean,6.549,0.078,0.085,0.073,0.086,0.105,0.077,0.072,0.094,0.065,...,0.191,0.222,0.221,0.115,0.49,0.03,14.376157,7.091046,10.735815,2.107143
std,3.452755,0.268306,0.279021,0.260267,0.280504,0.306707,0.266725,0.258617,0.291975,0.246649,...,0.393286,0.415799,0.415128,0.319182,0.50015,0.170673,9.702371,8.3992,8.934392,4.832535
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-9.3,-20.5,-14.9,0.0
25%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.1,0.3,3.2,0.0
50%,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.9,6.6,9.6,0.0
75%,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,24.0,15.0,19.6,1.7
max,12.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,32.9,22.4,26.9,50.1


In [7]:
# Per-column overview: column name, non-null count and dtype
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 33 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Customer_ID       1000 non-null   object 
 1   age_range         1000 non-null   object 
 2   HolidayName       1000 non-null   object 
 3   Month_name        1000 non-null   int64  
 4   jan_dummy         1000 non-null   int64  
 5   feb_dummy         1000 non-null   int64  
 6   mar_dummy         1000 non-null   int64  
 7   apr_dummy         1000 non-null   int64  
 8   may_dummy         1000 non-null   int64  
 9   jun_dummy         1000 non-null   int64  
 10  jul_dummy         1000 non-null   int64  
 11  aug_dummy         1000 non-null   int64  
 12  sep_dummy         1000 non-null   int64  
 13  oct_dummy         1000 non-null   int64  
 14  nov_dummy         1000 non-null   int64  
 15  dec_dummy         1000 non-null   int64  
 16  spend_3m          1000 non-null   int64  
 

In [8]:
# Dimensions of the dataframe as (rows, columns)
retail_df.shape

(1000, 33)

In [10]:
# Define the feature variables.
# Every excluded column is dropped by name: identifiers, the raw categorical /
# month columns, the weather readings, the other spend/quantity aggregates and
# the target itself. Dropping 'Product_Category' by name (rather than slicing
# off "the last column") keeps the target out of the inputs even if the column
# order of the CSV changes.
features = retail_df.drop(
    columns=[
        'Customer_ID', 'age_range', 'HolidayName', 'Month_name',
        'Max_Temp', 'Min_Temp', 'Mean_Temp', 'Total_Precip_mm',
        'spend_3m', 'QTY_3m', 'QTY_12m',
        'Product_Category',  # target variable, not an input
    ]
)
features

Unnamed: 0,jan_dummy,feb_dummy,mar_dummy,apr_dummy,may_dummy,jun_dummy,jul_dummy,aug_dummy,sep_dummy,oct_dummy,...,dec_dummy,spend_12m,dummy_under_20,dummy_20_29,dummy_30_39,dummy_40_49,dummy_50_59,dummy_60_plus,gender_dummy,HolidayDummy
0,1,0,0,0,0,0,0,0,0,0,...,0,2000,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,2000,0,0,0,0,1,0,1,0
2,0,0,0,0,1,0,0,0,0,0,...,0,2000,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,2000,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,2000,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,25,0,0,0,0,1,0,0,0
996,0,0,0,0,0,0,1,0,0,0,...,0,25,0,0,0,0,1,0,1,0
997,0,0,0,1,0,0,0,0,0,0,...,0,25,0,0,0,0,0,1,1,0
998,1,0,0,0,0,0,0,0,0,0,...,0,25,1,0,0,0,0,0,0,0


In [17]:
# Standardize spend_12m (StandardScaler removes the mean and scales to unit
# variance); the dummy columns are left as they are.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed in a later cell, so test-set statistics influence the scaling.
# Consider fitting on the training rows only.
scaler = StandardScaler()
spend_scaled = scaler.fit_transform(features[['spend_12m']])
features = features.assign(spend_12m=spend_scaled.ravel())
features

Unnamed: 0,jan_dummy,feb_dummy,mar_dummy,apr_dummy,may_dummy,jun_dummy,jul_dummy,aug_dummy,sep_dummy,oct_dummy,...,dec_dummy,spend_12m,dummy_under_20,dummy_20_29,dummy_30_39,dummy_40_49,dummy_50_59,dummy_60_plus,gender_dummy,HolidayDummy
0,1,0,0,0,0,0,0,0,0,0,...,0,2.758534,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,2.758534,0,0,0,0,1,0,1,0
2,0,0,0,0,1,0,0,0,0,0,...,0,2.758534,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,2.758534,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,2.758534,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,-0.770031,0,0,0,0,1,0,0,0
996,0,0,0,0,0,0,1,0,0,0,...,0,-0.770031,0,0,0,0,1,0,1,0
997,0,0,0,1,0,0,0,0,0,0,...,0,-0.770031,0,0,0,0,0,1,1,0
998,1,0,0,0,0,0,0,0,0,0,...,0,-0.770031,1,0,0,0,0,0,0,0


In [None]:
# Target variable: the product category, kept as a one-column dataframe
target = retail_df.loc[:, ["Product_Category"]]
target

Unnamed: 0,Product_Category
0,Electronics
1,Electronics
2,Electronics
3,Beauty
4,Electronics
...,...
995,Clothing
996,Clothing
997,Beauty
998,Beauty


In [24]:
from sklearn.model_selection import train_test_split

# Split into train and test sets in a 4:1 ratio (20% of the rows are held out
# for testing). The fixed random_state makes the "random" split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [25]:
# Check the training feature matrix dimensions as (rows, columns)
X_train.shape

(800, 21)

In [26]:
# Inspect the training labels produced by the split
y_train

Unnamed: 0,Product_Category
29,Clothing
535,Electronics
695,Beauty
557,Beauty
836,Beauty
...,...
106,Clothing
270,Electronics
860,Electronics
435,Beauty


In [27]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Convert the target to integer codes (0, 1, 2) and then to one-hot arrays.
#
# The label strings are looked up from `target` via the split's row index
# instead of being read from y_train / y_test: this cell overwrites those two
# names with the one-hot arrays, so reading them back would make a re-run fail.
# np.ravel flattens the labels to 1-D, which is the shape LabelEncoder expects
# (a column vector triggers a DataConversionWarning).
train_labels = np.ravel(target.loc[X_train.index])
test_labels = np.ravel(target.loc[X_test.index])

le = LabelEncoder()
train_codes = le.fit_transform(train_labels)  # learn the class -> integer mapping on train only
test_codes = le.transform(test_labels)        # reuse the same mapping for test

# Show the last training example before and after one-hot encoding
print(f'Before one-hot encoding: {train_codes[-1]}')
y_train = to_categorical(train_codes, num_classes=3)
y_test = to_categorical(test_codes, num_classes=3)
print(f'After one-hot encoding: {y_train[-1]}')




Before one-hot encoding: 0
After one-hot encoding: [1. 0. 0.]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Take the input width from the training data rather than hard-coding it, so
# the network stays in sync with the feature-selection cell.
n_features = X_train.shape[1]

model = Sequential()

# First hidden layer: 64 neurons, ReLU activation. It also declares the input
# shape, so there is no separate input layer.
model.add(Dense(64, activation='relu', input_shape=(n_features,)))

# Second hidden layer: 64 neurons, ReLU activation
model.add(Dense(64, activation='relu'))

# Output layer: 3 neurons (one per one-hot class), softmax activation
model.add(Dense(3, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 64)                1408      
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 dense_5 (Dense)             (None, 3)                 195       
                                                                 
Total params: 5,763
Trainable params: 5,763
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Configure training: SGD optimizer, categorical cross-entropy loss (the labels
# are one-hot encoded), with accuracy tracked as the evaluation metric.
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

In [31]:
# Train for 10 epochs in batches of 32 samples, using 20% of the training
# data for validation; the returned object is kept in `history`.
history = model.fit(x=X_train, y=y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
