In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
path = r"C:\Users\Asus\OneDrive\Desktop\Data Analysis Project 2\E-commerce Customer Behavior - Sheet1.csv"

In [3]:
df = pd.read_csv(path)
df = df.set_index("Customer ID")

In [4]:
ml_df = df.copy()

In [5]:
ml_df.head()

Unnamed: 0_level_0,Gender,Age,City,Membership Type,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
101,Female,29,New York,Gold,1120.2,14,4.6,True,25,Satisfied
102,Male,34,Los Angeles,Silver,780.5,11,4.1,False,18,Neutral
103,Female,43,Chicago,Bronze,510.75,9,3.4,True,42,Unsatisfied
104,Male,30,San Francisco,Gold,1480.3,19,4.7,False,12,Satisfied
105,Male,27,Miami,Silver,720.4,13,4.0,True,55,Unsatisfied


In [6]:
print(ml_df.columns)
print(f"Dataset Size: Total {len(ml_df)} customers")

Index(['Gender', 'Age', 'City', 'Membership Type', 'Total Spend',
       'Items Purchased', 'Average Rating', 'Discount Applied',
       'Days Since Last Purchase', 'Satisfaction Level'],
      dtype='object')
Dataset Size: Total 350 customers


In [7]:
target = 'Satisfaction Level'
ml_df[target].value_counts()

Satisfaction Level
Satisfied      125
Unsatisfied    116
Neutral        107
Name: count, dtype: int64

In [8]:
features = [ 
    'Age',
    'Total Spend',
    'Items Purchased',
    'Average Rating',
    'Days Since Last Purchase',
    'Gender',            #Needs encpding  
    'Membership Type',   #Needs encoding
    'Discount Applied'   #Needs encoding
]

ml_df = ml_df[features + [target]].copy()

print(f"Using {len(features)} features to find satisfaction")

Using 8 features to find satisfaction


## Encoding the Nominal and Ordinal Variables

In [9]:
#For Genders and Discount applied we only have two possible outcomes: Male or Females and Yes or No.
#For these type of 2 outcome Nominal Variables we'll use the: Label Encoding

#Firstly we'll for an array of these 3 columns which we'll encode
categorical_cols = ['Gender', 'Membership Type', 'Discount Applied']

for col in categorical_cols:
    print(f"{col}: {ml_df[col].unique()}")

#Label Encoding
# Gender Encoding -> Male/Female = 0/1
ml_df["Encoded Gender"] = (ml_df['Gender'] == 'Male').astype(int)

# Discount Applied Encoding -> True/False = 1/0
ml_df["Encoded Discount"] = (ml_df['Discount Applied']).astype(int)


# One Hot Encoding 
dummies = pd.get_dummies(ml_df['Membership Type'], prefix='Membership').astype(int)
ml_df = pd.concat([dummies, ml_df], axis = 'columns')


Gender: ['Female' 'Male']
Membership Type: ['Gold' 'Silver' 'Bronze']
Discount Applied: [ True False]


In [10]:
ml_df

Unnamed: 0_level_0,Membership_Bronze,Membership_Gold,Membership_Silver,Age,Total Spend,Items Purchased,Average Rating,Days Since Last Purchase,Gender,Membership Type,Discount Applied,Satisfaction Level,Encoded Gender,Encoded Discount
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
101,0,1,0,29,1120.20,14,4.6,25,Female,Gold,True,Satisfied,0,1
102,0,0,1,34,780.50,11,4.1,18,Male,Silver,False,Neutral,1,0
103,1,0,0,43,510.75,9,3.4,42,Female,Bronze,True,Unsatisfied,0,1
104,0,1,0,30,1480.30,19,4.7,12,Male,Gold,False,Satisfied,1,0
105,0,0,1,27,720.40,13,4.0,55,Male,Silver,True,Unsatisfied,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,0,0,1,32,660.30,10,3.8,42,Male,Silver,True,Unsatisfied,1,1
447,1,0,0,36,470.50,8,3.0,27,Female,Bronze,False,Neutral,0,0
448,0,1,0,30,1190.80,16,4.5,28,Female,Gold,True,Satisfied,0,1
449,0,0,1,34,780.20,11,4.2,21,Male,Silver,False,Neutral,1,0


In [11]:
ml_df.dropna(inplace = True)

In [12]:
#Target Encoding for Ordinal Variable: Satisfaction Level
#Satisfied: 2, Neutral: 1, Unsatisfied: 0

#We'll be doing mapping of each code for each variable, 2 being the highest and 0 being the lowest
ml_map = {'Satisfied': 2, 'Neutral': 1, 'Unsatisfied': 0}
ml_df['Encoded Satisfaction Level'] = ml_df[target].map(ml_map)

print("\nSatisfaction encoding:")
print(ml_df[[target, 'Encoded Satisfaction Level']].drop_duplicates())


Satisfaction encoding:
            Satisfaction Level  Encoded Satisfaction Level
Customer ID                                               
101                  Satisfied                           2
102                    Neutral                           1
103                Unsatisfied                           0


In [13]:
ml_df = ml_df.drop(['Gender', 'Membership Type', 'Discount Applied', target], axis=1)

In [14]:
print(f'Final Dataset shape: {ml_df.shape}')
print(f'Fetaures: {len(ml_df.columns) - 1}')
print(f'Columns: {ml_df.columns}')

Final Dataset shape: (348, 11)
Fetaures: 10
Columns: Index(['Membership_Bronze', 'Membership_Gold', 'Membership_Silver', 'Age',
       'Total Spend', 'Items Purchased', 'Average Rating',
       'Days Since Last Purchase', 'Encoded Gender', 'Encoded Discount',
       'Encoded Satisfaction Level'],
      dtype='object')


In [15]:
#Verifying if all the columns are encoded or not
ml_df.dtypes

Membership_Bronze               int64
Membership_Gold                 int64
Membership_Silver               int64
Age                             int64
Total Spend                   float64
Items Purchased                 int64
Average Rating                float64
Days Since Last Purchase        int64
Encoded Gender                  int64
Encoded Discount                int64
Encoded Satisfaction Level      int64
dtype: object

## Train/Test Split

In [16]:
#Now we'll split our dataset into training data and test data to train and test the model

#Splitting Target variable Y and features X
X = ml_df.drop('Encoded Satisfaction Level', axis = 1)
Y = ml_df['Encoded Satisfaction Level']

In [17]:
print(f'Features: {X.shape}')
print(f'Target: {Y.shape}')

Features: (348, 10)
Target: (348,)


In [29]:
#Splitting the dataset with 80% Training data and 20% testing data
X_train, X_val, y_train, y_val = train_test_split(
    X, Y, 
    test_size = 0.3,      # 20% for testing
    random_state = 42,    # For reproducibility
    stratify = Y          # Keep same satisfaction distribution in both sets
)

In [30]:
print(f"\nTraining set: {X_train.shape[0]} customers")
print(f"Test set: {X_test.shape[0]} customers")

print("\nTrain set satisfaction distribution:")
print(Y_train.value_counts(normalize=True) * 100)

print("\nTest set satisfaction distribution:")
print(Y_test.value_counts(normalize=True) * 100)


Training set: 243 customers
Test set: 105 customers

Train set satisfaction distribution:
Encoded Satisfaction Level
2    35.802469
0    33.333333
1    30.864198
Name: proportion, dtype: float64

Test set satisfaction distribution:
Encoded Satisfaction Level
2    36.190476
0    33.333333
1    30.476190
Name: proportion, dtype: float64


In [49]:
print(X_train.shape, X_val.shape)
print(y_train.shape, y_val.shape)

(243, 10) (105, 10)
(243,) (105,)


In [57]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score

def get_mae(max_leaves, X_train, X_val, y_train, y_val):
    tree = DecisionTreeRegressor (max_leaf_nodes = max_leaves, random_state = 0)
    tree.fit(X_train, y_train)
    prediction = tree.predict(X_val)
    return accuracy_score(y_val, prediction) * 100

In [64]:
for max_leaves in [10, 50, 100, 500, 5000]:
    acc = get_mae(max_leaves, X_train, X_val, y_train, y_val)
    print("Max leaves: %d \t\t Accuracy Score: %.2f%%" %(max_leaves, acc))

Max leaves: 10 		 Accuracy Score: 99.05%
Max leaves: 50 		 Accuracy Score: 99.05%
Max leaves: 100 		 Accuracy Score: 99.05%
Max leaves: 500 		 Accuracy Score: 99.05%
Max leaves: 5000 		 Accuracy Score: 99.05%
