In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
path = r"C:\Users\Asus\OneDrive\Desktop\Data Analysis Project 2\E-commerce Customer Behavior - Sheet1.csv"

In [4]:
df = pd.read_csv(path)
df = df.set_index("Customer ID")

In [5]:
ml_df = df.copy()

In [6]:
ml_df.head()

Unnamed: 0_level_0,Gender,Age,City,Membership Type,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
101,Female,29,New York,Gold,1120.2,14,4.6,True,25,Satisfied
102,Male,34,Los Angeles,Silver,780.5,11,4.1,False,18,Neutral
103,Female,43,Chicago,Bronze,510.75,9,3.4,True,42,Unsatisfied
104,Male,30,San Francisco,Gold,1480.3,19,4.7,False,12,Satisfied
105,Male,27,Miami,Silver,720.4,13,4.0,True,55,Unsatisfied


In [7]:
print(ml_df.columns)
print(f"Dataset Size: Total {len(ml_df)} customers")

Index(['Gender', 'Age', 'City', 'Membership Type', 'Total Spend',
       'Items Purchased', 'Average Rating', 'Discount Applied',
       'Days Since Last Purchase', 'Satisfaction Level'],
      dtype='object')
Dataset Size: Total 350 customers


In [8]:
target = 'Satisfaction Level'
ml_df[target].value_counts()

Satisfaction Level
Satisfied      125
Unsatisfied    116
Neutral        107
Name: count, dtype: int64

In [9]:
features = [ 
    'Age',
    'Total Spend',
    'Items Purchased',
    'Average Rating',
    'Days Since Last Purchase',
    'Gender',            #Needs encpding  
    'Membership Type',   #Needs encoding
    'Discount Applied'   #Needs encoding
]

ml_df = ml_df[features + [target]].copy()

print(f"Using {len(features)} features to find satisfaction")

Using 8 features to find satisfaction


## Encoding the Nominal and Ordinal Variables

In [14]:
#For Genders and Discount applied we only have two possible outcomes: Male or Females and Yes or No.
#For these type of 2 outcome Nominal Variables we'll use the: Label Encoding

#Firstly we'll for an array of these 3 columns which we'll encode
categorical_cols = ['Gender', 'Membership Type', 'Discount Applied']

for col in categorical_cols:
    print(f"{col}: {ml_df[col].unique()}")

#Label Encoding
# Gender Encoding -> Male/Female = 0/1
ml_df["Encoded Gender"] = (ml_df['Gender'] == 'Male').astype(int)

# Discount Applied Encoding -> True/False = 1/0
ml_df["Encoded Discount"] = (ml_df['Discount Applied']).astype(int)


# One Hot Encoding 
dummies = pd.get_dummies(ml_df['Membership Type'], prefix='Membership').astype(int)
ml_df = pd.concat([dummies, ml_df], axis = 'columns')


Gender: ['Female' 'Male']
Membership Type: ['Gold' 'Silver' 'Bronze']
Discount Applied: [ True False]


In [15]:
ml_df

Unnamed: 0_level_0,Membership_Bronze,Membership_Gold,Membership_Silver,Age,Total Spend,Items Purchased,Average Rating,Days Since Last Purchase,Gender,Membership Type,Discount Applied,Satisfaction Level,Encoded Gender,Encoded Discount
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
101,0,1,0,29,1120.20,14,4.6,25,Female,Gold,True,Satisfied,0,1
102,0,0,1,34,780.50,11,4.1,18,Male,Silver,False,Neutral,1,0
103,1,0,0,43,510.75,9,3.4,42,Female,Bronze,True,Unsatisfied,0,1
104,0,1,0,30,1480.30,19,4.7,12,Male,Gold,False,Satisfied,1,0
105,0,0,1,27,720.40,13,4.0,55,Male,Silver,True,Unsatisfied,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,0,0,1,32,660.30,10,3.8,42,Male,Silver,True,Unsatisfied,1,1
447,1,0,0,36,470.50,8,3.0,27,Female,Bronze,False,Neutral,0,0
448,0,1,0,30,1190.80,16,4.5,28,Female,Gold,True,Satisfied,0,1
449,0,0,1,34,780.20,11,4.2,21,Male,Silver,False,Neutral,1,0
