In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics

import graphviz

import joblib

In [2]:
df = pd.read_csv('../data/b2c_customers_100.csv')
df[:10]

Unnamed: 0,age,gender,employment_status,occupation,education,household_size,has_children,monthly_income_sgd,preferred_category
0,40,Female,Full-time,Sales,Diploma,1,0,8393.671183,Fashion - Women
1,28,Female,Full-time,Service,Bachelor,2,0,2221.219795,Beauty & Personal Care
2,32,Female,Full-time,Service,Diploma,1,0,3628.458357,Fashion - Women
3,35,Female,Full-time,Admin,Diploma,1,0,2191.616221,Beauty & Personal Care
4,54,Male,Full-time,Tech,Bachelor,2,0,16394.464195,Electronics
5,32,Male,Full-time,Sales,Diploma,1,0,8045.020407,Sports & Outdoors
6,35,Male,Full-time,Service,Bachelor,2,0,8437.360362,Fashion - Men
7,30,Female,Part-time,Service,Diploma,2,0,2587.660625,Beauty & Personal Care
8,20,Male,Student,Education,Secondary,2,0,766.440873,Books
9,49,Male,Full-time,Admin,Bachelor,5,1,5285.469754,Home & Kitchen


In [3]:
# convert gender categorical variable to numerical variable with one-hot encoding
df = pd.get_dummies(df, columns=['gender', 'employment_status', 'occupation', 'education'])
df


Unnamed: 0,age,household_size,has_children,monthly_income_sgd,preferred_category,gender_Female,gender_Male,employment_status_Full-time,employment_status_Part-time,employment_status_Retired,...,occupation_Education,occupation_Sales,occupation_Service,occupation_Skilled Trades,occupation_Tech,education_Bachelor,education_Diploma,education_Doctorate,education_Master,education_Secondary
0,40,1,0,8393.671183,Fashion - Women,True,False,True,False,False,...,False,True,False,False,False,False,True,False,False,False
1,28,2,0,2221.219795,Beauty & Personal Care,True,False,True,False,False,...,False,False,True,False,False,True,False,False,False,False
2,32,1,0,3628.458357,Fashion - Women,True,False,True,False,False,...,False,False,True,False,False,False,True,False,False,False
3,35,1,0,2191.616221,Beauty & Personal Care,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
4,54,2,0,16394.464195,Electronics,False,True,True,False,False,...,False,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,21,2,0,648.536195,Books,True,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
96,19,4,0,921.911241,Books,True,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
97,34,1,0,2128.792525,Beauty & Personal Care,True,False,True,False,False,...,False,False,True,False,False,True,False,False,False,False
98,39,1,0,8497.530212,Fashion - Women,True,False,True,False,False,...,False,False,True,False,False,False,True,False,False,False


In [4]:
X = df.drop('preferred_category', axis=1)
y = df['preferred_category']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, shuffle=True, stratify=y)

In [5]:
clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=5)

clf = clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)

print('Training Accuracy = {}'.format(metrics.accuracy_score(y_train, y_train_pred)))

Training Accuracy = 0.9571428571428572


In [6]:
print('Training Confusion = \n{}'.format(metrics.confusion_matrix(y_train, y_train_pred, labels=clf.classes_)))

Training Confusion = 
[[9 0 0 0 0 0 0 0]
 [0 8 0 0 0 0 0 0]
 [0 0 9 0 0 0 0 0]
 [0 0 0 9 0 0 0 0]
 [1 0 0 0 8 0 0 0]
 [0 0 0 0 0 8 0 0]
 [0 0 0 0 0 0 9 0]
 [0 0 0 2 0 0 0 7]]


In [7]:
y_test_pred = clf.predict(X_test)

print('Testing Accuracy = {}'.format(metrics.accuracy_score(y_test, y_test_pred)))

Testing Accuracy = 0.7


In [8]:
print('Testing Confusion = \n{}'.format(metrics.confusion_matrix(y_test, y_test_pred, labels=clf.classes_)))

Testing Confusion = 
[[3 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0]
 [0 0 4 0 0 0 0 0]
 [0 0 0 3 0 0 0 1]
 [1 0 0 0 3 0 0 0]
 [0 0 0 0 0 2 2 0]
 [0 0 0 0 0 2 2 0]
 [0 0 0 3 0 0 0 0]]


In [9]:
# Generate the tree plot
# dot_data = tree.export_graphviz(clf, out_file=None,
#                                 feature_names=X.columns, class_names=clf.classes_,
#                                 filled=True, rounded=True, special_characters=True)
# graph = graphviz.Source(dot_data)
# graph

In [10]:
# Save the model to a file using joblib
joblib.dump(clf, 'b2c_customers_100.joblib')

['b2c_customers_100.joblib']

In [11]:
# Load the model from the file using joblib
loaded_model = joblib.load('b2c_customers_100.joblib')

In [12]:
# Parameters of the loaded model
for param, value in loaded_model.get_params().items():
    print(f"{param}: {value}")

ccp_alpha: 0.0
class_weight: None
criterion: gini
max_depth: 5
max_features: None
max_leaf_nodes: None
min_impurity_decrease: 0.0
min_samples_leaf: 1
min_samples_split: 2
min_weight_fraction_leaf: 0.0
monotonic_cst: None
random_state: None
splitter: best


In [13]:
# Print feature importances
print("Feature importances:", loaded_model.feature_importances_)

Feature importances: [0.07070659 0.15457617 0.         0.45478124 0.         0.16026827
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.15966773 0.
 0.         0.         0.         0.        ]


In [14]:
# Example raw input data as a dictionary
raw_input = {
    'age': 29,
    'household_size': 2,
    'has_children': 1,
    'monthly_income_sgd': 5000,
    'gender': 'Female',
    'employment_status': 'Full-time',
    'occupation': 'Sales',
    'education': 'Bachelor'
}

# Convert raw input to DataFrame
input_df = pd.DataFrame([raw_input])

# One-hot encode categorical variables to match training data columns
input_encoded = pd.get_dummies(input_df, columns=['gender', 'employment_status', 'occupation', 'education'])

# Ensure all required columns are present, add missing columns as False/0
for col in X.columns:
    if col not in input_encoded.columns:
        # Use False for bool columns, 0 for numeric
        if X[col].dtype == bool:
            input_encoded[col] = False
        else:
            input_encoded[col] = 0

# Reorder columns to match training data
input_encoded = input_encoded[X.columns]

# Now input_encoded can be used for prediction
prediction = loaded_model.predict(input_encoded)
print('Prediction:', prediction)

Prediction: ['Beauty & Personal Care']


In [None]:
# Example raw input data as a dictionary
raw_input = {
    'age': 50,
    'household_size': 2,
    'has_children': 1,
    'monthly_income_sgd': 18000,
    'gender': 'Male',
    'employment_status': 'Full-time',
    'occupation': 'Tech',
    'education': 'Bachelor'
}

# Convert raw input to DataFrame
input_df = pd.DataFrame([raw_input])

# One-hot encode categorical variables to match training data columns
input_encoded = pd.get_dummies(input_df, columns=['gender', 'employment_status', 'occupation', 'education'])

# Ensure all required columns are present, add missing columns as False/0
for col in X.columns:
    if col not in input_encoded.columns:
        # Use False for bool columns, 0 for numeric
        if X[col].dtype == bool:
            input_encoded[col] = False
        else:
            input_encoded[col] = 0

# Reorder columns to match training data
input_encoded = input_encoded[X.columns]

# Now input_encoded can be used for prediction
prediction = loaded_model.predict(input_encoded)
print('Prediction:', prediction)

Prediction: ['Electronics']


In [None]:
import joblib

# Use your actual preprocessor variable name
# Save it to a NEW file
joblib.dump(preprocessor, 'customer_preprocessor.joblib')