In [131]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from desiciontree import DecisionTreeCART

In [132]:
# Read the training split of the customer-segmentation dataset and preview
# its first rows.
df = pd.read_csv(
    '../dataset/Customer_Segmentation_Classification/train.csv'
)
df.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [133]:
# Report how many values are missing in each column before imputing.
na_counts = df.isna().sum()
print(na_counts)

# Replacement for each column that has gaps: a fixed label for the
# categorical columns, the column's own median for the numeric ones.
# The medians are evaluated here, before any column is modified.
fill_values = {
    'Ever_Married': 'No',
    'Graduated': 'No',
    'Profession': 'Unknown',
    'Work_Experience': df['Work_Experience'].median(),
    'Family_Size': df['Family_Size'].median(),
    'Var_1': 'Unknown',
}
for column, replacement in fill_values.items():
    df[column] = df[column].fillna(replacement)

# One-hot encode the categorical columns; all other columns are kept as-is.
categorical_columns = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
df_processed = pd.get_dummies(df, columns=categorical_columns)
df_processed.head()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64


Unnamed: 0,ID,Age,Work_Experience,Family_Size,Segmentation,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Graduated_No,...,Spending_Score_High,Spending_Score_Low,Var_1_Cat_1,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7,Var_1_Unknown
0,462809,22,1.0,4.0,D,False,True,True,False,True,...,False,True,False,False,False,True,False,False,False,False
1,462643,38,1.0,3.0,A,True,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
2,466315,67,1.0,1.0,B,True,False,False,True,False,...,False,True,False,False,False,False,False,True,False,False
3,461735,67,0.0,2.0,B,False,True,False,True,False,...,True,False,False,False,False,False,False,True,False,False
4,462669,40,1.0,6.0,A,True,False,False,True,False,...,True,False,False,False,False,False,False,True,False,False


In [134]:
# Feature matrix: every processed column except the target and the row ID.
feature_frame = df_processed.drop(columns=['Segmentation', 'ID'])
X = feature_frame.values
# Target vector: the 'Segmentation' column.
y = df_processed['Segmentation'].values

# Fit the project's own CART implementation, constructed with max_depth=6.
dtree = DecisionTreeCART(max_depth=6)
dtree.fit(X, y)

# NOTE(review): the result of this first call is discarded; it looks redundant
# with the printed call below — confirm importance() has no side effects
# before removing it.
dtree.importance()
print(dtree.importance())

[(0, 2.03125), (9, 0.75), (21, 0.5), (14, 0.4375), (2, 0.40625), (1, 0.28125), (19, 0.25), (5, 0.1875), (7, 0.1875), (18, 0.125), (25, 0.125), (17, 0.125), (23, 0.0625), (3, 0.0625), (13, 0.03125), (27, 0.03125), (16, 0.03125), (12, 0.03125)]


In [135]:
# Fit scikit-learn's decision tree on the same X / y with max_depth=6 and
# print its per-feature importances.
# random_state is pinned because the estimator permutes the features at each
# split, so ties between equally good splits could otherwise be broken
# differently from one run to the next.
dtree_sklearn = tree.DecisionTreeClassifier(max_depth=6, random_state=42)
dtree_sklearn.fit(X, y)
dtree_sklearn_importance = dtree_sklearn.feature_importances_
print(dtree_sklearn_importance)

[0.50048086 0.01354092 0.03048689 0.00111213 0.00305147 0.00139521
 0.00272662 0.00390545 0.02478962 0.16008159 0.         0.
 0.00814659 0.00162525 0.04979739 0.         0.00277662 0.0171234
 0.00301626 0.01281663 0.0014048  0.13942783 0.         0.00354196
 0.         0.01121673 0.         0.00671631 0.         0.00081947]


In [136]:
# Fit a 100-tree random forest (max_depth=6) on the same X / y and print its
# per-feature importances.
# random_state is pinned so the bootstrap samples and per-split feature
# subsets — and therefore the importances and any accuracy computed from this
# model — are reproducible under Restart & Run All.
rf_sklearn = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
rf_sklearn.fit(X, y)
rf_sklearn_importance = rf_sklearn.feature_importances_
print(rf_sklearn_importance)

[0.21031529 0.01230935 0.04681693 0.0058114  0.0058036  0.05962264
 0.07877567 0.056868   0.06037671 0.10723232 0.00307566 0.00781275
 0.01094319 0.00253784 0.13299666 0.00226401 0.00346682 0.01156827
 0.00081433 0.03830494 0.00883917 0.1012028  0.00121186 0.00273949
 0.00313983 0.01205643 0.00093981 0.00950334 0.0013858  0.00126508]


In [137]:
# Load the held-out split and report how many values are missing per column.
test_df = pd.read_csv('../dataset/Customer_Segmentation_Classification/test.csv')
test_na_counts = test_df.isna().sum()
print(test_na_counts)

# Categorical gaps get the same fixed labels that were used for the training frame.
test_df['Ever_Married'] = test_df['Ever_Married'].fillna('No')
test_df['Graduated'] = test_df['Graduated'].fillna('No')
test_df['Profession'] = test_df['Profession'].fillna('Unknown')
test_df['Var_1'] = test_df['Var_1'].fillna('Unknown')

# Numeric gaps are filled with the medians of the TRAINING frame `df`, not of
# the test set: the models were fitted on data imputed with training
# statistics, so the test set must be transformed the same way.
# (`df` has already been imputed with its own median, which leaves the median
# unchanged, so this is the training median either way.)
test_df['Work_Experience'] = test_df['Work_Experience'].fillna(df['Work_Experience'].median())
test_df['Family_Size'] = test_df['Family_Size'].fillna(df['Family_Size'].median())

test_df_processed = pd.get_dummies(test_df, columns=['Gender', 'Ever_Married', 'Graduated', 'Profession','Spending_Score', 'Var_1'])

# Force the test frame onto the exact column set and order of the training
# frame. The models consume positional arrays (.values), so a category that is
# missing from (or only present in) the test split would otherwise silently
# shift every following feature. Dummy columns absent from the test split are
# added as all-False; columns unseen during training are dropped.
test_df_processed = test_df_processed.reindex(columns=df_processed.columns, fill_value=False)
test_df_processed.head()

ID                   0
Gender               0
Ever_Married        50
Age                  0
Graduated           24
Profession          38
Work_Experience    269
Spending_Score       0
Family_Size        113
Var_1               32
Segmentation         0
dtype: int64


Unnamed: 0,ID,Age,Work_Experience,Family_Size,Segmentation,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Graduated_No,...,Spending_Score_High,Spending_Score_Low,Var_1_Cat_1,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7,Var_1_Unknown
0,458989,36,0.0,1.0,B,True,False,False,True,False,...,False,True,False,False,False,False,False,True,False,False
1,458994,37,8.0,4.0,A,False,True,False,True,False,...,False,False,False,False,False,False,False,True,False,False
2,458996,69,0.0,1.0,A,True,False,False,True,True,...,False,True,False,False,False,False,False,True,False,False
3,459000,59,11.0,2.0,B,False,True,False,True,True,...,True,False,False,False,False,False,False,True,False,False
4,459001,19,1.0,4.0,A,True,False,True,False,True,...,False,True,False,False,False,False,False,True,False,False


In [138]:
# Test feature matrix: drop the row ID and the target, as was done for training.
test_features = test_df_processed.drop(columns=['ID', 'Segmentation'])
X_test = test_features.values

# NOTE: despite its name, y_test holds the custom tree's predictions; the
# ground-truth labels are read from test_df_processed['Segmentation'].
y_test = dtree.predict(X_test)
true_segments = test_df_processed['Segmentation'].values
acc = dtree.accuracy(true_segments, y_test)
print(f'Test Accuracy: {acc * 100:.2f}%')

Test Accuracy: 34.07%


In [139]:
# Score scikit-learn's tree on the test split: accuracy is the fraction of
# predictions that equal the true 'Segmentation' label.
y_test_dtree_sklearn = dtree_sklearn.predict(X_test)
dtree_sklearn_hits = y_test_dtree_sklearn == test_df_processed['Segmentation'].values
acc_sklearn = np.mean(dtree_sklearn_hits)
print(f'Sklearn Test Accuracy: {acc_sklearn * 100:.2f}%')

Sklearn Test Accuracy: 34.11%


In [141]:
# Score the random forest on the test split: accuracy is the fraction of
# predictions that equal the true 'Segmentation' label.
y_test_rf_sklearn = rf_sklearn.predict(X_test)
rf_sklearn_hits = y_test_rf_sklearn == test_df_processed['Segmentation'].values
acc_rf_sklearn = np.mean(rf_sklearn_hits)
print(f'Random Forest Sklearn Test Accuracy: {acc_rf_sklearn * 100:.2f}%')

Random Forest Sklearn Test Accuracy: 33.73%
