In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sn
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [2]:
# Read md2.csv into a DataFrame, using its 'ID' column as the row index.
df = pd.read_csv(filepath_or_buffer='md2.csv', index_col='ID')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0_level_0,Age,Age_Group,Education,High_Education,Marital_Status,Relationship,Income,Income_Group,Kidhome,Teenhome,...,NumStorePurchases,Total_NumPurchases,NumWebVisitsMonth,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Response,Complain
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1173,75,70-80,Master,1,Together,1,77632.0,75000 - 79000,0,0,...,6,25,2,0,0,0,1,1,1,0
6036,63,60-70,Master,1,Together,1,89120.0,87000 - 91000,0,0,...,7,18,1,0,0,0,0,1,0,0
10140,39,30-40,PhD,1,Together,1,70123.0,67000 - 71000,0,0,...,4,21,3,0,1,0,1,0,0,0
5386,69,60-70,Graduation,0,Together,1,94384.0,91000 - 95000,0,0,...,5,43,2,1,0,0,1,1,1,0
5831,55,50-60,Graduation,0,Married,1,77870.0,75000 - 79000,0,1,...,5,22,8,1,0,0,1,0,1,0


In [3]:
# Columns kept for modelling: six predictors plus the 'AcceptedCmp2' target.
selected_columns = ['Age', 'Education', 'Marital_Status','Income', 'Kidhome', 'Teenhome', 'AcceptedCmp2']
df_c2 = df[selected_columns]
# Preview the first five rows of the reduced frame.
df_c2.head(n=5)

Unnamed: 0_level_0,Age,Education,Marital_Status,Income,Kidhome,Teenhome,AcceptedCmp2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1173,75,Master,Together,77632.0,0,0,0
6036,63,Master,Together,89120.0,0,0,0
10140,39,PhD,Together,70123.0,0,0,1
5386,69,Graduation,Together,94384.0,0,0,0
5831,55,Graduation,Married,77870.0,0,1,0


In [4]:
# Target vector: the 'AcceptedCmp2' column.
y = df_c2['AcceptedCmp2']
# Feature matrix: every other column, with the string-valued columns
# (Education, Marital_Status) expanded into one indicator column per category.
predictors = df_c2.drop(columns='AcceptedCmp2')
X = pd.get_dummies(predictors)

In [5]:
# Split into train and test partitions (default split size) with a fixed seed.
# stratify=y keeps the share of each AcceptedCmp2 class equal in both
# partitions, so the evaluation is not skewed by an unlucky draw; this matters
# most if one class is rare (class balance is not shown here -- TODO confirm).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [6]:
# Fit the unpruned baseline decision tree on the training split.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits can be broken
# differently on each run, so the tree (and every score derived from it) would
# not be reproducible under Restart & Run All.
model = tree.DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [7]:
# Column labels of the one-hot-encoded feature matrix X. They are reused by
# later cells to label the tree plots and the feature-importance table.
feature_names = X.columns
# Display the labels (a pandas Index).
feature_names

Index(['Age', 'Income', 'Kidhome', 'Teenhome', 'Education_2n Cycle',
       'Education_Basic', 'Education_Graduation', 'Education_Master',
       'Education_PhD', 'Marital_Status_Absurd', 'Marital_Status_Alone',
       'Marital_Status_Divorced', 'Marital_Status_Married',
       'Marital_Status_Single', 'Marital_Status_Together',
       'Marital_Status_Widow', 'Marital_Status_YOLO'],
      dtype='object')

In [None]:
# Draw the fitted baseline tree on a large canvas so the node text is legible.
fig = plt.figure(figsize=(25,20))
# feature_names is a pandas Index; some scikit-learn releases validate this
# argument strictly and reject anything that is not a plain list, so convert it.
# class_names are listed in ascending order of the class labels (0, then 1).
_ = tree.plot_tree(model,
                   feature_names=list(feature_names),
                   class_names=['0','1'],
                   filled=True,
                   fontsize=12)

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.


In [None]:
# Confusion matrix of the baseline tree's predictions on the test split.
ConfusionMatrixDisplay.from_estimator(estimator=model, X=X_test, y=y_test)

In [None]:
# Test-split accuracy of the baseline tree, expressed as a percentage.
baseline_accuracy_pct = model.score(X_test, y_test) * 100.0
print("Model Accuracy: %.3f%%" % (baseline_accuracy_pct,))

In [None]:
# Compute the cost-complexity pruning path of the baseline tree on the
# training split; path.ccp_alphas holds the candidate pruning strengths.
path = model.cost_complexity_pruning_path(X_train, y_train)

# Drop the last (largest) alpha: per the scikit-learn pruning example it
# corresponds to the trivial tree that has been pruned down to a single node.
ccp_alphas = path.ccp_alphas[:-1]
# Floating-point round-off can leave the smallest alpha marginally below zero,
# which DecisionTreeClassifier rejects; clamp such values to 0.
ccp_alphas = np.clip(ccp_alphas, 0.0, None)

# Fit one tree per candidate alpha.
# - Built with a comprehension so the baseline `model` is NOT overwritten
#   (the original loop rebound `model`, so re-running an earlier evaluation
#   cell silently scored the last pruned tree instead of the baseline).
# - random_state is fixed so the fitted trees are reproducible.
models = [
    tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha).fit(X_train, y_train)
    for ccp_alpha in ccp_alphas
]

In [None]:
# Accuracy of every pruned tree on the training and on the test split.
train_scores = [fitted_tree.score(X_train, y_train) for fitted_tree in models]
test_scores = [fitted_tree.score(X_test, y_test) for fitted_tree in models]

# Plot both accuracy curves against alpha as step functions.
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
for curve_label, curve_scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, curve_scores, marker='o', label=curve_label, drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
# Cross-validate one tree per candidate alpha (5 folds, training split only)
# and record the mean and standard deviation of the fold accuracies.
# NOTE: the misspelled names `alpah_loop_values`, `alpah_results` and the
# 'alpah' column are kept as-is because later cells read them.
alpah_loop_values = []

for ccp_alpha in ccp_alphas:
    # A dedicated name is used so the baseline `model` fitted earlier is not
    # overwritten by this throwaway, unfitted estimator.
    cv_tree = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(cv_tree, X_train, y_train, cv=5)
    alpah_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

alpah_results = pd.DataFrame(alpah_loop_values, columns=['alpah', 'mean_accuracy', 'std'])

# Mean CV accuracy per alpha, with +/- one standard deviation as error bars.
cv_ax = alpah_results.plot(x='alpah',
                           y='mean_accuracy',
                           yerr='std',
                           marker='o',
                           linestyle="--")
# Override the axis labels so the figure does not display the misspelled
# column name.
cv_ax.set_xlabel("alpha")
cv_ax.set_ylabel("mean accuracy")

In [None]:
# Choose the pruning strength from the cross-validation results.
# The previous filter (`alpah > 0.0008`) depended on a hand-read threshold and
# returned EVERY alpha above it, so it could yield several values and break
# the scalar conversion that follows. Selecting the row with the highest mean
# CV accuracy always yields exactly one value and needs no magic number.
# On ties, idxmax returns the first such row.
best_row = alpah_results['mean_accuracy'].idxmax()
ideal_alpah = alpah_results.loc[best_row, 'alpah']
ideal_alpah

In [None]:
# Convert the selected alpha to a plain Python float.
# float() applied directly to a pandas Series raises TypeError when the Series
# holds more than one element and is deprecated for one-element Series in
# recent pandas. np.ravel accepts a Series or a scalar, so this takes the first
# value in either case and stays safe when the cell is re-run.
# NOTE(review): if several alphas were selected upstream, only the first one
# is used -- confirm that is the intended choice.
ideal_alpah = float(np.ravel(ideal_alpah)[0])
ideal_alpah

In [None]:
# Fit the final tree, pruned with the selected cost-complexity alpha.
# random_state is fixed (same seed as the cross-validation trees) so that the
# tree chosen via CV is the tree that is actually refit and evaluated here,
# and the result is reproducible across runs.
model_pruned = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ideal_alpah)
model_pruned.fit(X_train, y_train)

In [None]:
# Draw the pruned tree on a large canvas so the node text is legible.
fig = plt.figure(figsize=(25,20))
# feature_names is a pandas Index; some scikit-learn releases validate this
# argument strictly and reject anything that is not a plain list, so convert it.
# class_names are listed in ascending order of the class labels (0, then 1).
_ = tree.plot_tree(model_pruned,
                   feature_names=list(feature_names),
                   class_names=['0','1'],
                   filled=True,
                   fontsize=12)

In [None]:
# Confusion matrix of the pruned tree's predictions on the test split.
ConfusionMatrixDisplay.from_estimator(estimator=model_pruned, X=X_test, y=y_test)

In [None]:
# Test-split accuracy of the pruned tree, expressed as a percentage.
pruned_accuracy_pct = model_pruned.score(X_test, y_test) * 100.0
print("Model Accuracy: %.3f%%" % (pruned_accuracy_pct,))

In [None]:
# Display the pruned tree's per-feature importance values as a raw array
# (one entry per column of X, in column order).
model_pruned.feature_importances_

In [None]:
# Tabulate the pruned tree's feature importances, labelled by feature name,
# and show them as a horizontal bar chart.
feature_importances = pd.DataFrame(
    data=model_pruned.feature_importances_,
    index=feature_names,
)
feature_importances.plot.barh()