<h1 style="color:blue; text-align:center;">Decision Tree</h1>


<h2>Load the Preprocessed Dataset</h2>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Read the preprocessed dataset from a path relative to the working directory,
# then show the frame as the cell's output.
df = pd.read_csv(filepath_or_buffer="../Dataset/after_preprocessing.csv")
df

Unnamed: 0,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,Overtime,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,0.439024,1,0.24,2,0.455873,0,0,0,0.25,1,...,0.166667,1,0,0.158730,0,0,0,3,2,1
1,0.414634,1,0.12,0,0.223615,2,0,0,0.25,1,...,0.666667,0,1,0.198413,0,0,0,2,0,0
2,0.780488,1,0.12,0,0.291965,1,0,0,0.75,1,...,0.333333,2,1,0.587302,0,0,1,2,1,1
3,0.975610,1,0.86,3,0.288079,1,3,2,0.00,1,...,0.666667,0,1,0.746032,0,0,0,3,1,0
4,0.512195,1,0.46,0,0.226362,2,0,0,0.00,1,...,1.000000,1,0,0.341270,1,0,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73910,0.780488,0,0.22,0,0.213630,1,0,0,0.25,1,...,0.333333,2,2,0.261905,0,0,1,3,3,0
73911,0.000000,1,0.06,2,0.456611,1,0,2,0.75,0,...,0.000000,2,1,0.563492,0,0,0,1,2,0
73912,0.097561,0,0.26,4,0.450178,1,0,2,0.00,1,...,0.333333,0,2,0.214286,0,1,0,2,2,1
73913,0.121951,1,0.14,0,0.114253,1,3,0,0.00,0,...,0.000000,0,0,0.055556,0,0,0,2,1,0


<h2>Split the Dataset into Training and Testing Sets</h2>

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target column ('Attrition') from the remaining feature columns.
y = df['Attrition']
X = df.drop('Attrition', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of the four pieces as a quick sanity check.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

<h2>Train a Decision Tree Classifier</h2>

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Baseline tree: every hyper-parameter is left at the library default;
# only the seed is pinned so repeated runs build the same tree.
dt_classifier = DecisionTreeClassifier(
    random_state=42,
)

In [None]:
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = dt_classifier.fit(X_train, y_train).predict(X_test)

# One prediction per test row is expected; display the shape to confirm.
y_pred.shape

<h2>Evaluate the Decision Tree Classifier</h2>

In [None]:
# Compute both metrics for the baseline predictions first, then report them.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')
# sep='\n' puts the matrix on the line after its heading.
print('Confusion Matrix:', conf_matrix, sep='\n')

In [None]:
# Unpack the confusion matrix into its four cells.
# scikit-learn puts true labels on the rows and predicted labels on the columns,
# so for a binary target with labels (0, 1) the row-major order is TN, FP, FN, TP.
# NOTE(review): assumes 'Attrition' is binary (only 0/1 appear in the preview);
# the unpacking raises ValueError if the matrix is not 2x2.
TN, FP, FN, TP = conf_matrix.ravel()

print("True Negative:", TN)
print("False Positive:", FP)
print("False Negative:", FN)
print("True Positive:", TP)

In [None]:
# Text report of the per-class metrics for the baseline predictions.
class_report = classification_report(y_true=y_test, y_pred=y_pred)

# sep='\n' prints the report on the line after its heading.
print('Classification Report:', class_report, sep='\n')

<h2>Feature Importance in Decision Tree</h2>

In [None]:
# Importance score the fitted tree assigns to each feature column.
importance = dt_classifier.feature_importances_

# Horizontal bar chart, one bar per column of X, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.barh(X.columns, importance)
ax.set(
    xlabel='Feature Importance',
    ylabel='Features',
    title='Feature Importance in Decision Tree',
)
plt.show()

<h2>Visualize the Decision Tree</h2>

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Draw only the top levels of the tree.
# The classifier was built without any depth or leaf-size limit, so the full
# tree is presumably far too large to render legibly (and slow to draw).
# max_depth here limits the *drawing* only; the fitted model is unchanged.
plt.figure(figsize=(20, 10))
plot_tree(
    dt_classifier,
    max_depth=3,
    # Label split nodes with column names instead of positional "x[i]" indices.
    # Assumes dt_classifier is still the tree fitted on all columns of X.
    feature_names=list(X.columns),
    # Label the majority class in each node; class names must be strings.
    class_names=[str(label) for label in dt_classifier.classes_],
    filled=True,
)
plt.show()

<h2>Train a Decision Tree Classifier with Entropy Criterion</h2>

In [None]:
# Second tree: same seed and training data as the baseline, but splits are
# scored with the entropy criterion.
dt_classifier2 = DecisionTreeClassifier(
    random_state=42,
    criterion='entropy',
).fit(X_train, y_train)

# fit() returned the estimator, so this displays the fitted model as before.
dt_classifier2


In [None]:
# Predict the test rows with the entropy-based tree and display the label array.
y_pred2 = dt_classifier2.predict(X=X_test)
y_pred2

In [None]:
# Per-class metrics for the entropy-based tree's predictions.
print(
    classification_report(y_true=y_test, y_pred=y_pred2)
)

<h2>Feature Selection with Chi-Square and Mutual Information</h2>

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

In [None]:

# Sweep k = 1..n_features and compare two univariate scoring functions
# (chi-square and mutual information) by the test accuracy of a decision tree
# trained on the k best-scoring columns.
#
# NOTE(review): the "best k" is picked by accuracy on the same test split that
# later cells use to report performance, so those numbers are optimistically
# biased. Consider choosing k with cross-validation on the training split.


def _seeded_mutual_info(X, y):
    """Return mutual-information scores computed with a fixed seed.

    mutual_info_classif uses random numbers internally (its ``random_state``
    defaults to None), so without a seed the feature ranking -- and therefore
    the best k -- can change from one run to the next.
    """
    return mutual_info_classif(X, y, random_state=42)


def _accuracy_with_top_k(score_func, k):
    """Return test accuracy of a fresh decision tree using the k best features.

    Reads X_train, X_test, y_train and y_test from the notebook namespace.
    The selector is fitted on the training split only and then applied to the
    test split.
    """
    selector = SelectKBest(score_func=score_func, k=k)
    train_selected = selector.fit_transform(X_train, y_train)
    test_selected = selector.transform(X_test)

    # A new estimator per run: nothing is carried over between runs, and the
    # notebook-level `dt_classifier` (the baseline tree) is left untouched.
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(train_selected, y_train)
    return accuracy_score(y_test, tree.predict(test_selected))


k_values = list(range(1, X_train.shape[1] + 1))

accuracy_chi2 = [_accuracy_with_top_k(chi2, k) for k in k_values]
accuracy_mi = [_accuracy_with_top_k(_seeded_mutual_info, k) for k in k_values]

# np.argmax returns the first maximum, so ties are resolved toward the smaller k.
best_k_chi2 = k_values[np.argmax(accuracy_chi2)]
best_k_mi = k_values[np.argmax(accuracy_mi)]

print("Best k for Chi-Square Feature Selection:", best_k_chi2)
print("Best k for Mutual Information Feature Selection:", best_k_mi)

# Accuracy as a function of k, one line per scoring function.
plt.figure(figsize=(10, 5))
plt.plot(k_values, accuracy_chi2, marker='o', label="Chi-Square")
plt.plot(k_values, accuracy_mi, marker='s', label="Mutual Information")
plt.xlabel("Number of Features (k)")
plt.ylabel("Accuracy")
plt.title("Feature Selection Impact on Decision Tree Accuracy")
plt.legend()
plt.show()

In [None]:
# Rebuild the chi-square selection at the best k found earlier and retrain.
k = best_k_chi2

# Fit the selector on the training split only, then apply it to both splits.
selector_chi2 = SelectKBest(chi2, k=k)
selector_chi2.fit(X_train, y_train)
x_train_selected_chi2 = selector_chi2.transform(X_train)
x_test_selected_chi2 = selector_chi2.transform(X_test)

# Train a new tree on the reduced training matrix and predict the test rows.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(
    x_train_selected_chi2, y_train
)
y_pred_chi2 = dt_classifier.predict(x_test_selected_chi2)

# From here on `accuracy_chi2` is a single score (it was a list during the sweep).
accuracy_chi2 = accuracy_score(y_test, y_pred_chi2)
print("Decision Tree Accuracy after Chi-Square feature selection:", accuracy_chi2)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Evaluate the chi-square-selected model: compute both summaries, then print
# each one on the line below its heading.
conf_matrix_chi2 = confusion_matrix(y_test, y_pred_chi2)
class_report_chi2 = classification_report(y_test, y_pred_chi2)

print("Confusion Matrix after Chi-Square feature selection:", conf_matrix_chi2, sep="\n")
print("Classification Report after Chi-Square feature selection:", class_report_chi2, sep="\n")

In [None]:
# Integer positions of the columns the fitted chi-square selector kept,
# mapped back to their column names in X_train.
selected_features_chi2 = selector_chi2.get_support(indices=True)
selected_feature_names_chi2 = X_train.columns[selected_features_chi2]

print("Selected Features (Chi-Square):", selected_feature_names_chi2.tolist())

In [None]:
# Rebuild the mutual-information selection at the best k found earlier and retrain.
k = best_k_mi

# mutual_info_classif uses random numbers internally (random_state defaults to
# None); pinning the seed makes the set of selected columns repeatable.
selector_mi = SelectKBest(
    score_func=lambda X, y: mutual_info_classif(X, y, random_state=42),
    k=k,
)
# Fit the selector on the training split only, then apply it to both splits.
x_train_selected_mi = selector_mi.fit_transform(X_train, y_train)
x_test_selected_mi = selector_mi.transform(X_test)

# Build a fresh estimator rather than refitting whatever object an earlier cell
# left in `dt_classifier`; the name is kept because later cells read it.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(x_train_selected_mi, y_train)

y_pred_mi = dt_classifier.predict(x_test_selected_mi)

# From here on `accuracy_mi` is a single score (it was a list during the sweep).
accuracy_mi = accuracy_score(y_test, y_pred_mi)
print("Decision Tree Accuracy after Mutual Information feature selection:", accuracy_mi)

In [None]:
# Integer positions of the columns the fitted mutual-information selector kept,
# mapped back to their column names in X_train.
selected_features_mi = selector_mi.get_support(indices=True)
selected_feature_names_mi = X_train.columns[selected_features_mi]

print("Selected Features (Mutual Information):", selected_feature_names_mi.tolist())

<h2>ROC Curve for the Decision Tree Trained on Mutual Information–Selected Features</h2>

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Score each test row with the predicted probability of the second class
# (column index 1 of predict_proba).
# Assumes dt_classifier is the tree fitted on the mutual-information-selected
# columns, i.e. the one that matches x_test_selected_mi.
y_pred_proba = dt_classifier.predict_proba(x_test_selected_mi)[:, 1]

# ROC points over all score thresholds, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
# Diagonal reference line for a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC-AUC Curve for Decision Tree')
ax.legend()
plt.show()