Import necessary libraries

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

Generate a synthetic binary classification dataset with NumPy: 1000 samples, each with two features.

In [None]:
# Seed NumPy's global generator so the dataset is identical on every run.
np.random.seed(42)

# 1000 samples x 2 features, each value drawn uniformly from [0, 1).
X = np.random.random_sample((1000, 2))

# A sample is labelled 1 when 2*x0 + 3*x1 exceeds 2, otherwise 0.
linear_score = 2 * X[:, 0] + 3 * X[:, 1]
y = (linear_score > 2).astype(int)

Split the data into training and testing sets (80% training, 20% testing)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Initialize the Decision Tree Classifier.
We use criterion='entropy', so each split is chosen by Information Gain; pass criterion='gini' instead to choose splits by the Gini Index.

In [None]:
# Entropy criterion: splits are chosen by information gain.
# random_state is fixed so repeated runs build the same tree.
model = DecisionTreeClassifier(
    random_state=42,
    criterion="entropy",
)

Fit the model using the training data

In [None]:
# Train on the training split only; the test split stays unseen until evaluation.
model.fit(X=X_train, y=y_train)

Predict on the test data

In [None]:
# Predicted class labels (0 or 1) for the held-out samples.
y_pred = model.predict(X=X_test)

Essential Metrics Calculation

Accuracy

In [None]:
# Fraction of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.985


Precision

In [None]:
# Of the samples predicted as class 1, the share that truly are class 1.
precision = precision_score(y_true=y_test, y_pred=y_pred)
print("Precision:", precision)

Precision: 0.9846153846153847


Recall

In [None]:
# Of the samples that truly are class 1, the share the model predicted as class 1.
recall = recall_score(y_true=y_test, y_pred=y_pred)
print("Recall:", recall)

Recall: 0.9922480620155039


F1 Score

In [None]:
# Harmonic mean of precision and recall for class 1.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print("F1 Score:", f1)

F1 Score: 0.9884169884169884


Confusion Matrix

In [None]:
# Rows are true classes, columns are predicted classes (class 0 first).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[ 69   2]
 [  1 128]]


Decision Tree Details

Get feature importance (shows which features contributed the most to the model)

In [None]:
# Importance scores of the fitted tree, one per column of X in column order
# (index 0 is the first feature, index 1 the second).
feature_importances = model.feature_importances_
print("\nFeature Importances:", feature_importances)


Feature Importances: [0.36800496 0.63199504]


Print the structure of the tree as a set of text rules

In [None]:
# Human-readable names for the two columns of X, in column order.
feature_labels = ['Feature 1', 'Feature 2']

# Render the fitted tree as indented if/else text rules.
tree_rules = export_text(decision_tree=model, feature_names=feature_labels)
print("\nDecision Tree Structure:\n", tree_rules)


Decision Tree Structure:
 |--- Feature 2 <= 0.40
|   |--- Feature 1 <= 0.77
|   |   |--- Feature 1 <= 0.44
|   |   |   |--- class: 0
|   |   |--- Feature 1 >  0.44
|   |   |   |--- Feature 2 <= 0.20
|   |   |   |   |--- class: 0
|   |   |   |--- Feature 2 >  0.20
|   |   |   |   |--- Feature 2 <= 0.26
|   |   |   |   |   |--- Feature 1 <= 0.65
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- Feature 1 >  0.65
|   |   |   |   |   |   |--- Feature 1 <= 0.68
|   |   |   |   |   |   |   |--- Feature 1 <= 0.66
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- Feature 1 >  0.66
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- Feature 1 >  0.68
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- Feature 2 >  0.26
|   |   |   |   |   |--- Feature 1 <= 0.60
|   |   |   |   |   |   |--- Feature 2 <= 0.30
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- Feature 2 >  0.30
|   |   |   |   |   |   

# Calculate the Gini Index, Entropy and Information Gain for the fitted tree.
#
# `model.criterion` is only the *name* of the split criterion (the string
# 'entropy' or 'gini'), so it cannot be used in arithmetic. The impurity
# values are computed from class proportions instead.
#
# If you want to grow the tree with the Gini Index instead of entropy, change
# the criterion when initializing the model:
# model = DecisionTreeClassifier(criterion='gini', random_state=42)


def _class_proportions(labels):
    """Return the relative frequency of each distinct class in `labels`.

    An empty `labels` yields an empty float array.
    """
    _, counts = np.unique(labels, return_counts=True)
    total = counts.sum()
    return counts / total if total else counts.astype(float)


def gini_index(labels):
    """Return the Gini impurity 1 - sum(p_k ** 2) of `labels` (0.0 if empty)."""
    proportions = _class_proportions(labels)
    if proportions.size == 0:
        return 0.0
    return float(1.0 - np.sum(proportions ** 2))


def shannon_entropy(labels):
    """Return the entropy -sum(p_k * log2(p_k)) of `labels` in bits (0.0 if empty)."""
    proportions = _class_proportions(labels)
    if proportions.size == 0:
        return 0.0
    # max() turns the -0.0 produced by a single-class input into 0.0.
    return max(0.0, float(-np.sum(proportions * np.log2(proportions))))


# Impurity of the training labels, i.e. of the samples that reach the root node.
print("\nGini Index:", gini_index(y_train))
print("Entropy:", shannon_entropy(y_train))

# Information Gain of the root split: the root's entropy minus the
# sample-weighted average entropy of its two children. scikit-learn stores the
# per-node impurity under the fitted criterion in `tree_.impurity`, so this is
# an entropy-based gain only when the model was fitted with criterion='entropy'.
tree = model.tree_
left_child, right_child = tree.children_left[0], tree.children_right[0]
# A leaf has both child ids set to the same sentinel, so equal ids mean "no split".
if model.criterion == 'entropy' and left_child != right_child:
    node_weight = tree.weighted_n_node_samples
    children_entropy = (
        node_weight[left_child] * tree.impurity[left_child]
        + node_weight[right_child] * tree.impurity[right_child]
    ) / node_weight[0]
    print("Information Gain (root split):", tree.impurity[0] - children_entropy)