# 🌳 Notebook 08: CART Decision Trees
## Intelligent Agriculture - Crop Recommendation System

**Objectives:**
1. Implement Classification and Regression Trees (CART)
2. Decision tree visualization and interpretation
3. Feature importance analysis
4. Tree pruning and optimization
5. Compare different splitting criteria
6. Rule extraction from decision trees

In [1]:
# Import libraries
# All third-party dependencies for the notebook are imported once, here.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import export_text, plot_tree, export_graphviz
from sklearn.model_selection import GridSearchCV, validation_curve
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
import joblib
import warnings
# NOTE(review): with no category/module filter this silences *every* warning for
# the rest of the kernel session, including any that libraries raise about
# suspicious model inputs. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [2]:
# Load the three preprocessed splits (read in train / validation / test order).
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/train.csv',
        '../data/processed/validation.csv',
        '../data/processed/test.csv',
    )
)

# Fitted preprocessing artefacts. joblib.load unpickles the file, so only point
# these at files produced by this project (presumably an earlier notebook).
scaler = joblib.load('../data/processed/scaler.pkl')
label_encoder = joblib.load('../data/processed/label_encoder.pkl')

print(f"✅ Data loaded: {len(train_data)} train, {len(val_data)} val, {len(test_data)} test")

# Split every frame into the feature matrix and the target column.
feature_cols = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
all_splits = (train_data, val_data, test_data)
X_train, X_val, X_test = (split[feature_cols] for split in all_splits)
y_train, y_val, y_test = (split['label'] for split in all_splits)

# Apply the already-fitted scaler to each split. Trees do not need scaling, but
# every model below is trained on these scaled matrices, so predictions must
# use them as well.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_val, X_test)
)

# Class names in the label encoder's order.
target_names = list(label_encoder.classes_)
print(f"Number of classes: {len(target_names)}")
print(f"Feature columns: {feature_cols}")
print(f"Target classes: {target_names[:5]}...")  # preview only the first 5

✅ Data loaded: 1540 train, 330 val, 330 test
Number of classes: 22
Feature columns: ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
Target classes: ['apple', 'banana', 'blackgram', 'chickpea', 'coconut']...


In [3]:
# Baseline: an unconstrained Gini tree, scored on the validation split.
print("🌳 BASIC DECISION TREE CLASSIFIER")
print("=" * 50)

# fit() returns the estimator itself, so construction and training can be chained.
dt_basic = DecisionTreeClassifier(criterion='gini', random_state=42).fit(
    X_train_scaled, y_train
)

# Validation predictions and accuracy (reused later in the notebook).
y_val_pred_basic = dt_basic.predict(X_val_scaled)
accuracy_basic = accuracy_score(y_val, y_val_pred_basic)

# Headline numbers describing the fitted tree.
for report_line in (
    f"Basic Decision Tree Accuracy: {accuracy_basic:.4f}",
    f"Tree depth: {dt_basic.get_depth()}",
    f"Number of leaves: {dt_basic.get_n_leaves()}",
    f"Number of nodes: {dt_basic.tree_.node_count}",
):
    print(report_line)

# Impurity-based importances, most important feature first.
importance_table = pd.DataFrame(
    {'Feature': feature_cols, 'Importance': dt_basic.feature_importances_}
)
feature_importance_basic = importance_table.sort_values(by='Importance', ascending=False)

print("\nFeature Importance (Basic Tree):")
print(feature_importance_basic)

🌳 BASIC DECISION TREE CLASSIFIER
Basic Decision Tree Accuracy: 0.9909
Tree depth: 17
Number of leaves: 37
Number of nodes: 73

Feature Importance (Basic Tree):
       Feature  Importance
6     rainfall    0.340604
4     humidity    0.219476
1            P    0.218695
0            N    0.107476
2            K    0.101976
5           ph    0.005930
3  temperature    0.005842


In [4]:
# Gini vs. entropy: same depth cap, same seed, scored on the validation split.
print("\n📊 COMPARING SPLITTING CRITERIA")
print("=" * 50)

criteria = ['gini', 'entropy']
criterion_results = []

for criterion in criteria:
    print(f"\nTesting {criterion.upper()} criterion:")

    # Depth is capped at 10 so both criteria are compared at similar complexity.
    dt = DecisionTreeClassifier(max_depth=10, random_state=42, criterion=criterion)
    dt.fit(X_train_scaled, y_train)
    y_val_pred = dt.predict(X_val_scaled)

    accuracy = accuracy_score(y_val, y_val_pred)
    depth, n_leaves = dt.get_depth(), dt.get_n_leaves()

    # One row per criterion; key order fixes the column order of criterion_df.
    row = {'Criterion': criterion, 'Accuracy': accuracy}
    for column, metric in (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1_Score', f1_score),
    ):
        row[column] = metric(y_val, y_val_pred, average='weighted')
    row['Tree_Depth'] = depth
    row['Num_Leaves'] = n_leaves
    criterion_results.append(row)

    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Tree depth: {depth}")
    print(f"  Number of leaves: {n_leaves}")

criterion_df = pd.DataFrame(criterion_results)
print("\nCriterion Comparison:")
print(criterion_df)


📊 COMPARING SPLITTING CRITERIA

Testing GINI criterion:
  Accuracy: 0.9848
  Tree depth: 10
  Number of leaves: 29

Testing ENTROPY criterion:
  Accuracy: 0.9879
  Tree depth: 10
  Number of leaves: 39

Criterion Comparison:
  Criterion  Accuracy  Precision    Recall  F1_Score  Tree_Depth  Num_Leaves
0      gini  0.984848   0.988636  0.984848  0.984416          10          29
1   entropy  0.987879   0.988971  0.987879  0.987845          10          39


In [None]:
# Exhaustive grid search over tree hyperparameters with 5-fold cross-validation.
print("\n⚙️ HYPERPARAMETER TUNING")
print("=" * 50)

# 2 * 6 * 4 * 4 * 3 = 576 parameter combinations are evaluated.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7, 10, 15, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
    max_features=['sqrt', 'log2', None],
)

print("Performing grid search...")
dt_grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use every available core
)
dt_grid.fit(X_train_scaled, y_train)

print(f"Best parameters: {dt_grid.best_params_}")
print(f"Best cross-validation score: {dt_grid.best_score_:.4f}")

# The winning configuration, taken from the search object's best_estimator_,
# is then scored on the held-out validation split.
dt_best = dt_grid.best_estimator_
y_val_pred_best = dt_best.predict(X_val_scaled)
accuracy_best = accuracy_score(y_val, y_val_pred_best)

print(f"\nBest Decision Tree:")
for caption, value in (
    ("Validation accuracy", f"{accuracy_best:.4f}"),
    ("Tree depth", dt_best.get_depth()),
    ("Number of leaves", dt_best.get_n_leaves()),
    ("Number of nodes", dt_best.tree_.node_count),
):
    print(f"  {caption}: {value}")


⚙️ HYPERPARAMETER TUNING
Performing grid search...


In [None]:
# Validation curves for key hyperparameters
print("\n📈 VALIDATION CURVES")
print("=" * 50)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))


def _plot_validation_curve(ax, param_name, param_range, xlabel, title):
    """Plot 5-fold CV train/validation accuracy of a decision tree against one hyperparameter.

    The curve is computed on ``X_train_scaled`` / ``y_train`` -- the same inputs
    every other model in this notebook is fitted on. Mean accuracy is drawn as
    a line with a +/- one-standard-deviation band across folds.

    Args:
        ax: Matplotlib axes to draw on.
        param_name: Name of the ``DecisionTreeClassifier`` parameter to vary.
        param_range: Sequence of values to try for that parameter.
        xlabel: Label for the x axis.
        title: Title for the panel.

    Returns:
        The ``(train_scores, val_scores)`` pair returned by ``validation_curve``.
    """
    fold_train_scores, fold_val_scores = validation_curve(
        DecisionTreeClassifier(random_state=42),
        X_train_scaled, y_train,
        param_name=param_name,
        param_range=param_range,
        cv=5, scoring='accuracy', n_jobs=-1
    )
    for fold_scores, color, label in (
        (fold_train_scores, 'blue', 'Training Score'),
        (fold_val_scores, 'red', 'Validation Score'),
    ):
        mean = np.mean(fold_scores, axis=1)
        std = np.std(fold_scores, axis=1)
        ax.plot(param_range, mean, 'o-', color=color, label=label)
        ax.fill_between(param_range, mean - std, mean + std, alpha=0.1, color=color)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Accuracy')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)
    return fold_train_scores, fold_val_scores


# Max depth validation curve
max_depths = list(range(1, 21))
train_scores, val_scores = _plot_validation_curve(
    axes[0, 0], 'max_depth', max_depths,
    xlabel='Max Depth', title='Validation Curve - Max Depth'
)

# Min samples split validation curve (values span two orders of magnitude,
# hence the logarithmic x axis)
min_samples_splits = [2, 5, 10, 20, 50, 100]
train_scores, val_scores = _plot_validation_curve(
    axes[0, 1], 'min_samples_split', min_samples_splits,
    xlabel='Min Samples Split', title='Validation Curve - Min Samples Split'
)
axes[0, 1].set_xscale('log')

# Feature importance of the grid-search winner; sorted ascending so the most
# important feature ends up at the top of the horizontal bar chart.
feature_importance_best = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': dt_best.feature_importances_
}).sort_values('Importance', ascending=True)

axes[1, 0].barh(feature_importance_best['Feature'], feature_importance_best['Importance'], color='skyblue')
axes[1, 0].set_xlabel('Feature Importance')
axes[1, 0].set_title('Feature Importance - Best Decision Tree')
axes[1, 0].grid(True, alpha=0.3)

# Tree complexity vs accuracy: grow trees of depth 1..15 and record how many
# leaves each one has together with its validation accuracy.
complexities = []
accuracies = []
for depth in range(1, 16):
    dt_temp = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt_temp.fit(X_train_scaled, y_train)
    y_pred_temp = dt_temp.predict(X_val_scaled)

    complexities.append(dt_temp.get_n_leaves())
    accuracies.append(accuracy_score(y_val, y_pred_temp))

axes[1, 1].plot(complexities, accuracies, 'o-', color='green')
axes[1, 1].set_xlabel('Number of Leaves (Complexity)')
axes[1, 1].set_ylabel('Validation Accuracy')
axes[1, 1].set_title('Tree Complexity vs Accuracy')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Visualize decision tree structure
print("\n🌳 DECISION TREE VISUALIZATION")
print("=" * 50)

# A deliberately small tree (depth <= 4, no split below 20 samples) so that the
# plot stays readable.
dt_viz = DecisionTreeClassifier(
    max_depth=4,
    min_samples_split=20,
    random_state=42
)
dt_viz.fit(X_train_scaled, y_train)

# Plot the tree. The split thresholds shown are in the scaler's units, because
# the tree was fitted on the scaled feature matrix.
plt.figure(figsize=(20, 12))
plot_tree(dt_viz,
          feature_names=feature_cols,
          class_names=list(target_names),
          filled=True,
          rounded=True,
          fontsize=10)
plt.title('Decision Tree Visualization (Depth=4)', fontsize=16)
plt.show()

# Score on the *scaled* validation features: the tree's thresholds live in
# scaled space, so feeding it the raw X_val would misroute samples and report
# a wrong accuracy.
viz_val_accuracy = accuracy_score(y_val, dt_viz.predict(X_val_scaled))
print(f"Visualization tree accuracy: {viz_val_accuracy:.4f}")
print(f"Tree depth: {dt_viz.get_depth()}")
print(f"Number of leaves: {dt_viz.get_n_leaves()}")

In [None]:
# Textual dump of the small visualisation tree.
print("\n📋 DECISION RULES EXTRACTION")
print("=" * 50)

# export_text renders the whole tree as one multi-line string; only the first
# 20 lines are shown as a preview.
tree_rules = export_text(dt_viz, feature_names=feature_cols)
preview_lines = tree_rules.split('\n')[:20]
print("Decision Tree Rules (First 20 lines):")
print(*preview_lines, sep='\n')
print("...")

# Extract specific rules for top classes
def extract_rules_for_class(tree, feature_names, class_names, target_class):
    """Print every root-to-leaf rule of ``tree`` whose leaf predicts ``target_class``.

    The tree is walked depth-first (left child before right child). For each
    leaf whose majority class is ``target_class`` the conjunction of split
    conditions on the path, the leaf's confidence and its sample count are
    printed.

    Args:
        tree: Fitted tree estimator exposing a ``tree_`` attribute with
            ``feature``, ``threshold``, ``children_left``, ``children_right``,
            ``value`` and ``n_node_samples`` arrays.
        feature_names: Feature names indexed by the values in ``tree_.feature``.
        class_names: Class names indexed by class position.
        target_class: Index of the class whose rules should be printed.

    Returns:
        None. The rules are written to stdout.
    """
    tree_ = tree.tree_
    leaf_marker = -2  # value stored in tree_.feature for nodes that do not split

    def recurse(node, conditions):
        feature_idx = tree_.feature[node]
        if feature_idx != leaf_marker:
            # Internal node: extend the path with the condition for each branch.
            name = feature_names[feature_idx]
            threshold = tree_.threshold[node]
            recurse(tree_.children_left[node], conditions + [f"{name} <= {threshold:.2f}"])
            recurse(tree_.children_right[node], conditions + [f"{name} > {threshold:.2f}"])
            return

        # Leaf node. The per-class entries may be raw counts or fractions
        # depending on the scikit-learn release, so only their ratio is used.
        class_weights = tree_.value[node][0]
        predicted_class = np.argmax(class_weights)
        if predicted_class != target_class:
            return

        confidence = class_weights[predicted_class] / np.sum(class_weights)
        # Join the conditions explicitly. str.strip(" AND ") removes a *set of
        # characters*, which ate feature names such as "N" at the rule's edges.
        rule = " AND ".join(conditions)
        print(f"Rule for {class_names[target_class]}: {rule}")
        print(f"  Confidence: {confidence:.3f}")
        # n_node_samples is the number of training samples that reached the leaf.
        print(f"  Samples: {int(tree_.n_node_samples[node])}")
        print()

    recurse(0, [])

# Rank classes by how often they occur in the training labels and keep the
# three most frequent (np.bincount needs non-negative integer labels).
class_counts = np.bincount(y_train)
top_classes = np.argsort(class_counts)[::-1][:3]

print("\nDecision Rules for Top 3 Classes:")
banner = '=' * 60
for class_idx in top_classes:
    # Banner header per class, followed by all of its rules in the small tree.
    print(f"\n{banner}")
    print(f"CLASS: {target_names[class_idx]}")
    print(f"{banner}")
    extract_rules_for_class(dt_viz, feature_cols, target_names, class_idx)

In [None]:
# Pruning analysis
print("\n✂️ TREE PRUNING ANALYSIS")
print("=" * 50)

# Cost complexity pruning path, computed on the same scaled matrix the trees
# below are fitted on so that the alphas describe exactly those trees.
dt_full = DecisionTreeClassifier(random_state=42)
path = dt_full.cost_complexity_pruning_path(X_train_scaled, y_train)
# Floating-point round-off can yield alphas marginally below zero, which
# DecisionTreeClassifier rejects; clamp them to 0.
ccp_alphas = np.clip(path.ccp_alphas, 0.0, None)
impurities = path.impurities

print(f"Number of alpha values for pruning: {len(ccp_alphas)}")

# Train one tree per alpha and record train/validation accuracy.
clfs = []
train_scores = []
val_scores = []

for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
    clf.fit(X_train_scaled, y_train)
    clfs.append(clf)

    train_scores.append(clf.score(X_train_scaled, y_train))
    val_scores.append(clf.score(X_val_scaled, y_val))

# Find optimal alpha. Several alphas usually tie on validation accuracy; the
# path is in increasing alpha order, so taking the LAST maximiser selects the
# most heavily pruned (simplest) tree among the ties rather than the unpruned one.
val_scores_arr = np.asarray(val_scores)
best_alpha_idx = int(np.flatnonzero(val_scores_arr == val_scores_arr.max())[-1])
best_alpha = ccp_alphas[best_alpha_idx]
best_val_score = val_scores[best_alpha_idx]

print(f"Best alpha: {best_alpha:.6f}")
print(f"Best validation score: {best_val_score:.4f}")

# The pruned tree was already fitted in the loop above (same seed, same data,
# same alpha), so reuse it instead of training it a second time.
dt_pruned = clfs[best_alpha_idx]

print(f"\nPruned tree characteristics:")
print(f"  Depth: {dt_pruned.get_depth()}")
print(f"  Leaves: {dt_pruned.get_n_leaves()}")
print(f"  Nodes: {dt_pruned.tree_.node_count}")

# Visualize pruning effect
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
best_alpha_label = f'Best Alpha: {best_alpha:.6f}'

# Alpha vs accuracy. The x axis is logarithmic, so an alpha of exactly 0
# cannot be drawn on it.
axes[0].plot(ccp_alphas, train_scores, marker='o', label='Training', alpha=0.8)
axes[0].plot(ccp_alphas, val_scores, marker='s', label='Validation', alpha=0.8)
axes[0].axvline(x=best_alpha, color='red', linestyle='--', label=best_alpha_label)
axes[0].set_xlabel('Alpha (Complexity Parameter)')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Cost Complexity Pruning')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].set_xscale('log')

# Tree size vs alpha
tree_sizes = [clf.tree_.node_count for clf in clfs]
axes[1].plot(ccp_alphas, tree_sizes, marker='o', color='green')
axes[1].axvline(x=best_alpha, color='red', linestyle='--', label=best_alpha_label)
axes[1].set_xlabel('Alpha (Complexity Parameter)')
axes[1].set_ylabel('Number of Nodes')
axes[1].set_title('Tree Size vs Alpha')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[1].set_xscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Final evaluation on test set
print("\n🎯 FINAL EVALUATION ON TEST SET")
print("=" * 50)

# Test all decision tree variants
dt_models = {
    'Basic Tree': dt_basic,
    'Best Tuned Tree': dt_best,
    'Pruned Tree': dt_pruned,
    'Visualization Tree': dt_viz
}

test_results = []
for name, model in dt_models.items():
    # Every model was fitted on scaled features, so it must be evaluated on
    # X_test_scaled. Passing the raw X_test compares unscaled values against
    # thresholds learned in scaled space and yields meaningless metrics.
    y_test_pred = model.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='weighted')
    test_recall = recall_score(y_test, y_test_pred, average='weighted')
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    tree_depth = model.get_depth()
    num_leaves = model.get_n_leaves()

    test_results.append({
        'Model': name,
        'Accuracy': test_accuracy,
        'Precision': test_precision,
        'Recall': test_recall,
        'F1_Score': test_f1,
        'Tree_Depth': tree_depth,
        'Num_Leaves': num_leaves,
        'Num_Nodes': model.tree_.node_count
    })

    print(f"\n{name}:")
    print(f"  Accuracy: {test_accuracy:.4f}")
    print(f"  Precision: {test_precision:.4f}")
    print(f"  Recall: {test_recall:.4f}")
    print(f"  F1-Score: {test_f1:.4f}")
    print(f"  Tree Depth: {tree_depth}")
    print(f"  Leaves: {num_leaves}")

# Create results DataFrame (one row per model, in dt_models order)
results_df = pd.DataFrame(test_results)
print("\n📋 Test Results Summary:")
print(results_df)

# Find best decision tree model; idxmax returns the first row on ties, i.e.
# the model listed earliest in dt_models.
best_dt_idx = results_df['Accuracy'].idxmax()
best_dt_name = results_df.loc[best_dt_idx, 'Model']
best_dt_accuracy = results_df.loc[best_dt_idx, 'Accuracy']

print(f"\n🏆 Best Decision Tree: {best_dt_name} (Accuracy: {best_dt_accuracy:.4f})")

In [None]:
# Confusion matrix for best model
best_model = dt_models[best_dt_name]
# Predict on the scaled test features -- the model was trained on scaled data.
y_test_pred_best = best_model.predict(X_test_scaled)

# Pin the label set to every encoded class (0 .. n_classes - 1) so that the
# matrix rows/columns and the report lines stay aligned with target_names even
# if some class is missing from the test split or never predicted.
class_labels = np.arange(len(target_names))

# Create confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_test_pred_best, labels=class_labels)

# Plot confusion matrix
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.title(f'Confusion Matrix - {best_dt_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Detailed classification report
print("\n📋 Detailed Classification Report:")
print(classification_report(y_test, y_test_pred_best,
                            labels=class_labels,
                            target_names=list(target_names)))

In [None]:
# Save decision tree models and results
print("\n💾 SAVING DECISION TREE MODELS AND RESULTS")
print("=" * 50)

# Save all decision tree models
joblib.dump(dt_basic, '../models/saved_models/decision_tree_basic.pkl')
joblib.dump(dt_best, '../models/saved_models/decision_tree_best.pkl')
joblib.dump(dt_pruned, '../models/saved_models/decision_tree_pruned.pkl')
joblib.dump(dt_viz, '../models/saved_models/decision_tree_viz.pkl')

# Save results
results_df.to_csv('../models/saved_models/decision_tree_results.csv', index=False)
criterion_df.to_csv('../models/saved_models/decision_tree_criteria_comparison.csv', index=False)

# Save feature importance (one column per model, rows in feature_cols order)
all_feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Basic_Tree': dt_basic.feature_importances_,
    'Best_Tree': dt_best.feature_importances_,
    'Pruned_Tree': dt_pruned.feature_importances_,
    'Viz_Tree': dt_viz.feature_importances_
})
all_feature_importance.to_csv('../models/saved_models/decision_tree_feature_importance.csv', index=False)

# Save tree rules (thresholds are in scaled units, as the trees were fitted on
# scaled features)
with open('../models/saved_models/decision_tree_rules.txt', 'w') as f:
    f.write(f"Decision Tree Rules - {best_dt_name}\n")
    f.write("=" * 50 + "\n")
    f.write(export_text(best_model, feature_names=feature_cols))


def _test_accuracy(model):
    """Return the accuracy of ``model`` on the scaled test split."""
    return accuracy_score(y_test, model.predict(X_test_scaled))


# Save decision tree summary. Every accuracy in the summary is measured on the
# same (scaled) test split so the columns are directly comparable; previously
# validation and test accuracies were mixed and the pruned tree was scored on
# unscaled features.
best_test_accuracy = _test_accuracy(best_model)
best_tree_leaves = best_model.get_n_leaves()
best_tree_nodes = best_model.tree_.node_count
dt_summary = {
    'Best_Model': best_dt_name,
    'Best_Accuracy': best_test_accuracy,
    'Basic_Accuracy': _test_accuracy(dt_basic),
    'Best_Tuned_Accuracy': _test_accuracy(dt_best),
    'Pruned_Accuracy': _test_accuracy(dt_pruned),
    'Best_Tree_Depth': best_model.get_depth(),
    'Best_Tree_Leaves': best_tree_leaves,
    'Best_Tree_Nodes': best_tree_nodes
}

summary_df = pd.DataFrame([dt_summary])
summary_df.to_csv('../models/saved_models/decision_tree_summary.csv', index=False)

print("✅ Decision tree models saved to: models/saved_models/")
print("✅ Results saved to: models/saved_models/decision_tree_results.csv")
print("✅ Feature importance saved to: models/saved_models/decision_tree_feature_importance.csv")
print("✅ Tree rules saved to: models/saved_models/decision_tree_rules.txt")
print("✅ Summary saved to: models/saved_models/decision_tree_summary.csv")

# Rank features by the importances of the model actually being reported
# (best_model), most important first.
top_features = (
    pd.Series(best_model.feature_importances_, index=feature_cols)
    .sort_values(ascending=False)
    .head(3)
    .index.tolist()
)

print("\n🎯 KEY INSIGHTS:")
print(f"• Best decision tree: {best_dt_name} with {best_test_accuracy:.1%} accuracy")
print(f"• Tree depth: {best_model.get_depth()} levels")
# node_count includes the leaves; decision (internal) nodes are the remainder.
print(f"• Number of decision nodes: {best_tree_nodes - best_tree_leaves}")
print(f"• Most important features: {', '.join(top_features)}")
print(f"• Decision trees provide interpretable rules for crop recommendation")

print("\n🚀 Next: Open notebook 09_DBSCAN_Clustering.ipynb")