In [2]:
# Step 1: Load the MNIST dataset and split it
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Load the MNIST dataset from OpenML.
# as_frame=True pins the return type to pandas objects instead of relying on
# the library default: the feature-selection cells further down pick columns
# with `.iloc`, which only exists on pandas containers.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)
X, y = mnist.data, mnist.target

# Split by row position into training (first 60,000 rows) and test (the
# remaining 10,000 rows). `.iloc` makes the positional intent explicit so the
# split never depends on how the index happens to be labelled.
X_train, X_test = X.iloc[:60000], X.iloc[60000:]
y_train, y_test = y.iloc[:60000], y.iloc[60000:]



In [3]:
# Step 2: Train a Decision Tree classifier and evaluate it
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Baseline model on every feature column. fit() returns the estimator itself,
# so construction and training are chained; random_state fixes the tree's
# tie-breaking so the run is repeatable.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out test split
y_pred = dt_clf.predict(X_test)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of Decision Tree classifier (all features): {accuracy_dt:.4f}")

Accuracy of Decision Tree classifier (all features): 0.8755


In [4]:
# Step 3: Perform Sequential Forward Selection (SFS) to select the best 20 features
from sklearn.feature_selection import SequentialFeatureSelector

# Forward selection wrapped around a Decision Tree: candidate feature sets are
# scored by 3-fold cross-validated accuracy, and the search stops once 20
# features have been chosen. n_jobs=-1 spreads the work over all CPU cores.
sfs = SequentialFeatureSelector(
    estimator=DecisionTreeClassifier(random_state=42),
    direction='forward',
    n_features_to_select=20,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)

# Run the search on the training split only, so the test split stays unseen.
# NOTE: this is the expensive cell; the notebook's closing notes report a
# run time of roughly two hours.
sfs.fit(X_train, y_train)


In [5]:
# Integer column positions of the features that SFS kept
selected_features = sfs.get_support(indices=True)

# Apply the identical column subset to both splits so that the training and
# test frames stay aligned feature-for-feature.
X_train_reduced, X_test_reduced = (
    split.iloc[:, selected_features] for split in (X_train, X_test)
)

In [6]:
# Step 4: Train another Decision Tree classifier using the selected features
# Same estimator settings as the baseline so that only the feature set differs.
reduced_dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train_reduced, y_train)

# Score the reduced-feature model on the matching reduced test split
y_pred_reduced = reduced_dt_clf.predict(X_test_reduced)
accuracy_reduced_dt = accuracy_score(y_true=y_test, y_pred=y_pred_reduced)
print(f"Accuracy of Decision Tree classifier (reduced features): {accuracy_reduced_dt:.4f}")

Accuracy of Decision Tree classifier (reduced features): 0.8324


In [7]:
# Step 5: Compare models in terms of training time and accuracy
import time

# Elapsed time is measured with time.perf_counter(): it is a monotonic,
# high-resolution clock intended for timing intervals, whereas time.time()
# is wall-clock time that can jump if the system clock is adjusted.

# Measure training time for the original model (re-fits on all features)
start_time = time.perf_counter()
dt_clf.fit(X_train, y_train)
original_training_time = time.perf_counter() - start_time

# Measure training time for the reduced model (re-fits on the selected features)
start_time = time.perf_counter()
reduced_dt_clf.fit(X_train_reduced, y_train)
reduced_training_time = time.perf_counter() - start_time

print(f"Training time (all features): {original_training_time:.2f} seconds")
print(f"Training time (reduced features): {reduced_training_time:.2f} seconds")

Training time (all features): 16.79 seconds
Training time (reduced features): 0.71 seconds


In the final step, we compared the training time of the model trained on all features with that of the model trained on the 20 selected features. The reduced-feature model trained much faster (0.71 seconds versus 16.79 seconds): with fewer features, the model is less complex and takes less time to fit. This speed-up came with a drop in test accuracy from 0.8755 to 0.8324. However, we also spent extra time determining which features to select with the SFS algorithm, which took approximately 2 hours. Instead of spending 2 hours to select 20 features, we could have trained directly on all features, and our total cost would have been lower. Therefore, before performing feature selection with SFS, we must judge how much it will actually benefit our problem and act accordingly.