In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np

# Step 1: Load the MNIST dataset
mnist = fetch_openml('mnist_784')

# Pipeline settings, named once so they are easy to find and tune.
N_PARTITIONS = 5       # number of train/test splits to evaluate
N_CLUSTERS = 10        # k for k-means
N_CLUSTERS_USED = 3    # only the first three clusters get a decision tree

# Step 2: Build the train/test splits.
# Each "partition" is an independent 80/20 random split of the full dataset,
# seeded by its index so the splits differ from one another but are reproducible.
X_train_parts = []
y_train_parts = []
X_test_parts = []
y_test_parts = []

for i in range(N_PARTITIONS):
    X_train_part, X_test_part, y_train_part, y_test_part = train_test_split(
        mnist.data, mnist.target, test_size=0.2, random_state=i)
    X_train_parts.append(X_train_part)
    y_train_parts.append(y_train_part)
    X_test_parts.append(X_test_part)
    y_test_parts.append(y_test_part)

# Hyper-parameter grid searched for every per-cluster tree.
# It does not change between iterations, so it is defined once here.
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Step 3-7: For each partition, cluster the training data with k-means, then
# train one cost-complexity-pruned decision tree per selected cluster.
for i in range(N_PARTITIONS):
    print(f"Partition {i + 1}:")

    # Perform K-means clustering.
    # n_init is pinned explicitly so the clustering does not depend on the
    # default of whichever scikit-learn version happens to be installed.
    kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
    kmeans.fit(X_train_parts[i])

    # Take centroids from three clusters. Row k of cluster_centers_ is the
    # centroid of the cluster labelled k, so the row index IS the cluster label.
    centroids = kmeans.cluster_centers_[:N_CLUSTERS_USED]

    # Assign every test sample to its nearest centroid. Each tree below is fit
    # on a single cluster only, so it must be scored on test samples from that
    # same cluster rather than on the whole test set.
    test_labels = kmeans.predict(X_test_parts[i])

    # Initialize a list to store the best model found for each cluster
    models = []

    # Train a decision tree on the members of each selected cluster
    for cluster_id in range(len(centroids)):
        # Boolean mask of the training samples k-means assigned to this cluster
        train_mask = kmeans.labels_ == cluster_id
        X_cluster = X_train_parts[i][train_mask]
        y_cluster = y_train_parts[i][train_mask]

        # Decision tree with cost-complexity pruning (ccp_alpha)
        dt_classifier = DecisionTreeClassifier(random_state=42, ccp_alpha=0.005)

        # Grid search with 3-fold cross-validation on this cluster's samples
        grid_search = GridSearchCV(dt_classifier, param_grid, cv=3, scoring='accuracy')
        grid_search.fit(X_cluster, y_cluster)

        # Save the best model (refit on all of this cluster's training samples)
        models.append(grid_search.best_estimator_)

    # Evaluate each model on the test samples that fall in its own cluster
    for j, model in enumerate(models):
        test_mask = test_labels == j
        if not test_mask.any():
            # accuracy_score cannot be computed on an empty selection
            print(f"Accuracy for centroid {j+1}: n/a (no test samples in this cluster)")
            continue
        y_pred = model.predict(X_test_parts[i][test_mask])
        acc = accuracy_score(y_test_parts[i][test_mask], y_pred)
        print(f"Accuracy for centroid {j+1}: {acc}")


  warn(


Partition 1:




Accuracy for centroid 1: 0.30164285714285716
Accuracy for centroid 2: 0.40585714285714286
Accuracy for centroid 3: 0.2777857142857143
Partition 2:




Accuracy for centroid 1: 0.18792857142857142
Accuracy for centroid 2: 0.26535714285714285
Accuracy for centroid 3: 0.26771428571428574
Partition 3:




Accuracy for centroid 1: 0.2455
Accuracy for centroid 2: 0.28464285714285714
Accuracy for centroid 3: 0.26871428571428574
Partition 4:




Accuracy for centroid 1: 0.2917142857142857
Accuracy for centroid 2: 0.3605
Accuracy for centroid 3: 0.31335714285714283
Partition 5:




Accuracy for centroid 1: 0.2522142857142857
Accuracy for centroid 2: 0.3375
Accuracy for centroid 3: 0.2636428571428571


In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Step 1: Load the MNIST dataset
mnist = fetch_openml('mnist_784')

N_PARTITIONS = 5  # number of train/test splits to evaluate

# Step 2: Divide the data into 5 partitions.
# Each partition is an independent 80/20 random split seeded by its index.
# (A single fixed seed would reproduce the very same split five times and
# make every partition report identical scores.)
X_train_partitions, X_test_partitions, y_train_partitions, y_test_partitions = [], [], [], []
for seed in range(N_PARTITIONS):
    X_train_partition, X_test_partition, y_train_partition, y_test_partition = train_test_split(
        mnist.data, mnist.target, test_size=0.2, random_state=seed
    )
    X_train_partitions.append(X_train_partition)
    X_test_partitions.append(X_test_partition)
    y_train_partitions.append(y_train_partition)
    y_test_partitions.append(y_test_partition)

# Hyper-parameter grid for the decision tree; identical for every partition,
# so it is defined once outside the loop.
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# One entry per partition, in partition order.
best_models = []
partition_accuracies = []

# Step 3-7: Perform K-means clustering and Decision Tree for each partition
for i in range(N_PARTITIONS):
    # Step 4: Perform K-means clustering.
    # n_init is pinned explicitly so the clustering does not depend on the
    # default of whichever scikit-learn version happens to be installed.
    kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
    kmeans.fit(X_train_partitions[i])

    # Step 5: Take centroids from 3 clusters
    # NOTE(review): the centroids are not consumed by the tree fit below; the
    # tree is trained on the whole training partition. Confirm whether the
    # clustering is meant to feed into the model.
    centroids = kmeans.cluster_centers_[:3]

    # Step 6: Grid search with 3-fold cross-validation over the tree's
    # hyper-parameters. The search is deterministic (fixed random_state) and
    # does not depend on any centroid, so it runs once per partition.
    dt_classifier = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(dt_classifier, param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train_partitions[i], y_train_partitions[i])

    # Step 7: Score the refit best estimator on this partition's held-out test
    # set. `best_model` / `best_accuracy` stay bound after the loop (holding the
    # last partition's values) for any later cell that reads them.
    best_model = grid_search.best_estimator_
    y_pred_valid = best_model.predict(X_test_partitions[i])
    best_accuracy = accuracy_score(y_test_partitions[i], y_pred_valid)

    best_models.append(best_model)
    partition_accuracies.append(best_accuracy)

    # Print accuracy for the partition
    print(f"Accuracy for Partition {i+1}: {best_accuracy}")


  warn(


Accuracy for Partition 1: 0.8750714285714286




Accuracy for Partition 2: 0.8750714285714286




Accuracy for Partition 3: 0.8750714285714286




Accuracy for Partition 4: 0.8750714285714286




Accuracy for Partition 5: 0.8750714285714286


In [None]:
# prompt: Accuracy, precision, recall for all partitions

from sklearn.metrics import accuracy_score, precision_score, recall_score

# Report accuracy plus macro-averaged precision and recall on each of the five
# test partitions.
# NOTE(review): `best_model`, `X_test_partitions` and `y_test_partitions` are not
# defined in this cell; they are whatever an earlier cell left bound. The same
# single `best_model` is used for every partition -- confirm that is intended.
for i in range(5):
    X_eval, y_true = X_test_partitions[i], y_test_partitions[i]
    y_pred = best_model.predict(X_eval)

    accuracy, precision, recall = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='macro'),
        recall_score(y_true, y_pred, average='macro'),
    )

    # One line per metric, under a header line naming the partition
    print(
        f"Partition {i+1}:",
        f"Accuracy: {accuracy}",
        f"Precision: {precision}",
        f"Recall: {recall}",
        sep="\n",
    )


Partition 1:
Accuracy: 0.8750714285714286
Precision: 0.8730845557012517
Recall: 0.8732717822047737
Partition 2:
Accuracy: 0.8750714285714286
Precision: 0.8730845557012517
Recall: 0.8732717822047737
Partition 3:
Accuracy: 0.8750714285714286
Precision: 0.8730845557012517
Recall: 0.8732717822047737
Partition 4:
Accuracy: 0.8750714285714286
Precision: 0.8730845557012517
Recall: 0.8732717822047737
Partition 5:
Accuracy: 0.8750714285714286
Precision: 0.8730845557012517
Recall: 0.8732717822047737
