In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load Fashion-MNIST dataset
fashion_mnist = fetch_openml('Fashion-MNIST', version=1, as_frame=False)
X = fashion_mnist.data
y = fashion_mnist.target.astype(int)

# Use a smaller subset of the data
subset_size = 10000
X_subset, _, y_subset, _ = train_test_split(X, y, train_size=subset_size, stratify=y, random_state=42)

# Partition the data into multiple subsets
num_partitions = 3
X_partitions = np.array_split(X_subset, num_partitions)
y_partitions = np.array_split(y_subset, num_partitions)

# Initialize list to store centroids
centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Perform KNN on each partition to find the nearest neighbors
    n_neighbors = 3  # Reduce the number of neighbors
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_partition, y_partition)
    distances, indices = knn_model.kneighbors(X_partition)

    # Calculate the centroid from each set of nearest neighbors
    centroid = np.mean(X_partition[indices], axis=1)

    # Append the centroid to the list of centroids
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(R, y_subset, test_size=0.2, random_state=42)

# Initialize Decision Tree classifier
dt_model = DecisionTreeClassifier()

# Define hyperparameters to tune
param_grid = {
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_dt_model = grid_search.best_estimator_

# Evaluate the best model
accuracy = best_dt_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)


  warn(


Model Accuracy: 0.779


In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load Fashion-MNIST dataset
fashion_mnist = fetch_openml('Fashion-MNIST', version=1, as_frame=False)
X = fashion_mnist.data
y = fashion_mnist.target.astype(int)

# Partition the data into multiple subsets
num_partitions = 5
X_partitions = np.array_split(X, num_partitions)
y_partitions = np.array_split(y, num_partitions)

# Initialize list to store centroids
centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Perform KNN on each partition to find the nearest neighbors
    n_neighbors = 5
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_partition, y_partition)
    distances, indices = knn_model.kneighbors(X_partition)

    # Calculate the centroid from each set of nearest neighbors
    centroid = np.mean(X_partition[indices], axis=1)

    # Append the centroid to the list of centroids
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(R, y, test_size=0.2, random_state=42)

# Initialize Decision Tree classifier
dt_model = DecisionTreeClassifier()

# Define hyperparameters to tune
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_dt_model = grid_search.best_estimator_

# Evaluate the best model
accuracy = best_dt_model.score(X_test, y_test)
print("Decision Tree Model Accuracy:", accuracy)


  warn(


Decision Tree Model Accuracy: 0.8074285714285714


In [None]:
from google.colab import files
import numpy as np
from scipy.io import loadmat
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Upload files from your local system
uploaded = files.upload()

# Assuming the uploaded file is 'emnist-dataset.mat'
# Replace 'emnist-dataset.mat' with the actual file name if different
file_name = next(iter(uploaded))

# Load the dataset
emnist_data = loadmat(file_name)

# Extract the data
# The 'dataset' key and subkeys may vary depending on how the .mat file is structured
X = emnist_data['dataset']['X']
y = emnist_data['dataset']['y']

# Reshape the data to flatten it
X = X.reshape(X.shape[0], -1)
y = y.flatten()

# Partition the data into multiple subsets
num_partitions = 5
X_partitions = np.array_split(X, num_partitions)
y_partitions = np.array_split(y, num_partitions)

# Initialize list to store centroids
centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Perform KNN on each partition to find the nearest neighbors
    n_neighbors = 5
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_partition, y_partition)
    distances, indices = knn_model.kneighbors(X_partition)

    # Calculate the centroid from each set of nearest neighbors
    centroid = np.mean(X_partition[indices], axis=1)

    # Append the centroid to the list of centroids
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the data into train and test sets for the centroids
X_train_R, X_test_R, y_train_R, y_test_R = train_test_split(R, y[:len(R)], test_size=0.2, random_state=42)

# Initialize Decision Tree classifier
dt_model = Decision_TreeClassifier()

# Define hyperparameters to tune
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_R, y_train_R)

# Get the best model
best_dt_model = grid_search.best_estimator_

# Evaluate the best model
accuracy = best_dt_model.score(X_test_R, y_test_R)
print("Decision Tree Model Accuracy on EMNIST:", accuracy)


In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Load STL-10 dataset
stl_10_builder = tfds.builder("stl10")
stl_10_builder.download_and_prepare()
stl_10_dataset = stl_10_builder.as_dataset()

# Access features and labels
train_data = stl_10_dataset['train']
test_data = stl_10_dataset['test']

# Iterate through the dataset to extract features (X) and labels (y)
X_train, y_train = [], []
for example in tfds.as_numpy(train_data):
    X_train.append(example['image'])
    y_train.append(example['label'])

X_test, y_test = [], []
for example in tfds.as_numpy(test_data):
    X_test.append(example['image'])
    y_test.append(example['label'])

# Convert lists to NumPy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Flatten the images
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

# Partition the data into multiple subsets
num_partitions = 5
X_partitions = np.array_split(X_train_flat, num_partitions)
y_partitions = np.array_split(y_train, num_partitions)

# Initialize list to store centroids
centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Perform KNN on each partition to find the nearest neighbors
    n_neighbors = 5
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_partition, y_partition)
    distances, indices = knn_model.kneighbors(X_partition)

    # Calculate the centroid from each set of nearest neighbors
    centroid = np.mean(X_partition[indices], axis=1)

    # Append the centroid to the list of centroids
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the centroid data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(R, y_train, test_size=0.2, random_state=42)

# Initialize Decision Tree classifier
dt_model = DecisionTreeClassifier()

# Define hyperparameters to tune
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_dt_model = grid_search.best_estimator_

# Evaluate the best model
accuracy = best_dt_model.score(X_val, y_val)
print("Decision Tree Model Accuracy on STL-10 validation set:", accuracy)

# Test the model on the test set
test_accuracy = best_dt_model.score(X_test_flat, y_test)
print("Decision Tree Model Accuracy on STL-10 test set:", test_accuracy)


Downloading and preparing dataset 2.46 GiB (download: 2.46 GiB, generated: 1.86 GiB, total: 4.32 GiB) to /root/tensorflow_datasets/stl10/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/5000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/stl10/1.0.0.incompleteBD5VDG/stl10-train.tfrecord*...:   0%|          | 0/…

Generating test examples...:   0%|          | 0/8000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/stl10/1.0.0.incompleteBD5VDG/stl10-test.tfrecord*...:   0%|          | 0/8…

Generating unlabelled examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/stl10/1.0.0.incompleteBD5VDG/stl10-unlabelled.tfrecord*...:   0%|         …

Dataset stl10 downloaded and prepared to /root/tensorflow_datasets/stl10/1.0.0. Subsequent calls will reuse this data.


  pid = os.fork()
  pid = os.fork()


In [None]:
import os
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'


In [None]:
import os
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'

# Load STL-10 dataset
stl_10_builder = tfds.builder("stl10")
stl_10_builder.download_and_prepare()
stl_10_dataset = stl_10_builder.as_dataset()

# Access features and labels
train_data = stl_10_dataset['train']
test_data = stl_10_dataset['test']

# Iterate through the dataset to extract features (X) and labels (y)
X_train, y_train = [], []
for example in tfds.as_numpy(train_data):
    X_train.append(example['image'])
    y_train.append(example['label'])

X_test, y_test = [], []
for example in tfds.as_numpy(test_data):
    X_test.append(example['image'])
    y_test.append(example['label'])

# Convert lists to NumPy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Flatten the images
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

# Partition the data into multiple subsets
num_partitions = 5
X_partitions = np.array_split(X_train_flat, num_partitions)
y_partitions = np.array_split(y_train, num_partitions)

centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Perform KNN on each partition to find the nearest neighbors
    n_neighbors = 5
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_partition, y_partition)
    distances, indices = knn_model.kneighbors(X_partition)

    # Calculate the centroid from each set of nearest neighbors
    centroid = np.mean(X_partition[indices], axis=1)

    # Append the centroid to the list of centroids
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(R, y, test_size=0.2, random_state=42)

# Initialize Decision Tree classifier
dt_model = DecisionTreeClassifier()

# Define hyperparameters to tune
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_dt_model = grid_search.best_estimator_

# Evaluate the best model
accuracy = best_dt_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)


NameError: name 'tfds' is not defined

In [None]:
!pip install tensorflow-datasets



In [None]:
import tensorflow_datasets as tfds

In [None]:
import os
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'

# Load STL-10 dataset
stl_10_builder = tfds.builder("stl10")
stl_10_builder.download_and_prepare()
stl_10_dataset = stl_10_builder.as_dataset()

# Access features and labels
train_data = stl_10_dataset['train']
test_data = stl_10_dataset['test']

# Iterate through the dataset to extract features (X) and labels (y)
X_train, y_train = [], []
for example in tfds.as_numpy(train_data):
    X_train.append(example['image'])
    y_train.append(example['label'])

X_test, y_test = [], []
for example in tfds.as_numpy(test_data):
    X_test.append(example['image'])
    y_test.append(example['label'])

# Convert lists to NumPy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Flatten the images
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

# Partition the data into multiple subsets
num_partitions = 5
X_partitions = np.array_split(X_train_flat, num_partitions)
y_partitions = np.array_split(y_train, num_partitions)

centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Perform KNN on each partition to find the nearest neighbors
    n_neighbors = 5
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_partition, y_partition)

    distances, indices = knn_model.kneighbors(X_partition)

    # Calculate the centroid from each set of nearest neighbors
    centroid = np.mean(X_partition[indices], axis=1)

    # Append the centroid to the list of centroids
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(R, y, test_size=0.2, random_state=42)

# Initialize Decision Tree classifier
dt_model = DecisionTreeClassifier()

# Define hyperparameters to tune
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_dt_model = grid_search.best_estimator_

# Evaluate the best model
accuracy = best_dt_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)


ValueError: Failed to construct dataset "stl10", builder_kwargs "{}": Name tf.RaggedTensorSpec has already been registered for class tensorflow.python.ops.ragged.ragged_tensor.RaggedTensorSpec.

In [None]:
!pip install --upgrade tensorflow

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.3.1 (from tensorflow)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.17,>=2.16 (from tensorflow)
  Downloading tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25

In [None]:
import numpy as np
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Define transformation to convert images to PyTorch tensors and normalize them
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Download QMNIST dataset
train_dataset = datasets.QMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.QMNIST(root='./data', train=False, download=True, transform=transform)

# Extract features and labels from the dataset
X_train = train_dataset.data.numpy()
y_train = train_dataset.targets.numpy()
X_test = test_dataset.data.numpy()
y_test = test_dataset.targets.numpy()

# Partition the data into multiple subsets
num_partitions = 5
X_partitions = np.array_split(X_train, num_partitions)
y_partitions = np.array_split(y_train, num_partitions)

# Initialize list to store centroids
centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Train KNN on each partition to find the nearest neighbors and compute centroid
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(X_partition.reshape(len(X_partition), -1), y_partition)
    distances, indices = knn_model.kneighbors(X_partition.reshape(len(X_partition), -1))
    centroid = np.mean(X_partition[indices], axis=1)
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the data into train and test sets for dataset R
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(R, y_train, test_size=0.2, random_state=42)

# Initialize Decision Tree classifier for dataset R
dt_model_r = DecisionTreeClassifier()

# Define hyperparameters to tune for Decision Tree classifier
param_grid_r = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation for Decision Tree classifier for dataset R
grid_search_r = GridSearchCV(estimator=dt_model_r, param_grid=param_grid_r, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_r.fit(X_train_r.reshape(len(X_train_r), -1), y_train_r)

# Get the best model for Decision Tree classifier for dataset R
best_dt_model_r = grid_search_r.best_estimator_

# Evaluate the best model on the test set for Decision Tree classifier for dataset R
accuracy_r = best_dt_model_r.score(X_test_r.reshape(len(X_test_r), -1), y_test_r)
print("Model Accuracy (Decision Tree with centroids):", accuracy_r)



  pid = os.fork()


KeyboardInterrupt: 

In [None]:
import numpy as np
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from joblib import parallel_backend

# Define transformation to convert images to PyTorch tensors and normalize them
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Download QMNIST dataset
train_dataset = datasets.QMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.QMNIST(root='./data', train=False, download=True, transform=transform)

# Extract features and labels from the dataset
X_train = train_dataset.data.numpy()
y_train = train_dataset.targets.numpy()
X_test = test_dataset.data.numpy()
y_test = test_dataset.targets.numpy()

# Partition the data into multiple subsets
num_partitions = 5
X_partitions = np.array_split(X_train, num_partitions)
y_partitions = np.array_split(y_train, num_partitions)

# Initialize list to store centroids
centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Train KNN on each partition to find the nearest neighbors and compute centroid
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(X_partition.reshape(len(X_partition), -1), y_partition)
    distances, indices = knn_model.kneighbors(X_partition.reshape(len(X_partition), -1))
    centroid = np.mean(X_partition[indices], axis=1)
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the data into train and test sets for dataset R
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(R, y_train, test_size=0.2, random_state=42)

# Initialize Decision Tree classifier for dataset R
dt_model_r = DecisionTreeClassifier()

# Define hyperparameters to tune for Decision Tree classifier
param_grid_r = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation for Decision Tree classifier for dataset R
with parallel_backend('loky'):
    grid_search_r = GridSearchCV(estimator=dt_model_r, param_grid=param_grid_r, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search_r.fit(X_train_r.reshape(len(X_train_r), -1), y_train_r)

# Get the best model for Decision Tree classifier for dataset R
best_dt_model_r = grid_search_r.best_estimator_

# Evaluate the best model on the test set for Decision Tree classifier for dataset R
accuracy_r = best_dt_model_r.score(X_test_r.reshape(len(X_test_r), -1), y_test_r)
print("Model Accuracy (Decision Tree with centroids):", accuracy_r)


  pid = os.fork()


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
import numpy as np
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Define transformation to convert images to PyTorch tensors and normalize them
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Download QMNIST dataset
train_dataset = datasets.QMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.QMNIST(root='./data', train=False, download=True, transform=transform)

# Extract features and labels from the dataset
X_train = train_dataset.data.numpy()
y_train = train_dataset.targets.numpy()
X_test = test_dataset.data.numpy()
y_test = test_dataset.targets.numpy()

# Partition the data into multiple subsets
num_partitions = 5
X_partitions = np.array_split(X_train, num_partitions)
y_partitions = np.array_split(y_train, num_partitions)

# Initialize list to store centroids
centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Train KNN on each partition to find the nearest neighbors and compute centroid
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(X_partition.reshape(len(X_partition), -1), y_partition)
    distances, indices = knn_model.kneighbors(X_partition.reshape(len(X_partition), -1))
    centroid = np.mean(X_partition[indices], axis=1)
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the data into train and test sets for dataset R
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(R, y_train[:, 0], test_size=0.2, random_state=42)

# Initialize Decision Tree classifier for dataset R
dt_model_r = DecisionTreeClassifier()

# Define hyperparameters to tune for Decision Tree classifier
param_grid_r = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation for Decision Tree classifier for dataset R
grid_search_r = GridSearchCV(estimator=dt_model_r, param_grid=param_grid_r, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_r.fit(X_train_r.reshape(len(X_train_r), -1), y_train_r)

# Get the best model for Decision Tree classifier for dataset R
best_dt_model_r = grid_search_r.best_estimator_

# Evaluate the best model on the test set for Decision Tree classifier for dataset R
accuracy_r = best_dt_model_r.score(X_test_r.reshape(len(X_test_r), -1), y_test_r)
print("Model Accuracy (Decision Tree with centroids):", accuracy_r)


KeyboardInterrupt: 

In [2]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load Kannada MNIST dataset
kannada_mnist = fetch_openml('Kannada-MNIST', version=2, as_frame=False)
X = kannada_mnist.data
y = kannada_mnist.target.astype(int)

# Continue with the rest of the code...

X = kannada_mnist.data
y = kannada_mnist.target.astype(int)

# Partition the data into multiple subsets
num_partitions = 5
X_partitions = np.array_split(X, num_partitions)
y_partitions = np.array_split(y, num_partitions)

# Initialize list to store centroids
centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Perform KNN on each partition to find the nearest neighbors
    n_neighbors = 5
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_partition, y_partition)
    distances, indices = knn_model.kneighbors(X_partition)

    # Calculate the centroid from each set of nearest neighbors
    centroid = np.mean(X_partition[indices], axis=1)

    # Append the centroid to the list of centroids
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(R, y, test_size=0.2, random_state=42)

# Initialize Decision Tree classifier
dt_model = DecisionTreeClassifier()

# Define hyperparameters to tune
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_dt_model = grid_search.best_estimator_

# Evaluate the best model
accuracy = best_dt_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)


OpenMLError: Dataset kannada-mnist with version 2 not found.

In [3]:
kaggle competitions download -c Kannada-MNIST


SyntaxError: invalid syntax (<ipython-input-3-5545ac40f572>, line 1)

In [4]:
!pip install kaggle




In [5]:
from google.colab import files

# Upload kaggle.json
uploaded = files.upload()

# Move the uploaded file to the appropriate location
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


mv: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [6]:
!kaggle competitions download -c Kannada-MNIST


Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.10/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 403, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [7]:

# importing libraries for data analysis and manipulation
import pandas as pd
import numpy as np

In [8]:
np.random.seed(42) # to make this notebook's output stable across runs

# For plotting figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=15)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# Generating the version of a wide variety of packages/libraries used & working environment
pd.__version__
pd.show_versions(as_json=False)




INSTALLED VERSIONS
------------------
commit           : 0f437949513225922d851e9581723d82120684a6
python           : 3.10.12.final.0
python-bits      : 64
OS               : Linux
OS-release       : 6.1.58+
Version          : #1 SMP PREEMPT_DYNAMIC Sat Nov 18 15:31:17 UTC 2023
machine          : x86_64
processor        : x86_64
byteorder        : little
LC_ALL           : en_US.UTF-8
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 2.0.3
numpy            : 1.25.2
pytz             : 2023.4
dateutil         : 2.8.2
setuptools       : 67.7.2
pip              : 23.1.2
Cython           : 3.0.10
pytest           : 7.4.4
hypothesis       : None
sphinx           : 5.0.2
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.9.4
html5lib         : 1.1
pymysql          : None
psycopg2         : 2.9.9
jinja2           : 3.1.3
IPython          : 7.34.0
pandas_datareader: 0.10.0
bs4              : 4.12.3
bottleneck       : None

In [9]:

import tensorflow as tf # open source Deep Leaning library
from tensorflow import keras # Deep Learning API
print ("Tensorflow version is {}".format(tf.__version__))
print ("Keras Version is {}".format(keras.__version__))

Tensorflow version is 2.15.0


AttributeError: module 'keras.api._v2.keras' has no attribute '__version__'

In [10]:
!pip install tensorflow



In [11]:
import keras

In [12]:
print("Keras version is:", keras.__version__)

Keras version is: 2.15.0


In [13]:

import tensorflow as tf # open source Deep Leaning library
from tensorflow import keras # Deep Learning API
print ("Tensorflow version is {}".format(tf.__version__))
print ("Keras Version is {}".format(keras.__version__))


Tensorflow version is 2.15.0


AttributeError: module 'keras.api._v2.keras' has no attribute '__version__'

In [14]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json


cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [15]:
!pip install kaggle



In [16]:

# Download MNIST data
!kaggle competitions download -c cs98x-kannada-mnist --force

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.10/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 403, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [17]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load Sign Language MNIST dataset
sign_mnist = fetch_openml('sign-language-mnist', version=1, as_frame=False)
X = sign_mnist.data
y = sign_mnist.target.astype(int)

# Preprocessing: Normalize pixel values
X = X / 255.0  # Scale pixel values to the range [0, 1]

# Partition the data into multiple subsets
num_partitions = 5
X_partitions = np.array_split(X, num_partitions)
y_partitions = np.array_split(y, num_partitions)

# Initialize list to store centroids
centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Perform KNN on each partition to find the nearest neighbors
    n_neighbors = 5
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_partition, y_partition)
    distances, indices = knn_model.kneighbors(X_partition)

    # Calculate the centroid from each set of nearest neighbors
    centroid = np.mean(X_partition[indices], axis=1)

    # Append the centroid to the list of centroids
    centroids.append(centroid)

# Stack the centroids to form dataset R
R = np.vstack(centroids)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(R, y, test_size=0.2, random_state=42)

# Initialize Decision Tree classifier
dt_model = DecisionTreeClassifier()

# Define hyperparameters to tune
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_dt_model = grid_search.best_estimator_

# Evaluate the best model
accuracy = best_dt_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)


OpenMLError: Dataset sign-language-mnist with version 1 not found.