In [1]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import tensorflow_datasets as tfds

# Load the EMNIST dataset from TensorFlow Datasets
ds_train, ds_test = tfds.load('emnist', split=['train', 'test'], as_supervised=True)

# Convert TensorFlow Datasets to NumPy arrays
X_train, y_train = [], []
X_test, y_test = [], []

for image, label in tfds.as_numpy(ds_train):
    X_train.append(image.flatten())
    y_train.append(label)

for image, label in tfds.as_numpy(ds_test):
    X_test.append(image.flatten())
    y_test.append(label)

# Convert lists to NumPy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Preprocess the dataset (scale features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature Selection
rf_model = RandomForestClassifier()
selector = SelectFromModel(estimator=rf_model).fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Define the model
rf_model = RandomForestClassifier()

# Define hyperparameters to tune
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train_selected, y_train)

# Get the best model
best_rf_model = random_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_rf_model, X_train_selected, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_rf_model, X_train_selected, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_rf_model, X_train_selected, y_train, cv=5, scoring='recall_weighted')

# Fit the best model on the full training set
best_rf_model.fit(X_train_selected, y_train)

# Evaluate the best model on the test set
y_pred = best_rf_model.predict(X_test_selected)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


Downloading and preparing dataset 535.73 MiB (download: 535.73 MiB, generated: Unknown size, total: 535.73 MiB) to /root/tensorflow_datasets/emnist/byclass/3.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

NonMatchingChecksumError: Artifact https://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/gzip.zip, downloaded to /root/tensorflow_datasets/downloads/itl.nist.gov_iaui_vip_cs_links_EMNIST_gzipi4VnNviDSrfd9Zju6qv40flc3wr22t8ldulNStS6tmk.zip.tmp.419eedcf07af45c4b30f4917bb0dbd29/itl, has wrong checksum:
* Expected: UrlInfo(size=535.73 MiB, checksum='fb9bb67e33772a9cc0b895e4ecf36d2cf35be8b709693c3564cea2a019fcda8e', filename='gzip.zip')
* Got: UrlInfo(size=108.38 KiB, checksum='b5ae943c7ef7c4f15589d4160ee5e1f8bd10761fe4e82c11d4c8e31d6133dd7d', filename='itl')
To debug, see: https://www.tensorflow.org/datasets/overview#fixing_nonmatchingchecksumerror

In [2]:
!rm -rf ~/.cache/tensorflow_datasets


In [4]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import tensorflow_datasets as tfds

# Load the CIFAR-10 dataset from TensorFlow Datasets
(ds_train, ds_test), ds_info = tfds.load('cifar10', split=['train', 'test'], as_supervised=True, with_info=True)

# Convert TensorFlow Datasets to NumPy arrays
X_train, y_train = [], []
X_test, y_test = [], []

for image, label in tfds.as_numpy(ds_train):
    X_train.append(image.flatten())
    y_train.append(label)

for image, label in tfds.as_numpy(ds_test):
    X_test.append(image.flatten())
    y_test.append(label)

# Convert lists to NumPy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Preprocess the dataset (scale features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature Selection
rf_model = RandomForestClassifier()
selector = SelectFromModel(estimator=rf_model).fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Define the model
rf_model = RandomForestClassifier()

# Define hyperparameters to tune
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train_selected, y_train)

# Get the best model
best_rf_model = random_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_rf_model, X_train_selected, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_rf_model, X_train_selected, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_rf_model, X_train_selected, y_train, cv=5, scoring='recall_weighted')

# Fit the best model on the full training set
best_rf_model.fit(X_train_selected, y_train)

# Evaluate the best model on the test set
y_pred = best_rf_model.predict(X_test_selected)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


Downloading and preparing dataset 162.17 MiB (download: 162.17 MiB, generated: 132.40 MiB, total: 294.58 MiB) to /root/tensorflow_datasets/cifar10/3.0.2...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/cifar10/3.0.2.incompleteC5E98A/cifar10-train.tfrecord*...:   0%|          …

Generating test examples...:   0%|          | 0/10000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/cifar10/3.0.2.incompleteC5E98A/cifar10-test.tfrecord*...:   0%|          |…

Dataset cifar10 downloaded and prepared to /root/tensorflow_datasets/cifar10/3.0.2. Subsequent calls will reuse this data.


  pid = os.fork()
  pid = os.fork()
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Cross-Validation Accuracy: 0.46668000000000004
Cross-Validation Precision: 0.4616448564366732
Cross-Validation Recall: 0.4659
Test Accuracy: 0.4739
Test Precision: 0.46732114919327744
Test Recall: 0.4739


In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import GradientBoostingClassifier  # Using Gradient Boosting
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import tensorflow_datasets as tfds

# Load the CIFAR-10 dataset from TensorFlow Datasets
(ds_train, ds_test), ds_info = tfds.load('cifar10', split=['train', 'test'], as_supervised=True, with_info=True)

# Convert TensorFlow Datasets to NumPy arrays
X_train, y_train = [], []
X_test, y_test = [], []

for image, label in tfds.as_numpy(ds_train):
    X_train.append(image.flatten())
    y_train.append(label)

for image, label in tfds.as_numpy(ds_test):
    X_test.append(image.flatten())
    y_test.append(label)

# Convert lists to NumPy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Preprocess the dataset (scale features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the model (Gradient Boosting)
gb_model = GradientBoostingClassifier()

# Define hyperparameters to tune (reduced search space)
param_dist = {
    'n_estimators': [100, 200],  # Reduced number of estimators
    'learning_rate': [0.05, 0.1, 0.2],  # Adjusted learning rate
    'max_depth': [3, 5, 7],  # Limited depth for faster training
}

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=gb_model, param_distributions=param_dist, n_iter=5, cv=3, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train_scaled, y_train)

# Get the best model
best_gb_model = random_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_gb_model, X_train_scaled, y_train, cv=3, scoring='accuracy')
cv_precision = cross_val_score(best_gb_model, X_train_scaled, y_train, cv=3, scoring='precision_weighted')
cv_recall = cross_val_score(best_gb_model, X_train_scaled, y_train, cv=3, scoring='recall_weighted')

# Fit the best model on the full training set
best_gb_model.fit(X_train_scaled, y_train)

# Evaluate the best model on the test set
y_pred = best_gb_model.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


  pid = os.fork()
