In [2]:
pip install tabulate

Note: you may need to restart the kernel to use updated packages.


In [None]:
from sklearn.datasets import load_iris, load_wine, fetch_openml
import pandas as pd
import numpy as np
from tabulate import tabulate  # Import tabulate for table formatting

# Module-level accumulator: load_dataset() appends one summary dict per
# dataset it manages to load, and the list is turned into a DataFrame below.
dataset_summaries = []

# Function to summarize each dataset
def summarize_dataset(name, data, target):
    """Build a summary of a dataset's shape, feature types and class balance.

    Args:
        name: Label stored under the "Dataset" key of the result.
        data: Feature matrix; anything accepted by ``pd.DataFrame``.
        target: One class label per row; anything accepted by ``pd.Series``.

    Returns:
        A dict with the keys "Dataset", "Number of Classes",
        "Number of Features", "Numerical Features", "Categorical Features",
        "Size of the Dataset" and "Rows per Class" (a mapping of class label
        to row count, most frequent class first).
    """
    # Convert data to pandas DataFrame for easier analysis
    df = pd.DataFrame(data)

    # Every column is either numeric or not, so these two counts add up to
    # the total number of features.
    numerical_features = df.select_dtypes(include=[np.number]).shape[1]
    categorical_features = df.select_dtypes(exclude=[np.number]).shape[1]

    # Count the number of rows per class. For a categorical target,
    # value_counts() also lists declared categories that never occur (count
    # 0); drop them so they are not reported as classes of the dataset.
    counts = pd.Series(target).value_counts()
    class_counts = counts[counts > 0].to_dict()

    return {
        "Dataset": name,
        "Number of Classes": len(class_counts),
        "Number of Features": df.shape[1],
        "Numerical Features": numerical_features,
        "Categorical Features": categorical_features,
        "Size of the Dataset": df.shape[0],
        "Rows per Class": class_counts,
    }

# Function to safely load a dataset with error handling
def load_dataset(dataset_name, fetch_func, **kwargs):
    """Fetch one dataset, summarize it and record the summary.

    Args:
        dataset_name: Display name used in the summary and in error messages.
        fetch_func: Loader called with ``return_X_y=True`` plus ``kwargs``;
            it must return a ``(data, target)`` pair.
        **kwargs: Extra keyword arguments forwarded to ``fetch_func``.

    On success the summary is appended to the module-level
    ``dataset_summaries`` list. Any failure is reported on stdout and
    swallowed so that one bad dataset does not abort the whole run.
    """
    try:
        features, labels = fetch_func(return_X_y=True, **kwargs)
        dataset_summaries.append(
            summarize_dataset(dataset_name, features, labels)
        )
    except ValueError as err:
        print(f"ValueError while loading {dataset_name}: {err}\n")
    except Exception as err:
        print(f"Failed to load {dataset_name}: {err}\n")

# Load and summarize all datasets.
# Each entry is (display name, loader function, keyword arguments for the
# loader); entries are processed in the order listed.
dataset_specs = [
    ("Iris", load_iris, {}),
    ("WBDC (Breast Cancer Wisconsin Diagnostic)", fetch_openml, {"data_id": 1510}),
    ("Spambase", fetch_openml, {"name": "spambase", "version": 1}),
    ("Heart", fetch_openml, {"data_id": 533}),
    ("Glass", fetch_openml, {"name": "glass", "version": 2}),
    ("WBC (Breast Cancer Wisconsin Original)", fetch_openml, {"data_id": 15}),
    ("Ionosphere", fetch_openml, {"name": "ionosphere", "version": 1}),
    ("Arrhythmia", fetch_openml, {"name": "arrhythmia", "version": 1}),
    ("Multiple Features", fetch_openml, {"name": "mfeat-factors", "version": 1}),
    ("Australian", fetch_openml, {"name": "australian"}),
    ("German Number (Credit Dataset)", fetch_openml, {"name": "credit-g", "version": 1}),
    ("DNA", fetch_openml, {"name": "dna", "version": 1}),
    ("Wine", load_wine, {}),
    ("Vehicle", fetch_openml, {"name": "vehicle", "version": 2}),
    ("Waveform", fetch_openml, {"name": "waveform-5000", "version": 2}),
    ("Zoo", fetch_openml, {"name": "zoo", "version": 2}),
    ("Hillvalley", fetch_openml, {"name": "hill-valley", "version": 2}),
    ("Sonar", fetch_openml, {"name": "sonar", "version": 1}),
    ("Musk 1", fetch_openml, {"name": "musk", "version": 1}),
]
for display_name, loader, loader_kwargs in dataset_specs:
    load_dataset(display_name, loader, **loader_kwargs)

# Create a DataFrame from the collected summaries (one row per loaded dataset)
df_datasets = pd.DataFrame(dataset_summaries)

# Print the summaries as a bordered text table with centered cells and no
# index column
print(
    tabulate(
        df_datasets,
        headers="keys",
        tablefmt="pretty",
        showindex=False,
        stralign="center",
        numalign="center",
    )
)

+-------------------------------------------+-------------------+--------------------+--------------------+----------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
|                  Dataset                  | Number of Classes | Number of Features | Numerical Features | Categorical Features | Size of the Dataset |                                                    Rows per Class                                                     |
+-------------------------------------------+-------------------+--------------------+--------------------+----------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
|                   Iris                    |         3         |         4          |         4          |          0           |         150         |                             