# Dataset ML Classification Model

## Imports

In [2]:
import openml
import pandas as pd
import numpy as np
from scipy.sparse import issparse

In [3]:
task_types = [
    "Supervised Classification",
    "Supervised Regression",
    "Clustering",
    "Learning Curve",
    "Supervised Data Stream Classification",
    "Subgroup Discovery",
    "Survival Analysis"
]
def initialize_binary_matrix(dataset_ids, task_types):
    binary_matrix = pd.DataFrame(0, index=dataset_ids, columns=task_types)
    return binary_matrix

def populate_binary_matrix(binary_matrix, dataset_id):
    try:
        # Find all tasks associated with this dataset
        tasks = openml.tasks.list_tasks(output_format='dataframe', data_id=dataset_id)
        
        # Calculate the percentage each task type represents
        task_type_counts = tasks['task_type'].value_counts(normalize=True) * 100
        
        # Filter task types that make up at least 20% of the total tasks
        filtered_task_types = task_type_counts[task_type_counts >= 20]
        
        # If no task type meets the 20% threshold, select the top one
        if filtered_task_types.empty:
            filtered_task_types = task_type_counts.head(1)
        
        # Mark the filtered task types as 1 in the binary matrix
        for task_type in filtered_task_types.index:
            if task_type in binary_matrix.columns:
                binary_matrix.at[dataset_id, task_type] = 1
                
    except Exception as e:
        print(f"Failed to process dataset {dataset_id}: {e}")

In [4]:
def convert_to_dense(X):
    """Convert any sparse columns in DataFrame X to dense format."""
    if issparse(X):
        X = pd.DataFrame.sparse.from_spmatrix(X)
    for col in X.columns:
        if pd.api.types.is_sparse(X[col]):
            X[col] = X[col].sparse.to_dense()
    return X
    
def extract_meta_features_with_tasks(dataset_ids):
    meta_features = []
    binary_matrix = initialize_binary_matrix(dataset_ids, task_types)
    
    for dataset_id in dataset_ids:
        try:
            dataset = openml.datasets.get_dataset(
                dataset_id, 
                download_data=False,  # Do not download the actual data
                download_qualities=True,  # Download dataset qualities
                download_features_meta_data=True  # Download feature metadata
            )
            X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

            # Convert any sparse data to dense format if necessary
            X = convert_to_dense(X)
            
            # Filter out non-numeric columns
            numeric_X = X.select_dtypes(include=[np.number])
            
            num_instances = X.shape[0]
            num_features = X.shape[1]
            num_missing_values = X.isnull().sum().sum()
            target_type = 'categorical' if y.dtype.name == 'category' else 'continuous'
            num_classes = len(y.unique()) if target_type == 'categorical' else None

            # Calculate class balance only if no class has zero instances
            if target_type == 'categorical' and y.value_counts().min() > 0:
                class_balance = y.value_counts().max() / y.value_counts().min()
            else:
                class_balance = None
            
            # Statistical Features on numeric data only
            mean_features = numeric_X.mean().mean() if not numeric_X.empty else None
            std_features = numeric_X.std(ddof=0).mean() if not numeric_X.empty else None
            min_features = numeric_X.min().mean() if not numeric_X.empty else None
            max_features = numeric_X.max().mean() if not numeric_X.empty else None
            skewness_features = numeric_X.skew().mean() if not numeric_X.empty else None
            kurtosis_features = numeric_X.kurtosis().mean() if not numeric_X.empty else None
            
            # Correlation Features
            if numeric_X.shape[1] > 1:
                correlation_matrix = numeric_X.corr().abs()
                np.fill_diagonal(correlation_matrix.values, np.nan)
                mean_correlation = correlation_matrix.mean().mean()
                max_correlation = correlation_matrix.max().max()
            else:
                mean_correlation, max_correlation = None, None

            # Data Sparsity
            sparsity = (numeric_X == 0).sum().sum() / (num_instances * numeric_X.shape[1]) if not numeric_X.empty else None
            
            # Add meta-features to list
            meta_features.append({
                'dataset_id': dataset_id,
                'num_instances': num_instances,
                'num_features': num_features,
                'num_missing_values': num_missing_values,
                'target_type': target_type,
                'num_classes': num_classes,
                'class_balance': class_balance,
                'mean_features': mean_features,
                'std_features': std_features,
                'min_features': min_features,
                'max_features': max_features,
                'skewness_features': skewness_features,
                'kurtosis_features': kurtosis_features,
                'mean_correlation': mean_correlation,
                'max_correlation': max_correlation,
                'sparsity': sparsity
            })
            
            populate_binary_matrix(binary_matrix, dataset_id)
            
        except Exception as e:
            print(f"Failed to process dataset {dataset_id}: {e}")
    
    meta_features_df = pd.DataFrame(meta_features)
    combined_df = meta_features_df.set_index('dataset_id').join(binary_matrix, on='dataset_id')
    
    return combined_df


datasets = openml.datasets.list_datasets(output_format='dataframe')
dataset_ids = datasets['did'].tolist()[230:250]

combined_df = extract_meta_features_with_tasks(dataset_ids)

combined_df.to_csv("../data/openml_meta_features.csv", index=True, mode='a', header=False)

print("Meta-features extraction complete. Saved to 'openml_meta_features.csv'.")

  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  pd.factorize(type_)[0]
  if pd.api.types.is_sparse(X[col]):
  pd.factorize(type_)[0]
  if pd.api.types.is_sparse(X[col]):
  pd.factorize(type_)[0]
  if pd.api.types.is_sparse(X[col]):
  pd.factorize(type_)[0]
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  if pd.api.types.is_sparse(X[col]):
  pd.factorize(type_)[0]
  if pd.api.types.is_sparse(X[col]):
  pd.factorize(type_)[0]
  if pd.api.types.is_sparse(X[col]):
  pd.factorize(type_)[0]
  if pd.api.types.is_sparse(X[col]):


Meta-features extraction complete. Saved to 'openml_meta_features.csv'.


In [5]:
datasets = openml.datasets.list_datasets(output_format='dataframe')

len(datasets)

5741