## Data cleaning

In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.impute import SimpleImputer
import glob

Compute minASPL

In [None]:
def column_types(df):
    """
        Identifies numeric and categorical columns in a DataFrame.

        Every column that pandas does not report as numeric is treated as
        categorical. Both lists follow the column order of ``df`` so that the
        result is reproducible from run to run.

        Parameters:
        df (pd.DataFrame): The input DataFrame for which column types are to be identified.

        Returns:
        tuple: A tuple containing:
            - num_cols (list): List of column names identified as numeric.
            - cat_cols (list): List of column names identified as categorical.
    """

    num_cols = df._get_numeric_data().columns.tolist()

    # A set gives O(1) membership tests; the list comprehension (rather than a
    # set difference) keeps the categorical columns in DataFrame order.
    numeric_names = set(num_cols)
    cat_cols = [col for col in df.columns if col not in numeric_names]

    return num_cols, cat_cols

In [None]:
def min_aspl(data):
    """
    Computes the minASPL value: the number of rows divided by the largest
    cardinality found among the categorical columns of the DataFrame.

    Args:
        data (pd.DataFrame): The input DataFrame to calculate the minASPL.

    Returns:
        float: The minASPL rounded to the nearest integer.

    Raises:
        ValueError: If the DataFrame has no categorical columns, since the
            maximum cardinality is then undefined.
    """
    cat_cols = column_types(data)[1]
    if not cat_cols:
        raise ValueError(
            "min_aspl requires at least one categorical column, found none"
        )

    # nunique() counts distinct non-missing values per column by default.
    max_card = data[cat_cols].nunique().max()
    dim = len(data)

    return round(dim / max_card, 0)

Import the datasets

In [None]:
#Classification datasets

CLASS_DIR = '/data_raw/class'


def _read_class_csv(filename):
    """Load one classification CSV from CLASS_DIR, reading '?' as missing."""
    return pd.read_csv(f'{CLASS_DIR}/{filename}', na_values='?')


def _cast_to_str(frame, columns):
    """Convert the given columns of ``frame`` to string values, in place.

    Missing entries are kept as NaN. A plain ``astype('str')`` would turn them
    into the literal string 'nan', which a later missing-value imputation
    would no longer recognise as missing.
    """
    for col in columns:
        values = frame[col]
        # where() keeps the original entry where the condition holds (the
        # missing ones) and takes the stringified value everywhere else.
        frame[col] = values.astype(object).where(values.isna(), values.astype(str))


# Columns listed below are re-typed as strings so that they are treated as
# categorical rather than numeric features.
churn = _read_class_csv('churn.csv')
_cast_to_str(churn, [
    '\'number_customer_service_calls\'',
    '\'state\'',
    '\'international_plan\'',
    '\'voice_mail_plan\'',
])

cmc = _read_class_csv('cmc.csv')
_cast_to_str(cmc, [
    'wife_edu', 'husband_edu', 'wife_religion',
    'wife_working', 'husband_occupation', 'standard_of_living_index',
    'media_exposure', 'target',
])

# The order of this list is significant: later cells walk it positionally.
datasets = [
    _read_class_csv('census.csv'),
    _read_class_csv('mushrooms.csv'),
    churn,
    _read_class_csv('germancredit.csv'),
    _read_class_csv('breastcancer.csv'),
    _read_class_csv('autism_adult.csv'),
    _read_class_csv('obesity.csv'),
    _read_class_csv('car.csv'),
    cmc,
    _read_class_csv('nursery.csv'),
]

In [None]:
#Regression datasets

# glob.glob returns matches in arbitrary order, so sort the paths to keep the
# dataset order (and everything printed per dataset) stable across runs and machines.
filepaths = sorted(glob.glob('/data_raw/regr/*.csv'))

# '?' marks a missing value in the raw files.
all_dfs = [pd.read_csv(fp, na_values='?') for fp in filepaths]

#### Compute the minASPL value for each dataset

In [None]:
#classification minASPL
# `filepaths` holds the regression CSV paths, so it must not be used to label
# the classification datasets. These labels follow the order in which the
# classification files are appended to `datasets` in the import cell.
class_names = ['census', 'mushrooms', 'churn', 'germancredit', 'breastcancer',
               'autism_adult', 'obesity', 'car', 'cmc', 'nursery']

# zip() silently truncates to the shorter input; fail loudly instead of
# printing mislabelled or incomplete results.
if len(class_names) != len(datasets):
    raise ValueError(
        "expected %d classification datasets, found %d"
        % (len(class_names), len(datasets))
    )

for name, df in zip(class_names, datasets):
    print(name, min_aspl(df))

1191.0
677.0
98.0
100.0
805.0
11.0
302.0
432.0
368.0
2592.0


In [None]:
# regression minASPL: one line per regression file, "<path> <minASPL>"
for name, df in zip(filepaths, all_dfs):
    aspl_value = min_aspl(df)
    print(name, aspl_value)

#### Drop useless features (manually)

In [None]:
# Manual clean-up of whichever DataFrame `df` currently refers to:
# copy the quoted 'runtime' column into `target`, then remove the
# original column together with the `id` column (in place).
runtime_col = "'runtime'"
df['target'] = df[runtime_col]
df.drop(columns=[runtime_col, 'id'], inplace=True)

#### Missing data

In [None]:
def missing_input(df):
    """
    Removes features with more than 50% of NaN values and impute missing values in a dataframe for both numerical and categorical columns.

    The input dataframe is not modified; a new dataframe is returned.

    Parameters:
    df (pandas.DataFrame): The input dataframe with potential missing values.

    Returns:
    pandas.DataFrame: The dataframe with missing values imputed. Numerical columns are imputed with the mean value,
                      and categorical columns are imputed with the most frequent value.
    """
    # Drop features with more than 50% of missing values
    cols_to_drop = df.columns[df.isnull().mean() > 0.5]
    df = df.drop(cols_to_drop, axis=1)

    # Classify the columns only after dropping, so that the lists never
    # reference a column that has just been removed.
    numerical_cols, cat_cols = column_types(df)

    # Numerical imputation with mean (skipped when there is nothing to impute,
    # because the imputer cannot be fitted on zero features)
    if numerical_cols:
        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        # Reuse df's index: assignment aligns on index labels, so a fresh
        # 0..n-1 index would misplace values on a non-default index.
        df[numerical_cols] = pd.DataFrame(
            imp_mean.fit_transform(df[numerical_cols]),
            columns=numerical_cols,
            index=df.index,
        )

    # Categorical imputation with the most frequent value
    if cat_cols:
        imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        df[cat_cols] = pd.DataFrame(
            imp_mode.fit_transform(df[cat_cols]),
            columns=cat_cols,
            index=df.index,
        )

    return df

In [None]:
#Missing imputation for Classification datasets
# missing_input returns a new DataFrame, so the result has to be stored back
# into the list; rebinding the loop variable alone would discard it.
for idx, df in enumerate(datasets):
    df = missing_input(df)
    datasets[idx] = df

In [None]:
#Missing imputation for Regression datasets
# missing_input returns a new DataFrame, so the result has to be stored back
# into the list; rebinding the loop variable alone would discard it.
for idx, df in enumerate(all_dfs):
    df = missing_input(df)
    all_dfs[idx] = df