In [None]:
# This notebook preprocesses raw datasets (.arff and .csv) by performing the following steps:
# 1. Loading the data and a configuration file for data types.
# 2. Applying correct data types to columns.
# 3. Standardizing the target column (named 'outlier') to 'yes' and 'no' labels.
# 4. Normalizing numerical features to a [0, 1] scale.
# 5. Removing duplicate rows.
# 6. Downsampling the outlier class (rows labeled 'yes' in 'outlier') to a maximum of 5% of the data.
# 7. Generating 10 versions of each processed dataset.
# 8. Saving the processed dataframes to a 'processed' directory.

import pandas as pd
import os
import json
from scipy.io import arff
from sklearn import preprocessing
import random
import numpy as np


def normalize_numerical_features(df: pd.DataFrame, num_cols: pd.DataFrame, cat_cols: pd.DataFrame) -> pd.DataFrame:
    """
    Scales the numerical columns to [0, 1] with MinMaxScaler and reassembles the frame.

    Categorical columns that merely contain digits are expected to arrive in
    ``cat_cols`` so that they are passed through unscaled. Columns of ``df`` that
    appear in neither ``num_cols`` nor ``cat_cols`` (e.g. boolean columns) are
    also passed through unchanged, so restoring the original column order never
    fails with a ``KeyError`` for a column that was silently left out.

    Args:
        df: The original DataFrame; defines the output column order.
        num_cols: DataFrame containing only the numerical columns of ``df``.
        cat_cols: DataFrame containing only the categorical columns of ``df``.

    Returns:
        A new DataFrame with the numerical columns scaled and the original column
        order maintained. When ``num_cols`` is empty, ``df`` itself is returned.
    """
    if num_cols.empty:
        # Nothing to scale (no numerical columns, or no rows at all).
        return df

    scaler = preprocessing.MinMaxScaler()
    # fit_transform returns a bare array; re-attach the index and column labels
    # so the concat below aligns row by row with the categorical columns.
    scaled_num = pd.DataFrame(
        scaler.fit_transform(num_cols),
        columns=num_cols.columns,
        index=num_cols.index,
    )

    # Columns that the numerical/categorical split did not pick up at all.
    covered = set(scaled_num.columns).union(cat_cols.columns)
    leftover = [col for col in df.columns if col not in covered]

    parts = [scaled_num, cat_cols]
    if leftover:
        parts.append(df[leftover])

    combined = pd.concat(parts, axis=1)

    # Restore the column order of the original DataFrame.
    return combined[df.columns]


def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a DataFrame in which every repeated row has been dropped,
    keeping the first occurrence of each row. The index is not renumbered.
    """
    deduplicated = df.drop_duplicates(keep='first')
    return deduplicated


def remove_rows_with_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drops every row that has a missing value in any column.

    The string '?' is treated as a missing-value marker in addition to NaN.
    Note: This function is defined but not used in the main processing pipeline.
    """
    # Turn the '?' placeholders into real NaN so that dropna() can see them.
    with_nan_markers = df.replace(to_replace='?', value=np.nan)
    return with_nan_markers.dropna()


def downsample_outliers(file: str, df: pd.DataFrame, percent_limit: int, versions: int) -> list:
    """
    Reduces the share of outliers to at most ``percent_limit`` percent by random removal.

    Rows whose 'outlier' value is 'yes' are outliers; all other rows are kept
    untouched. The number of outliers to keep is chosen so that they make up at
    most ``percent_limit`` percent of the *resulting* DataFrame (not of the
    original row count, which would leave the final share above the limit).
    Each version removes an independent random selection of outliers, drawn
    with the global ``random`` module state.

    Args:
        file: The name of the dataset file (used only in the log message).
        df: The input DataFrame, which must contain an 'outlier' column. It is
            not modified. Its index labels are assumed to be unique.
        percent_limit: The maximum allowed percentage of outliers.
        versions: The number of downsampled DataFrame versions to create.

    Returns:
        A list of DataFrames with a fresh 0..n-1 index. If ``df`` already
        satisfies the limit, the list holds a single unmodified copy of ``df``
        (further versions would be identical). If ``versions`` is not positive,
        the list is empty.
    """
    if versions <= 0:
        return []

    total_rows = len(df)
    is_outlier = (df['outlier'] == 'yes').to_numpy()
    outlier_labels = list(df.index[is_outlier])
    outlier_count = len(outlier_labels)

    if outlier_count <= total_rows * (percent_limit / 100):
        print(f'{file}: Already has {percent_limit}% or fewer outliers.')
        return [df.copy(deep=True)]

    # Largest k with k / (k + inliers) <= percent_limit / 100. The branch above
    # guarantees percent_limit < 100 here, so the divisor is positive.
    inlier_count = total_rows - outlier_count
    keep_count = max(0, int(percent_limit * inlier_count // (100 - percent_limit)))
    remove_count = outlier_count - keep_count

    dataframes = []
    for _ in range(versions):
        # Each version drops its own random subset of the outlier rows.
        dropped = random.sample(outlier_labels, remove_count)
        dataframes.append(df.drop(index=dropped).reset_index(drop=True))

    return dataframes


def save_processed_dataframes(base_path: str, original_filename: str, dataframes: list):
    """
    Saves a list of dataframes to CSV files in a 'processed' subdirectory.

    The files are named '<stem>_v01.csv', '<stem>_v02.csv', ... in list order,
    where <stem> is ``original_filename`` without its final extension. Only the
    last extension is stripped, so dotted names such as 'a.v1.csv' and
    'a.v2.csv' keep distinct stems and do not overwrite each other.

    Args:
        base_path: The root directory in which the 'processed' folder is
            created (if it does not exist yet).
        original_filename: The name of the original file, used for naming the new files.
        dataframes: A list of DataFrames to save. They are written without the index.
    """
    processed_dir = os.path.join(base_path, 'processed')
    os.makedirs(processed_dir, exist_ok=True)

    # Strip only the trailing extension; everything before it is the stem.
    file_stem, _extension = os.path.splitext(original_filename)

    for version, frame in enumerate(dataframes, start=1):
        output_path = os.path.join(processed_dir, f"{file_stem}_v{version:02d}.csv")
        frame.to_csv(output_path, index=False)


def split_numerical_categorical(df: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
    """
    Returns two sub-frames of ``df``: first the columns with a numeric dtype,
    then the columns with an object or category dtype. Columns of any other
    dtype end up in neither of the two.
    """
    dtype_groups = (np.number, ['object', 'category'])
    numeric_part, categorical_part = (
        df.select_dtypes(include=group) for group in dtype_groups
    )
    return numeric_part, categorical_part


def apply_custom_dtypes(df: pd.DataFrame, dataset_name: str, config: dict) -> pd.DataFrame:
    """
    Casts the columns of a DataFrame to the dtypes listed for its dataset in ``config``.

    The lookup key is the part of ``dataset_name`` before its first '.'. The
    configured entry is matched to the columns by position; if it contains the
    wildcard '...', its first element is applied to every column instead.

    Args:
        df: The DataFrame to convert.
        dataset_name: The dataset (file) name used to look up the config entry.
        config: The dictionary mapping dataset keys to their dtype lists.

    Returns:
        A DataFrame with the configured dtypes, or ``df`` itself (after printing
        a notice) when the dataset has no entry in ``config``.
    """
    dataset_key = dataset_name.split('.')[0]
    if dataset_key not in config:
        print(f'Notice: Dtype configuration not found for {dataset_name}. Using inferred types.')
        return df

    declared = config[dataset_key]
    if '...' in declared:
        # Wildcard entry: the first listed dtype is used for all columns.
        shared_dtype = declared[0]
        column_dtypes = {column: shared_dtype for column in df.columns}
    else:
        # Positional pairing; zip stops at the shorter of the two sequences.
        column_dtypes = {column: dtype for column, dtype in zip(df.columns, declared)}

    return df.astype(column_dtypes)


def standardize_outlier_labels(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardizes the labels of the 'outlier' column to 'yes' and 'no'.

    The less frequent of the two observed labels is taken to be the outlier
    class ('yes') and the more frequent one the nominal class ('no'). When both
    labels occur equally often, the one listed first by ``value_counts`` becomes
    'no' and the other 'yes', so both labels are always rewritten.

    The column is rewritten in place on ``df``. Frames without an 'outlier'
    column, or whose column does not hold exactly two distinct observed labels,
    are returned unchanged.

    Args:
        df: The DataFrame whose 'outlier' column should be standardized.

    Returns:
        The same DataFrame object that was passed in.
    """
    if 'outlier' not in df.columns:
        return df

    # value_counts orders labels from most to least frequent and ignores NaN.
    counts = df['outlier'].value_counts()
    # Categorical columns also report unused categories with a count of zero.
    counts = counts[counts > 0]
    if len(counts) != 2:
        return df

    # Taking the labels by position (instead of idxmax/idxmin) guarantees two
    # distinct values even when both counts are equal.
    majority_val, minority_val = counts.index[0], counts.index[1]
    df['outlier'] = df['outlier'].map({majority_val: 'no', minority_val: 'yes'})
    return df

In [None]:
# --- Main Processing Pipeline ---
#
# For every context sub-directory of DATASET_PATH, each .arff/.csv dataset is
# loaded, preprocessed with the functions defined above and written to disk.

# Paths are relative to the working directory. They are built with os.path.join
# so that the same cell runs on Windows and on POSIX systems.
DATASET_PATH = os.path.join('..', '..', 'datasets', 'exp')
CONFIG_PATH = os.path.join('..', 'config_dataset.json')

# Load dataset type configuration; without it, inferred dtypes are used.
try:
    with open(CONFIG_PATH, 'r') as f:
        dataset_dtypes_config = json.load(f)
except FileNotFoundError:
    print(f"Error: Configuration file not found at {CONFIG_PATH}")
    dataset_dtypes_config = {}

# Each sub-directory of DATASET_PATH is one context holding dataset files
for context in os.listdir(DATASET_PATH):
    context_path = os.path.join(DATASET_PATH, context)

    if not os.path.isdir(context_path):
        continue

    # Get the list of datasets to process
    try:
        datasets = os.listdir(context_path)
    except FileNotFoundError:
        print(f"Error: Dataset directory not found at {context_path}")
        datasets = []

    # Process each dataset
    for dataset_filename in datasets:
        print(f"Processing: {dataset_filename}")
        file_path = os.path.join(context_path, dataset_filename)

        # Load dataset based on file extension
        if dataset_filename.endswith('.arff'):
            data, meta = arff.loadarff(file_path)
            df = pd.DataFrame(data)
            # The last attribute is assumed to be the target/outlier column
            target_col_name = meta.names()[-1]

            # Decode byte strings, a common requirement for .arff files
            str_cols = df.select_dtypes(include=[object]).columns
            df[str_cols] = df[str_cols].apply(lambda x: x.str.decode('utf-8'))

            df.rename(columns={target_col_name: 'outlier'}, inplace=True)

        elif dataset_filename.endswith('.csv'):
            # Read with pandas' 'python' parsing engine
            df = pd.read_csv(file_path, engine='python')
        else:
            # Skip files that are not .arff or .csv
            print(f"Skipping unsupported file type: {dataset_filename}")
            continue

        # --- Preprocessing Steps ---

        # 1. Drop 'id' column if it exists
        if 'id' in df.columns:
            df.drop(columns=['id'], inplace=True)

        # 2. Make sure the target column is called 'outlier' BEFORE the labels
        #    are standardized. The last column is assumed to be the target.
        #    Doing this only right before downsampling would leave CSV targets
        #    with their raw labels, so no 'yes' rows would ever be found.
        if 'outlier' not in df.columns:
            df.rename(columns={df.columns[-1]: 'outlier'}, inplace=True)

        # 3. Apply predefined data types from config file
        df = apply_custom_dtypes(df, dataset_filename, dataset_dtypes_config)

        # 4. Standardize outlier labels to 'yes' (minority) and 'no' (majority)
        df = standardize_outlier_labels(df)

        # 5. Split into numerical and categorical columns for normalization
        num_cols, cat_cols = split_numerical_categorical(df)

        # 6. Normalize numerical features
        df = normalize_numerical_features(df, num_cols, cat_cols)

        # 7. Remove duplicate records
        df = remove_duplicates(df)

        # 8. Downsample outliers to a max of 5%, creating 10 versions
        processed_versions = downsample_outliers(dataset_filename, df, percent_limit=5, versions=10)

        # 9. Save the processed dataframes
        # NOTE(review): save_processed_dataframes adds its own 'processed'
        # folder below the given base path, so the files end up in
        # <context>/processed/processed. The location is kept as it was;
        # confirm whether the extra nesting is intended.
        save_processed_dataframes(os.path.join(context_path, 'processed'), dataset_filename, processed_versions)

print("\nProcessing complete.")

In [None]:
# Scratch cell: reads crx.csv (finance context) from a machine-specific absolute
# path; the bare name on the last line makes the notebook render the frame.
df_ = pd.read_csv(
    filepath_or_buffer=r'C:\Users\pipip\Downloads\CategoricalDatasets-main\CategoricalDatasets\files\datasets\exp\finance\crx.csv',
    engine='python',
)
df_