In [2]:
#%pip install --upgrade clearml

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from clearml import Dataset

from sklearn.impute import KNNImputer
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Path string 'data/raw/' -- note this is a directory, despite the name `filename`.
# NOTE(review): no cell visible in this part of the notebook reads this variable; confirm it is still needed.
filename = 'data/raw/'

In [5]:
# Seed both NumPy's global RNG and Python's built-in `random` module with the same fixed value.
np.random.seed(42)
import random
random.seed(42)

Пайплайн предобработки данных:
1) Определение характеристик данных: типов данных
2) сведение к бинарной классификации
3) Очистка данных: удаление дубликатов, обработка пропущенных значений (k-NN Imputer для числовых признаков, заполнение модой для категориальных)
4) Детекция выбросов Z-score
5) Кодирование категориальных переменных: one-hot для номинальных; label для порядковых;
6) MinMax scaling

Для каждого датасета из списка используем пайплайн предобработки.

Для точного воспроизведения экспериментов из статьи потребовалась бы более тонкая, индивидуальная для каждого датасета предобработка. Здесь же выбрана стратегия общей обработки данных, чтобы различные методы можно было оценивать на одинаково подготовленных датасетах.

Сведение к бинарной классификации выбрано, чтобы методы можно было сравнивать между собой: не все из них поддерживают многоклассовую классификацию.

In [None]:
# ClearML project name passed to Dataset.create when a new dataset version is registered.
dataset_project = 'Smote Test Bench'

# Dataset name -> ClearML dataset ID.
# The name is also used as the CSV file stem when the dataset is loaded.
datasets_config = {
    'Adult':           '77e07677143e49a39b4e4f4303099dde',
    'Forest coverage': 'eae8452f680048eb81e62021ad94a151',
    'Haberman':        'b32930d5c70e48e5a38ca75d0e6ed632',
    'Ionosphere':      'f0c4d8df1c5443fcbd54fc7bc5d77ddc',
    'Mammography':     '23f0b97e9adc4203829fbf9b9282ebca',
    'Oil':             'e528061a1d304e5ab313f9e6ad80e2d1',
    'Phoneme':         'aef842d63d154b13a02601e543fe8baf',  # + (author's marker; presumably "already processed" -- confirm)
    'Pima diabetes':   '18a64325a8304b5290c70a06b2c1cb68',
    'Satimage':        '60d6c0e2e51b43929257b113e742f96f',
    'Vehicle':         'a0dc9a57c5cb492f8b4cec206d64365b',
}

Для работы с новым датасетом: 
1) Сначала датасет загружается на ClearML
2) Добавляется в datasets_config название и айди датасета
3) Предобрабатывается и добавляется новой версией к старому датасету
4) Обработанный датасет добавляется в configs/datasets.yaml

In [7]:
# Dataset names only: iterating the dict yields its keys in insertion order.
datasets = list(datasets_config)

In [8]:
def fetch_dataset(dataset_name: str, max_workers: int = 8):
    """Fetch a ClearML dataset by name and load its CSV into a DataFrame.

    Args:
        dataset_name: Key of ``datasets_config``. The CSV inside the dataset
            is expected to be named ``<dataset_name>.csv``.
        max_workers: Forwarded to ``Dataset.get_local_copy``.

    Returns:
        A ``(data, metadata)`` tuple: the parsed CSV as a DataFrame and the
        value returned by ``Dataset.get_metadata()``.

    Raises:
        ValueError: If ``dataset_name`` is missing from ``datasets_config``
            or maps to an empty ID.
    """
    # .get() so that an unknown name reaches the explicit check below
    # instead of failing earlier with a bare KeyError.
    dataset_id = datasets_config.get(dataset_name)
    if not dataset_id:
        raise ValueError(
            f"Dataset ID not found for '{dataset_name}'. "
            f"Check the datasets_config mapping."
        )

    # Errors from ClearML or from CSV parsing propagate unchanged to the caller.
    dataset = Dataset.get(dataset_id=dataset_id)
    local_path = dataset.get_local_copy(max_workers=max_workers)
    data = pd.read_csv(Path(local_path) / f"{dataset_name}.csv")

    metadata = dataset.get_metadata()
    return data, metadata


In [9]:
# Single shared MinMaxScaler instance with default settings; it is re-fitted via fit_transform where used.
scaler = MinMaxScaler()

In [10]:
def general_imputer(X_clean, numerical_cols, categorical_cols):
    """Fill missing values: KNN imputation for numeric columns, mode for categorical.

    The input frame is not modified; a filled copy is returned.

    Args:
        X_clean: Feature DataFrame, possibly containing missing values.
        numerical_cols: Names of columns imputed together with
            ``KNNImputer(n_neighbors=5, weights='distance')``. May be empty.
        categorical_cols: Names of columns whose missing values are replaced
            by the column's most frequent value. May be empty.

    Returns:
        A new DataFrame with the same columns and index as ``X_clean``.
    """
    # Work on a copy: callers often pass a filtered slice of another frame,
    # and writing into it in place is unreliable (and mutates the argument).
    X_filled = X_clean.copy()

    # KNNImputer cannot be fitted on zero columns, so skip it when there are none.
    if len(numerical_cols) > 0:
        imputer = KNNImputer(n_neighbors=5, weights='distance')
        X_filled[numerical_cols] = imputer.fit_transform(X_filled[numerical_cols])

    for col in categorical_cols:
        modes = X_filled[col].mode()
        if modes.empty:
            # Column has no observed values at all: nothing to fill with.
            continue
        # Assign the result back instead of chained `fillna(inplace=True)`,
        # which does not update the frame under pandas Copy-on-Write.
        X_filled[col] = X_filled[col].fillna(modes.iloc[0])

    print(f"After imputation - Missing values: {X_filled.isnull().sum().sum()}")

    return X_filled

In [11]:
def handle_outliers(X, y, continuous_features, threshold=3.0):
    """Drop rows that are z-score outliers in any of the given columns.

    A row is removed when ``|z| >= threshold`` for at least one column of
    ``continuous_features``. Rows whose z-score is undefined (constant column,
    or a missing value in that column) are kept rather than treated as outliers.

    Args:
        X: Feature DataFrame.
        y: Labels aligned positionally with ``X``; must support boolean-mask indexing.
        continuous_features: Column names of ``X`` to test.
        threshold: Absolute z-score at or above which a value is an outlier.

    Returns:
        ``(X, y)`` unchanged when nothing is removed, otherwise the filtered pair.
    """
    keep_mask = np.ones(len(X), dtype=bool)

    for col in continuous_features:
        values = np.asarray(X[col], dtype=float)
        with np.errstate(invalid="ignore", divide="ignore"):
            # nan_policy="omit": a missing value no longer turns the whole
            # column's z-scores into NaN.
            z_scores = np.abs(stats.zscore(values, nan_policy="omit"))
            # Written as "not an outlier" so NaN z-scores (constant column,
            # missing value) compare False and the row is kept.
            keep_mask &= ~(z_scores >= threshold)

    if keep_mask.all():
        return X, y
    return X[keep_mask], y[keep_mask]

In [12]:
def get_continuous(X, numeric_cols, threshold_ratio=0.1):
    """Heuristically pick the numeric columns that look continuous.

    A column counts as continuous when its number of distinct non-null values
    divided by the number of rows is strictly greater than ``threshold_ratio``.
    The result is a heuristic and should be checked manually.

    Args:
        X: Feature DataFrame.
        numeric_cols: Candidate column names, evaluated in the given order.
        threshold_ratio: Minimum unique-to-rows ratio (exclusive).

    Returns:
        List of qualifying column names; empty when ``X`` has no rows.
    """
    n_rows = len(X)
    if n_rows == 0:
        # Avoid dividing by zero: with no rows nothing can be classified.
        return []

    return [
        col for col in numeric_cols
        if X[col].nunique() / n_rows > threshold_ratio
    ]

 Phoneme Dataset

In [13]:
# Load the 'Phoneme' dataset and its metadata through fetch_dataset, then print both.
phoneme_data, phoneme_metadata = fetch_dataset('Phoneme')
print(phoneme_data, phoneme_metadata)

         Aa     Ao    Dcl     Iy     Sh   Class
0     1.240  0.875 -0.205 -0.078  0.067       0
1     0.268  1.352  1.035 -0.332  0.217       0
2     1.567  0.867  1.300  1.041  0.559       0
3     0.279  0.990  2.555 -0.738  0.000       0
4     0.307  1.272  2.656 -0.946 -0.467       0
...     ...    ...    ...    ...    ...     ...
5399  0.254  2.392  0.689  1.828 -0.544       0
5400  0.781  1.250  0.793  0.383  0.816       1
5401  1.031  0.584  1.866  1.532 -0.671       1
5402  0.150  0.933  2.363 -0.742 -0.617       0
5403  0.137  0.714  1.350  0.972 -0.630       1

[5404 rows x 6 columns] {'classes': 2, 'features': 5, 'target': 'Class', 'total_samples': 5404}


In [14]:
# Split the frame into target and features.
# The column labels in this CSV carry a leading space, hence ' Class'.
y_phoneme = phoneme_data[' Class']
X_phoneme = phoneme_data.drop(columns=' Class')

print(f"\nMissing values:\n{X_phoneme.isnull().sum()}")

# Group the feature columns by dtype, then pick the continuous ones heuristically.
categorical_cols = list(X_phoneme.select_dtypes(include=['object']).columns)
numerical_cols = list(X_phoneme.select_dtypes(include=['float64', 'int64']).columns)
continuous_cols = get_continuous(X_phoneme, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {phoneme_metadata['features']}")

# The continuous-column list needs a manual check.
print("Continuous cols : ", continuous_cols)


Missing values:
Aa      0
 Ao     0
 Dcl    0
 Iy     0
 Sh     0
dtype: int64

Categorical: 0, Numerical: 5, All: 5
Continuous cols :  ['Aa', ' Ao', ' Dcl', ' Iy', ' Sh']


In [15]:
# Remove duplicated feature rows and keep the labels of the rows that survive.
X_phoneme_clean = X_phoneme.drop_duplicates()
y_phoneme_clean = y_phoneme.loc[X_phoneme_clean.index]
print(X_phoneme_clean.shape[0])

# Keep only rows whose share of missing features is below 40%.
missing_threshold = 0.4
valid_rows = X_phoneme_clean.isnull().mean(axis=1) < missing_threshold
X_phoneme_clean = X_phoneme_clean.loc[valid_rows]
y_phoneme_clean = y_phoneme_clean.loc[valid_rows]
print(X_phoneme_clean.shape[0])

5349
5349


In [16]:
# Fill in missing values (KNN for numeric columns, mode for categorical ones).
X_phoneme_clean = general_imputer(X_phoneme_clean, numerical_cols, categorical_cols)

# Drop the rows rejected by the z-score filter on the continuous columns.
X_phoneme_clean_out, y_phoneme_clean_out = handle_outliers(
    X_phoneme_clean, y_phoneme_clean, continuous_cols
)

# Renumber the rows from 0; the labels end up as a one-column DataFrame.
X_phoneme_clean_out = X_phoneme_clean_out.reset_index(drop=True)
y_phoneme_clean_out = y_phoneme_clean_out.reset_index(drop=True).to_frame()

print(pd.concat([X_phoneme_clean_out, y_phoneme_clean_out], axis=1))

After imputation - Missing values: 0
         Aa     Ao    Dcl     Iy     Sh   Class
0     1.240  0.875 -0.205 -0.078  0.067       0
1     0.268  1.352  1.035 -0.332  0.217       0
2     1.567  0.867  1.300  1.041  0.559       0
3     0.279  0.990  2.555 -0.738  0.000       0
4     0.307  1.272  2.656 -0.946 -0.467       0
...     ...    ...    ...    ...    ...     ...
5194  0.254  2.392  0.689  1.828 -0.544       0
5195  0.781  1.250  0.793  0.383  0.816       1
5196  1.031  0.584  1.866  1.532 -0.671       1
5197  0.150  0.933  2.363 -0.742 -0.617       0
5198  0.137  0.714  1.350  0.972 -0.630       1

[5199 rows x 6 columns]


Нет категориальных признаков

In [17]:
# Min-max scale the features and wrap the resulting array back into a DataFrame
# with the original column labels. The target is carried over unchanged.
X_phoneme_scaled = pd.DataFrame(
    scaler.fit_transform(X_phoneme_clean_out, y_phoneme_clean_out),
    columns=X_phoneme_clean_out.columns,
)
y_phoneme_scaled = y_phoneme_clean_out
print(X_phoneme_scaled.shape)

# Sanity check: global and per-feature extremes after scaling.
print(f"After scaling - Min: {X_phoneme_scaled.min().min():.2f}, Max: {X_phoneme_scaled.max().max():.2f}")
print(f"Feature ranges check:\n{X_phoneme_scaled.describe().loc[['min', 'max']]}")

(5199, 5)
After scaling - Min: 0.00, Max: 1.00
Feature ranges check:
      Aa   Ao   Dcl   Iy   Sh
min  0.0  0.0   0.0  0.0  0.0
max  1.0  1.0   1.0  1.0  1.0


In [18]:
# Name of the dataset and of the CSV file written to the current working directory.
dataset_name = 'Phoneme'
dataset_path = f'{dataset_name}.csv'

# Put the scaled features and the target side by side and save without the index column.
phoneme_data_new = pd.concat([X_phoneme_scaled, y_phoneme_scaled], axis='columns')
phoneme_data_new.to_csv(dataset_path, index=False)

In [19]:
# Create a new ClearML dataset under the same name and project, with the
# dataset ID from datasets_config as its parent.
dataset = Dataset.create(dataset_name=dataset_name,
                         dataset_project=dataset_project,
                         parent_datasets=[datasets_config[dataset_name]]) 

# NOTE(review): sync_folder is given the path of the CSV file, not of a directory;
# the captured log reports one uploaded file, but confirm this usage is intended.
dataset.sync_folder(
    local_path=dataset_path
)

# Finalize the dataset; auto_upload=True asks ClearML to upload pending files first.
dataset.finalize(auto_upload=True)

ClearML results page: https://app.clear.ml/projects/1aa8394c881d42449a51ab796db605dc/experiments/6128c53b1b4f400c9abe6ce4469549ff/output/log
ClearML dataset page: https://app.clear.ml/datasets/simple/1aa8394c881d42449a51ab796db605dc/experiments/6128c53b1b4f400c9abe6ce4469549ff


Could not repair dependency graph. Error is: 'NoneType' object has no attribute 'values'


Pending uploads, starting dataset upload to https://files.clear.ml
Uploading dataset changes (1 files compressed to 137.21 KiB) to https://files.clear.ml
File compression and upload completed: total size 137.21 KiB, 1 chunk(s) stored (average size 137.21 KiB)


True

Adult Dataset

In [20]:
# Load the 'Adult' dataset and its metadata through fetch_dataset, then print both.
adult_data, adult_metadata = fetch_dataset('Adult')
print(adult_data, adult_metadata)

       age          workclass   fnlwgt   education   education-num  \
0       39          State-gov    77516   Bachelors              13   
1       50   Self-emp-not-inc    83311   Bachelors              13   
2       38            Private   215646     HS-grad               9   
3       53            Private   234721        11th               7   
4       28            Private   338409   Bachelors              13   
...    ...                ...      ...         ...             ...   
48837   39            Private   215419   Bachelors              13   
48838   64                  ?   321403     HS-grad               9   
48839   38            Private   374983   Bachelors              13   
48840   44            Private    83891   Bachelors              13   
48841   35       Self-emp-inc   182148   Bachelors              13   

            martial-status          occupation     relationship  \
0            Never-married        Adm-clerical    Not-in-family   
1       Married-civ-spous

Haberman Dataset

In [21]:
# Load the 'Haberman' dataset and its metadata through fetch_dataset, then print both.
haberman_data, haberman_metadata = fetch_dataset('Haberman')
print(haberman_data, haberman_metadata)

      1   2   3   4
0    30  64   1   1
1    30  62   3   1
2    30  65   0   1
3    31  59   2   1
4    31  65   4   1
..   ..  ..  ..  ..
301  75  62   1   1
302  76  67   0   1
303  77  65   3   1
304  78  65   1   2
305  83  58   2   2

[306 rows x 4 columns] {'classes': 2, 'features': 3, 'target': '4', 'total_samples': 306}


Ionosphere Dataset

In [22]:
# Load the 'Ionosphere' dataset and its metadata through fetch_dataset, then print both.
Ionosphere_data, Ionosphere_metadata = fetch_dataset('Ionosphere')
print(Ionosphere_data, Ionosphere_metadata)

     a01  a02      a03      a04      a05      a06      a07      a08      a09  \
0      1    0  0.99539 -0.05889  0.85243  0.02306  0.83398 -0.37708  1.00000   
1      1    0  1.00000 -0.18829  0.93035 -0.36156 -0.10868 -0.93597  1.00000   
2      1    0  1.00000 -0.03365  1.00000  0.00485  1.00000 -0.12062  0.88965   
3      1    0  1.00000 -0.45161  1.00000  1.00000  0.71216 -1.00000  0.00000   
4      1    0  1.00000 -0.02401  0.94140  0.06531  0.92106 -0.23255  0.77152   
..   ...  ...      ...      ...      ...      ...      ...      ...      ...   
346    1    0  0.83508  0.08298  0.73739 -0.14706  0.84349 -0.05567  0.90441   
347    1    0  0.95113  0.00419  0.95183 -0.02723  0.93438 -0.01920  0.94590   
348    1    0  0.94701 -0.00034  0.93207 -0.03227  0.95177 -0.03431  0.95584   
349    1    0  0.90608 -0.01657  0.98122 -0.01989  0.95691 -0.03646  0.85746   
350    1    0  0.84710  0.13533  0.73638 -0.06151  0.87873  0.08260  0.88928   

         a10  ...      a26      a27    

Forest_coverage Dataset

In [23]:
# Load the 'Forest coverage' dataset and its metadata through fetch_dataset, then print both.
Forest_coverage_data, Forest_coverage_metadata = fetch_dataset('Forest coverage')
print(Forest_coverage_data, Forest_coverage_metadata)

        Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0            2596      51      3                               258   
1            2590      56      2                               212   
2            2804     139      9                               268   
3            2785     155     18                               242   
4            2595      45      2                               153   
...           ...     ...    ...                               ...   
581007       2396     153     20                                85   
581008       2391     152     19                                67   
581009       2386     159     17                                60   
581010       2384     170     15                                60   
581011       2383     165     13                                60   

        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                                    0                              510   
1        