In [1]:
#%pip install --upgrade clearml

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from clearml import Dataset

from sklearn.impute import KNNImputer
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
# Relative directory path; presumably where the raw data files live.
# NOTE(review): `filename` is not referenced in the visible part of the notebook — confirm it is still needed.
filename = 'data/raw/'

In [4]:
# Seed NumPy's and the standard library's global random generators with the same fixed value.
np.random.seed(42)
import random
random.seed(42)

Пайплайн предобработки данных:
1) Определение характеристик данных: типов данных
2) сведение к бинарной классификации
3) Очистка данных: удаление дубликатов, обработка пропущенных значений (k-NN Imputer для числовых признаков, заполнение модой для категориальных)
4) Детекция выбросов Z-score
5) Кодирование категориальных переменных: one-hot для номинальных; label для порядковых;
6) MinMax scaling

Для каждого датасета из списка используем пайплайн предобработки.

Если необходимо точное воспроизведение экспериментов из статьи, то необходима более тонкая предобработка. Поэтому была выбрана стратегия общей обработки данных для возможности оценивать различные методы на одинаковых датасетах.

Сведение к бинарной классификации выбрано для возможности сравнивать методы, так как не все методы могут работать с мультиклассовостью.

In [100]:
# ClearML project name passed to Dataset.create when a processed version is uploaded.
dataset_project = 'Smote Test Bench'

# Dataset name -> ClearML dataset ID (the ID is used by Dataset.get and as the parent
# of the processed version).
# NOTE(review): the trailing "+" / "?" markers are kept from the original; presumably they
# track which datasets have already been processed — confirm with the author.
datasets_config = {
    'Adult':           '77e07677143e49a39b4e4f4303099dde',  # +
    'Forest coverage': 'eae8452f680048eb81e62021ad94a151',  # +
    'Haberman':        'b32930d5c70e48e5a38ca75d0e6ed632',  # +
    'Ionosphere':      'f0c4d8df1c5443fcbd54fc7bc5d77ddc',  # +
    'Mammography':     '23f0b97e9adc4203829fbf9b9282ebca',  # +
    'Oil':             'e528061a1d304e5ab313f9e6ad80e2d1',  # +
    'Phoneme':         'aef842d63d154b13a02601e543fe8baf',  # +
    'Pima_Diabetes':   '18a64325a8304b5290c70a06b2c1cb68',  # +
    'Satimage':        '60d6c0e2e51b43929257b113e742f96f',  # +
    'Vehicle':         'a0dc9a57c5cb492f8b4cec206d64365b',  # ?
    'Abalone':         '5b7caf44a9a44fbbb26a3035e2b1644c',  # +
    'Ecoli':           'd4ad0d71a8f14702b6e0d29aa2ea518c',  # +
    'us_crime':        '3e622b482d124c2f8eeeab514f1ec376',  # +
    'yeast_ml8':       '9f34f42a22fc43029c21908ae262712c',
}

Для работы с новым датасетом: 
1) Сначала датасет загружается на ClearML
2) Добавляется в datasets_config название и айди датасета
3) Предобрабатывается и добавляется новой версией к старому датасету
4) Обработанный датасет добавляется в configs/datasets.yaml

In [6]:
# Names of all configured datasets, in declaration order.
datasets = [name for name in datasets_config]

In [7]:
def fetch_dataset(dataset_name: str, max_workers: int = 8):
    """Download a ClearML dataset and load its CSV file together with its metadata.

    Args:
        dataset_name: Key in ``datasets_config``; also the stem of the CSV file
            read from the local copy (``<dataset_name>.csv``).
        max_workers: Forwarded to ``Dataset.get_local_copy``.

    Returns:
        Tuple ``(data, metadata)``: the CSV parsed by ``pd.read_csv`` and the
        value returned by ``dataset.get_metadata()``.

    Raises:
        ValueError: If ``dataset_name`` has no non-empty ID in ``datasets_config``.
        Exception: Any error raised while fetching or reading the dataset is
            reported on stdout and re-raised unchanged.
    """
    # .get() lets an unknown name reach the explicit ValueError below instead
    # of failing earlier with a bare KeyError.
    dataset_id = datasets_config.get(dataset_name)

    if not dataset_id:
        raise ValueError(
            f"Dataset ID not found for '{dataset_name}'. "
            f"Check the datasets_config mapping."
        )

    try:
        dataset = Dataset.get(dataset_id=dataset_id)
        local_path = dataset.get_local_copy(max_workers=max_workers)
        data = pd.read_csv(Path(local_path) / f"{dataset_name}.csv")
        metadata = dataset.get_metadata()
    except Exception as e:
        # Say which dataset failed before propagating the original error.
        print(f"Failed to fetch dataset '{dataset_name}' (id={dataset_id}): {e}")
        raise

    return data, metadata


In [8]:
# One MinMaxScaler instance shared by all dataset sections; each section calls
# scaler.fit_transform on its own data.
scaler = MinMaxScaler()

In [9]:
def general_imputer(X_clean, numerical_cols, categorical_cols):
    """Fill missing values: k-NN imputation for numeric columns, mode for categorical ones.

    Args:
        X_clean: Feature DataFrame that may contain missing values.
        numerical_cols: Columns imputed jointly with ``KNNImputer``
            (5 neighbours, distance-weighted).
        categorical_cols: Columns whose missing entries are replaced by the
            column's first mode.

    Returns:
        A new DataFrame with the imputed values. The input frame is left
        untouched, so passing a slice of another frame does not write into it.
    """
    X_filled = X_clean.copy()

    # KNNImputer drops features that have no observed value at fit time, which
    # would make the assignment below fail on a shape mismatch; such columns are
    # skipped. The guard also covers an empty list of numeric columns.
    knn_cols = [col for col in numerical_cols if X_filled[col].notna().any()]
    if knn_cols:
        imputer = KNNImputer(n_neighbors=5, weights='distance')
        X_filled[knn_cols] = imputer.fit_transform(X_filled[knn_cols])

    for col in categorical_cols:
        modes = X_filled[col].mode()
        if modes.empty:
            # Column holds no observed value at all: there is nothing to fill with.
            continue
        X_filled[col] = X_filled[col].fillna(modes.iloc[0])

    print(f"After imputation - Missing values: {X_filled.isnull().sum().sum()}")

    return X_filled

In [10]:
def handle_outliers(X, y, continuous_features, threshold=3.0):
    """Drop rows whose absolute Z-score reaches ``threshold`` in any continuous column.

    Args:
        X: Feature DataFrame.
        y: Target aligned row-by-row with ``X``; filtered with the same mask.
        continuous_features: Columns of ``X`` to check.
        threshold: Rows are kept while ``|z| < threshold`` (default 3, the
            previously hard-coded value).

    Returns:
        ``(X, y)`` unchanged when no row is flagged, otherwise the filtered pair.
    """
    keep_mask = np.ones(len(X), dtype=bool)

    for col in continuous_features:
        values = np.asarray(X[col], dtype=float)
        # nan_policy='omit' keeps a single missing value from turning the whole
        # column's Z-scores into NaN.
        with np.errstate(invalid='ignore', divide='ignore'):
            z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
        # A NaN Z-score (constant column or missing entry) is not evidence of an
        # outlier. Comparing it with `<` alone is always False and used to drop
        # every row of the dataset.
        keep_mask &= np.isnan(z_scores) | (z_scores < threshold)

    if keep_mask.all():
        return X, y
    return X[keep_mask], y[keep_mask]

In [11]:
def get_continuous(X, numeric_cols, threshold_ratio=0.1):
    """Pick the numeric columns that look continuous.

    A column counts as continuous when its number of distinct non-null values
    divided by the number of rows exceeds ``threshold_ratio``.

    Args:
        X: Feature DataFrame.
        numeric_cols: Candidate column names.
        threshold_ratio: Minimum unique-to-rows ratio (default 0.1, the
            previously hard-coded value).

    Returns:
        List of column names, in the order given by ``numeric_cols``. An empty
        frame yields an empty list instead of a ZeroDivisionError.
    """
    n_rows = len(X)
    if n_rows == 0:
        return []

    return [
        col for col in numeric_cols
        if X[col].nunique() / n_rows > threshold_ratio
    ]

In [12]:
def upload_dataset(data, dataset_name, dataset_project):
    """Write ``data`` to ``<dataset_name>.csv`` and publish it as a child ClearML dataset.

    The CSV is written without the index to a relative path, i.e. into the
    current working directory. A new dataset is then created whose parent is
    the ID stored in ``datasets_config[dataset_name]``, synced with the CSV
    path and finalized with ``auto_upload=True``.
    """
    csv_path = f'{dataset_name}.csv'
    data.to_csv(csv_path, index=False)

    parent_id = datasets_config[dataset_name]
    new_version = Dataset.create(
        dataset_name=dataset_name,
        dataset_project=dataset_project,
        parent_datasets=[parent_id],
    )

    # NOTE(review): sync_folder receives the path of a single file here, not a folder — confirm this is intended.
    new_version.sync_folder(local_path=csv_path)
    new_version.finalize(auto_upload=True)

 Phoneme Dataset

In [13]:
# Load the Phoneme CSV and its metadata, then print both for a first look.
phoneme_data, phoneme_metadata = fetch_dataset('Phoneme')
print(phoneme_data, phoneme_metadata)

         Aa     Ao    Dcl     Iy     Sh   Class
0     1.240  0.875 -0.205 -0.078  0.067       0
1     0.268  1.352  1.035 -0.332  0.217       0
2     1.567  0.867  1.300  1.041  0.559       0
3     0.279  0.990  2.555 -0.738  0.000       0
4     0.307  1.272  2.656 -0.946 -0.467       0
...     ...    ...    ...    ...    ...     ...
5399  0.254  2.392  0.689  1.828 -0.544       0
5400  0.781  1.250  0.793  0.383  0.816       1
5401  1.031  0.584  1.866  1.532 -0.671       1
5402  0.150  0.933  2.363 -0.742 -0.617       0
5403  0.137  0.714  1.350  0.972 -0.630       1

[5404 rows x 6 columns] {'classes': 2, 'features': 5, 'target': 'Class', 'total_samples': 5404}


In [14]:
# Base view: separate the features from the target (the target column name carries a leading space)
y_phoneme = phoneme_data[' Class']
X_phoneme = phoneme_data.drop(columns=' Class')

print(f"\nMissing values:\n{X_phoneme.isnull().sum()}")

# Column groups by dtype; the continuous subset comes from get_continuous
numerical_cols = X_phoneme.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X_phoneme.select_dtypes(include=['object']).columns.tolist()
continuous_cols = get_continuous(X_phoneme, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {phoneme_metadata['features']}")

# Needs manual verification
print("Continuous cols : ", continuous_cols)


Missing values:
Aa      0
 Ao     0
 Dcl    0
 Iy     0
 Sh     0
dtype: int64

Categorical: 0, Numerical: 5, All: 5
Continuous cols :  ['Aa', ' Ao', ' Dcl', ' Iy', ' Sh']


In [15]:
# Remove duplicated feature rows and keep the labels of the surviving rows
X_phoneme_clean = X_phoneme.drop_duplicates()
y_phoneme_clean = y_phoneme.loc[X_phoneme_clean.index]
print(len(X_phoneme_clean))

# Keep only rows whose share of missing features is below 40%
missing_threshold = 0.4
missing_share = X_phoneme_clean.isnull().sum(axis=1) / X_phoneme_clean.shape[1]
valid_rows = missing_share < missing_threshold
X_phoneme_clean = X_phoneme_clean.loc[valid_rows]
y_phoneme_clean = y_phoneme_clean.loc[valid_rows]
print(len(X_phoneme_clean))

#print(pd.concat([X_phoneme_clean, y_phoneme_clean], axis=1))

5349
5349


In [16]:
# Fill missing values
X_phoneme_clean = general_imputer(X_phoneme_clean, numerical_cols, categorical_cols)

# Drop the rows that handle_outliers flags in any continuous column
X_phoneme_clean_out, y_phoneme_clean_out = handle_outliers(X_phoneme_clean, y_phoneme_clean, continuous_cols)

# Renumber the rows from 0; the target becomes a one-column DataFrame
X_phoneme_clean_out = X_phoneme_clean_out.reset_index(drop=True)
y_phoneme_clean_out = y_phoneme_clean_out.reset_index(drop=True).to_frame()

print(pd.concat([X_phoneme_clean_out, y_phoneme_clean_out], axis=1))

After imputation - Missing values: 0
         Aa     Ao    Dcl     Iy     Sh   Class
0     1.240  0.875 -0.205 -0.078  0.067       0
1     0.268  1.352  1.035 -0.332  0.217       0
2     1.567  0.867  1.300  1.041  0.559       0
3     0.279  0.990  2.555 -0.738  0.000       0
4     0.307  1.272  2.656 -0.946 -0.467       0
...     ...    ...    ...    ...    ...     ...
5194  0.254  2.392  0.689  1.828 -0.544       0
5195  0.781  1.250  0.793  0.383  0.816       1
5196  1.031  0.584  1.866  1.532 -0.671       1
5197  0.150  0.933  2.363 -0.742 -0.617       0
5198  0.137  0.714  1.350  0.972 -0.630       1

[5199 rows x 6 columns]


Нет категориальных признаков

In [17]:
# MinMax scale every feature; the target is carried over unchanged
X_phoneme_scaled = pd.DataFrame(
    scaler.fit_transform(X_phoneme_clean_out, y_phoneme_clean_out),
    columns=X_phoneme_clean_out.columns,
)
print(X_phoneme_scaled.shape)
y_phoneme_scaled = y_phoneme_clean_out

# Verify scaling
print(f"After scaling - Min: {X_phoneme_scaled.min().min():.2f}, Max: {X_phoneme_scaled.max().max():.2f}")
print(f"Feature ranges check:\n{X_phoneme_scaled.describe().loc[['min', 'max']]}")

(5199, 5)
After scaling - Min: 0.00, Max: 1.00
Feature ranges check:
      Aa   Ao   Dcl   Iy   Sh
min  0.0  0.0   0.0  0.0  0.0
max  1.0  1.0   1.0  1.0  1.0


In [18]:
# Put the scaled features and the target back into a single frame (columns side by side).
phoneme_data_new = pd.concat([X_phoneme_scaled, y_phoneme_scaled], axis=1)

In [19]:
# upload_dataset(phoneme_data_new, 'Phoneme', dataset_project)

Adult Dataset

In [20]:
# Load the Adult CSV and its metadata, then print both for a first look.
adult_data, adult_metadata = fetch_dataset('Adult')
print(adult_data, adult_metadata)

       age          workclass   fnlwgt   education   education-num  \
0       39          State-gov    77516   Bachelors              13   
1       50   Self-emp-not-inc    83311   Bachelors              13   
2       38            Private   215646     HS-grad               9   
3       53            Private   234721        11th               7   
4       28            Private   338409   Bachelors              13   
...    ...                ...      ...         ...             ...   
48837   39            Private   215419   Bachelors              13   
48838   64                  ?   321403     HS-grad               9   
48839   38            Private   374983   Bachelors              13   
48840   44            Private    83891   Bachelors              13   
48841   35       Self-emp-inc   182148   Bachelors              13   

            martial-status          occupation     relationship  \
0            Never-married        Adm-clerical    Not-in-family   
1       Married-civ-spous

In [21]:
# Base view: separate the features from the target (the target column name carries a leading space)
y_adult = adult_data[' class']
X_adult = adult_data.drop(columns=' class')

print(f"\nMissing values:\n{X_adult.isnull().sum()}")

# Column groups by dtype; the continuous subset comes from get_continuous
numerical_cols = X_adult.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X_adult.select_dtypes(include=['object']).columns.tolist()
continuous_cols = get_continuous(X_adult, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {adult_metadata['features']}")

# Needs manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

# capital-gain / capital-loss are added by hand on top of the automatic selection
continuous_cols.extend([' capital-gain', ' capital-loss'])
print("Continuous cols : ", continuous_cols)


Missing values:
age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 martial-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
dtype: int64

Categorical: 8, Numerical: 6, All: 14
Categorial cols :  [' workclass', ' education', ' martial-status', ' occupation', ' relationship', ' race', ' sex', ' native-country']
Numerical cols :  ['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss', ' hours-per-week']
Continuous cols :  [' fnlwgt', ' capital-gain', ' capital-loss']


In [22]:
# ' ?' is turned into NaN; label spellings that end with a dot are unified with the dot-less ones
X_adult = X_adult.replace(' ?', np.nan)
y_adult = y_adult.replace({' >50K.': ' >50K', ' <=50K.': ' <=50K'})

# Remove duplicated feature rows and keep the labels of the surviving rows
X_adult_clean = X_adult.drop_duplicates()
y_adult_clean = y_adult.loc[X_adult_clean.index]
print(len(X_adult_clean))

# Keep only rows whose share of missing features is below 40%
missing_threshold = 0.4
missing_share = X_adult_clean.isnull().sum(axis=1) / X_adult_clean.shape[1]
valid_rows = missing_share < missing_threshold
X_adult_clean = X_adult_clean.loc[valid_rows]
y_adult_clean = y_adult_clean.loc[valid_rows]
print(len(X_adult_clean))

#print(pd.concat([X_adult_clean, y_adult_clean], axis=1))

48785
48785


In [23]:
# Fill missing values
X_adult_clean = general_imputer(X_adult_clean, numerical_cols, categorical_cols)

# Outlier handling is left disabled for this dataset:
# X_adult_clean_out, y_adult_clean_out = handle_outliers(X_adult_clean, y_adult_clean, continuous_cols)
# Removing outliers looks questionable here because the continuous features contain a great many zero values

# Renumber the rows from 0; the target becomes a one-column DataFrame
X_adult_clean_out = X_adult_clean.reset_index(drop=True)
y_adult_clean_out = y_adult_clean.reset_index(drop=True).to_frame()

#print(pd.concat([X_adult_clean_out, y_adult_clean_out], axis=1))

After imputation - Missing values: 0


In [24]:
# Inspect the distinct target labels that remain after cleaning.
y_adult_clean.unique()

array([' <=50K', ' >50K'], dtype=object)

# label : workclass, education, occupation, class
# onehot : martial-status, relationship, race, sex 

In [25]:
# Label encoder reused (refit) for every label-encoded Adult column.
le = LabelEncoder()
# NOTE(review): `oh` is not used in the visible cells — the one-hot step calls pd.get_dummies instead.
oh = OneHotEncoder(sparse_output=False, drop='first')

In [26]:
# Integer-encode the selected feature columns; `le` is refit for each column
for ordinal_col in (' workclass', ' education', ' occupation'):
    X_adult_clean_out[ordinal_col] = le.fit_transform(X_adult_clean_out[ordinal_col])

# The target is encoded last, so `le` stays fitted on the class labels
y_adult_clean_out[' class'] = le.fit_transform(y_adult_clean_out[' class'])

# native-country is removed from the feature set
X_adult_clean_out = X_adult_clean_out.drop(columns=' native-country')

#print(pd.concat([X_adult_clean_out, y_adult_clean_out], axis=1))

In [27]:
# Columns expanded into one indicator column per category
one_hot_cols = [' martial-status', ' relationship', ' race', ' sex']

X_adult_clean_out = pd.get_dummies(X_adult_clean_out, columns=one_hot_cols)

#print(pd.concat([X_adult_clean_out, y_adult_clean_out], axis=1))

In [28]:
# Every column except the label-encoded ones goes through the scaler
scale_cols = [
    col for col in X_adult_clean_out.columns
    if col not in (' workclass', ' education', ' occupation')
]

In [29]:
# MinMax scale the selected columns
X_adult_scaled = pd.DataFrame(
    scaler.fit_transform(X_adult_clean_out[scale_cols], y_adult_clean_out),
    columns=scale_cols,
)
# Append the label-encoded columns as they are (not scaled)
X_adult_scaled = pd.concat(
    [X_adult_scaled, X_adult_clean_out[[' workclass', ' education', ' occupation']]],
    axis=1,
)
print(X_adult_scaled.shape)
y_adult_scaled = y_adult_clean_out

# Verify scaling
print(f"After scaling - Min: {X_adult_scaled.min().min():.2f}, Max: {X_adult_scaled.max().max():.2f}")
print(f"Feature ranges check:\n{X_adult_scaled.describe().loc[['min', 'max']]}")

(48785, 29)
After scaling - Min: 0.00, Max: 15.00
Feature ranges check:
     age   fnlwgt   education-num   capital-gain   capital-loss  \
min  0.0      0.0             0.0            0.0            0.0   
max  1.0      1.0             1.0            1.0            1.0   

      hours-per-week   martial-status_ Divorced  \
min              0.0                        0.0   
max              1.0                        1.0   

      martial-status_ Married-AF-spouse   martial-status_ Married-civ-spouse  \
min                                 0.0                                  0.0   
max                                 1.0                                  1.0   

      martial-status_ Married-spouse-absent  ...   race_ Amer-Indian-Eskimo  \
min                                     0.0  ...                        0.0   
max                                     1.0  ...                        1.0   

      race_ Asian-Pac-Islander   race_ Black   race_ Other   race_ White  \
min              

In [30]:
# Put the processed features and the target back into a single frame (columns side by side).
adult_data_new = pd.concat([X_adult_scaled, y_adult_scaled], axis=1)

#upload_dataset(adult_data_new, 'Adult', dataset_project)

Haberman Dataset

In [31]:
# Load the Haberman CSV and its metadata, then print both for a first look.
haberman_data, haberman_metadata = fetch_dataset('Haberman')
print(haberman_data, haberman_metadata)

      1   2   3   4
0    30  64   1   1
1    30  62   3   1
2    30  65   0   1
3    31  59   2   1
4    31  65   4   1
..   ..  ..  ..  ..
301  75  62   1   1
302  76  67   0   1
303  77  65   3   1
304  78  65   1   2
305  83  58   2   2

[306 rows x 4 columns] {'classes': 2, 'features': 3, 'target': '4', 'total_samples': 306}


In [32]:
# Base view: separate the features from the target (the target column name carries a leading space)
y_hab = haberman_data[' 4']
X_hab = haberman_data.drop(columns=' 4')

print(f"\nMissing values:\n{X_hab.isnull().sum()}")

# Column groups by dtype; the continuous subset comes from get_continuous
numerical_cols = X_hab.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X_hab.select_dtypes(include=['object']).columns.tolist()
continuous_cols = get_continuous(X_hab, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {haberman_metadata['features']}")

# Needs manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
1     0
 2    0
 3    0
dtype: int64

Categorical: 0, Numerical: 3, All: 3
Categorial cols :  []
Numerical cols :  ['1', ' 2', ' 3']
Continuous cols :  ['1', ' 3']


In [33]:
# Remove duplicated feature rows and keep the labels of the surviving rows
X_hab_clean = X_hab.drop_duplicates()
y_hab_clean = y_hab.loc[X_hab_clean.index]
print(len(X_hab_clean))

# Keep only rows whose share of missing features is below 35%
missing_threshold = 0.35
missing_share = X_hab_clean.isnull().sum(axis=1) / X_hab_clean.shape[1]
valid_rows = missing_share < missing_threshold
X_hab_clean = X_hab_clean.loc[valid_rows]
y_hab_clean = y_hab_clean.loc[valid_rows]
print(len(X_hab_clean))

#print(pd.concat([X_hab_clean, y_hab_clean], axis=1))

283
283


In [34]:
# Fill missing values
X_hab_clean = general_imputer(X_hab_clean, numerical_cols, categorical_cols)

# Renumber the rows from 0; the target becomes a one-column DataFrame
X_hab_clean_out = X_hab_clean.reset_index(drop=True)
y_hab_clean_out = y_hab_clean.reset_index(drop=True).to_frame()

# Recode the target values: 1 -> 0 and 2 -> 1
y_hab_clean_out = y_hab_clean_out.replace({1: 0, 2: 1})

print(pd.concat([X_hab_clean_out, y_hab_clean_out], axis=1))

After imputation - Missing values: 0
        1     2    3   4
0    30.0  64.0  1.0   0
1    30.0  62.0  3.0   0
2    30.0  65.0  0.0   0
3    31.0  59.0  2.0   0
4    31.0  65.0  4.0   0
..    ...   ...  ...  ..
278  75.0  62.0  1.0   0
279  76.0  67.0  0.0   0
280  77.0  65.0  3.0   0
281  78.0  65.0  1.0   1
282  83.0  58.0  2.0   1

[283 rows x 4 columns]


нет категориальных признаков

In [35]:
# MinMax scale every feature; the target is carried over unchanged
X_hab_scaled = pd.DataFrame(
    scaler.fit_transform(X_hab_clean_out, y_hab_clean_out),
    columns=X_hab_clean_out.columns,
)
print(X_hab_scaled.shape)
y_hab_scaled = y_hab_clean_out

# Verify scaling
print(f"After scaling - Min: {X_hab_scaled.min().min():.2f}, Max: {X_hab_scaled.max().max():.2f}")
print(f"Feature ranges check:\n{X_hab_scaled.describe().loc[['min', 'max']]}")

(283, 3)
After scaling - Min: 0.00, Max: 1.00
Feature ranges check:
       1    2    3
min  0.0  0.0  0.0
max  1.0  1.0  1.0


In [36]:
# Put the scaled features and the target back into a single frame (columns side by side).
hab_data_new = pd.concat([X_hab_scaled, y_hab_scaled], axis=1)

#upload_dataset(hab_data_new, 'Haberman', dataset_project)

Ionosphere Dataset

In [37]:
# Load the Ionosphere CSV and its metadata, then print both for a first look.
Ionosphere_data, Ionosphere_metadata = fetch_dataset('Ionosphere')
print(Ionosphere_data, Ionosphere_metadata)

     a01  a02      a03      a04      a05      a06      a07      a08      a09  \
0      1    0  0.99539 -0.05889  0.85243  0.02306  0.83398 -0.37708  1.00000   
1      1    0  1.00000 -0.18829  0.93035 -0.36156 -0.10868 -0.93597  1.00000   
2      1    0  1.00000 -0.03365  1.00000  0.00485  1.00000 -0.12062  0.88965   
3      1    0  1.00000 -0.45161  1.00000  1.00000  0.71216 -1.00000  0.00000   
4      1    0  1.00000 -0.02401  0.94140  0.06531  0.92106 -0.23255  0.77152   
..   ...  ...      ...      ...      ...      ...      ...      ...      ...   
346    1    0  0.83508  0.08298  0.73739 -0.14706  0.84349 -0.05567  0.90441   
347    1    0  0.95113  0.00419  0.95183 -0.02723  0.93438 -0.01920  0.94590   
348    1    0  0.94701 -0.00034  0.93207 -0.03227  0.95177 -0.03431  0.95584   
349    1    0  0.90608 -0.01657  0.98122 -0.01989  0.95691 -0.03646  0.85746   
350    1    0  0.84710  0.13533  0.73638 -0.06151  0.87873  0.08260  0.88928   

         a10  ...      a26      a27    

In [38]:
# Base view: separate the features from the target column
y_Ionosphere = Ionosphere_data['target']
X_Ionosphere = Ionosphere_data.drop(columns='target')

print(f"\nMissing values:\n{X_Ionosphere.isnull().sum()}")

# Column groups by dtype; the continuous subset comes from get_continuous
numerical_cols = X_Ionosphere.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X_Ionosphere.select_dtypes(include=['object']).columns.tolist()
continuous_cols = get_continuous(X_Ionosphere, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {Ionosphere_metadata['features']}")

# Needs manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
a01    0
a02    0
a03    0
a04    0
a05    0
a06    0
a07    0
a08    0
a09    0
a10    0
a11    0
a12    0
a13    0
a14    0
a15    0
a16    0
a17    0
a18    0
a19    0
a20    0
a21    0
a22    0
a23    0
a24    0
a25    0
a26    0
a27    0
a28    0
a29    0
a30    0
a31    0
a32    0
a33    0
a34    0
dtype: int64

Categorical: 0, Numerical: 34, All: 34
Categorial cols :  []
Numerical cols :  ['a01', 'a02', 'a03', 'a04', 'a05', 'a06', 'a07', 'a08', 'a09', 'a10', 'a11', 'a12', 'a13', 'a14', 'a15', 'a16', 'a17', 'a18', 'a19', 'a20', 'a21', 'a22', 'a23', 'a24', 'a25', 'a26', 'a27', 'a28', 'a29', 'a30', 'a31', 'a32', 'a33', 'a34']
Continuous cols :  ['a03', 'a04', 'a05', 'a06', 'a07', 'a08', 'a09', 'a10', 'a11', 'a12', 'a13', 'a14', 'a15', 'a16', 'a17', 'a18', 'a19', 'a20', 'a21', 'a22', 'a23', 'a24', 'a25', 'a26', 'a27', 'a28', 'a29', 'a30', 'a31', 'a32', 'a33', 'a34']


In [39]:
# Remove duplicated feature rows and keep the labels of the surviving rows
X_Ionosphere_clean = X_Ionosphere.drop_duplicates()
y_Ionosphere_clean = y_Ionosphere.loc[X_Ionosphere_clean.index]
print(len(X_Ionosphere_clean))

350


In [40]:
# Renumber the rows from 0: the old index is materialised as an 'index' column and then removed
X_Ionosphere_clean = X_Ionosphere_clean.reset_index().drop(columns='index')
y_Ionosphere_clean = y_Ionosphere_clean.reset_index().drop(columns='index')

In [41]:
# Encode the target labels: 'b' -> 0, 'g' -> 1.
# An explicit lookup through Series.map replaces DataFrame.replace here: swapping strings
# for integers with replace relies on silent object -> int downcasting, which pandas 2.2
# deprecates with a FutureWarning.
# Values other than 'b' / 'g' pass through unchanged, so re-running the cell is harmless.
ionosphere_labels = {'b': 0, 'g': 1}
y_Ionosphere_clean = y_Ionosphere_clean.apply(
    lambda column: column.map(lambda label: ionosphere_labels.get(label, label))
)

  y_Ionosphere_clean = y_Ionosphere_clean.replace('g', 1)


Нет категориальных признаков

In [42]:
# MinMax scale the continuous columns
X_Ionosphere_scaled = pd.DataFrame(
    scaler.fit_transform(X_Ionosphere_clean[continuous_cols], y_Ionosphere_clean),
    columns=continuous_cols,
)

# a01 and a02 are appended as they are (not scaled)
X_Ionosphere_scaled = pd.concat([X_Ionosphere_scaled, X_Ionosphere_clean[['a01', 'a02']]], axis=1)

y_Ionosphere_scaled = y_Ionosphere_clean

# Verify scaling
print(f"After scaling - Min: {X_Ionosphere_scaled.min().min():.2f}, Max: {X_Ionosphere_scaled.max().max():.2f}")
print(f"Feature ranges check:\n{X_Ionosphere_scaled.describe().loc[['min', 'max']]}")

After scaling - Min: 0.00, Max: 1.00
Feature ranges check:
     a03  a04  a05  a06  a07  a08  a09  a10  a11  a12  ...  a27  a28  a29  \
min  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
max  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  1.0  1.0  1.0   

     a30  a31  a32  a33  a34  a01  a02  
min  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
max  1.0  1.0  1.0  1.0  1.0  1.0  0.0  

[2 rows x 34 columns]


In [44]:
# Put the processed features and the target back into a single frame (columns side by side).
Ionosphere_data_new = pd.concat([X_Ionosphere_scaled, y_Ionosphere_scaled], axis=1)

#upload_dataset(Ionosphere_data_new, 'Ionosphere', dataset_project)

In [45]:
# Sanity check: print the feature index, the target index, and whether the two are identical.
print(X_Ionosphere_scaled.index)
print(y_Ionosphere_scaled.index)
print(X_Ionosphere_scaled.index.equals(y_Ionosphere_scaled.index))

RangeIndex(start=0, stop=350, step=1)
RangeIndex(start=0, stop=350, step=1)
True


Forest_coverage Dataset

In [46]:
# Load the Forest coverage CSV and its metadata, then print both for a first look.
Forest_coverage_data, Forest_coverage_metadata = fetch_dataset('Forest coverage')
print(Forest_coverage_data, Forest_coverage_metadata)

        Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0            2596      51      3                               258   
1            2590      56      2                               212   
2            2804     139      9                               268   
3            2785     155     18                               242   
4            2595      45      2                               153   
...           ...     ...    ...                               ...   
581007       2396     153     20                                85   
581008       2391     152     19                                67   
581009       2386     159     17                                60   
581010       2384     170     15                                60   
581011       2383     165     13                                60   

        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                                    0                              510   
1        

In [47]:
# Base view: separate the features from the target (the target column name carries a leading space)
y_Forest = Forest_coverage_data[' class']
X_Forest = Forest_coverage_data.drop(columns=' class')

print(f"\nMissing values:\n{X_Forest.isnull().sum()}")

# Column groups by dtype; the continuous subset comes from get_continuous
numerical_cols = X_Forest.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X_Forest.select_dtypes(include=['object']).columns.tolist()
continuous_cols = get_continuous(X_Forest, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {Forest_coverage_metadata['features']}")

# Needs manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area1                      0
Wilderness_Area2                      0
Wilderness_Area3                      0
Wilderness_Area4                      0
Soil_Type1                            0
Soil_Type2                            0
Soil_Type3                            0
Soil_Type4                            0
Soil_Type5                            0
Soil_Type6                            0
Soil_Type7                            0
Soil_Type8                            0
Soil_Type9                            0
Soil_Type10                           0
Soil_Type11            

In [48]:
# Remove duplicated feature rows and keep the labels of the surviving rows
X_Forest_clean = X_Forest.drop_duplicates()
y_Forest_clean = y_Forest.loc[X_Forest_clean.index]
print(len(X_Forest_clean))

#print(pd.concat([X_Forest_clean, y_Forest_clean], axis=1))

581012


In [49]:
# Renumber the rows from 0; the target becomes a one-column DataFrame
X_Forest_clean_out = X_Forest_clean.reset_index(drop=True)
y_Forest_clean_out = y_Forest_clean.reset_index(drop=True).to_frame()

# Reduce to a binary target: classes 1, 2 and 3 become 1, every other class becomes 0
y_Forest_clean_out = y_Forest_clean_out[' class'].map(lambda cover: 1 if cover in (1, 2, 3) else 0)

#print(pd.concat([X_Forest_clean_out, y_Forest_clean_out], axis=1))

In [50]:
# MinMax scale every feature; the target is carried over unchanged
X_Forest_scaled = pd.DataFrame(
    scaler.fit_transform(X_Forest_clean_out, y_Forest_clean_out),
    columns=X_Forest_clean.columns,
)
y_Forest_scaled = y_Forest_clean_out

# Verify scaling
print(f"After scaling - Min: {X_Forest_scaled.min().min():.2f}, Max: {X_Forest_scaled.max().max():.2f}")
print(f"Feature ranges check:\n{X_Forest_scaled.describe().loc[['min', 'max']]}")

After scaling - Min: 0.00, Max: 1.00
Feature ranges check:
     Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
min        0.0     0.0    0.0                               0.0   
max        1.0     1.0    1.0                               1.0   

     Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
min                             0.0                              0.0   
max                             1.0                              1.0   

     Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
min            0.0             0.0            0.0   
max            1.0             1.0            1.0   

     Horizontal_Distance_To_Fire_Points  ...  Soil_Type31  Soil_Type32  \
min                                 0.0  ...          0.0          0.0   
max                                 1.0  ...          1.0          1.0   

     Soil_Type33  Soil_Type34  Soil_Type35  Soil_Type36  Soil_Type37  \
min          0.0          0.0          0.0          0.0          0.

In [51]:
# Put the scaled features and the binary target back into a single frame (columns side by side).
Forest_data_new = pd.concat([X_Forest_scaled, y_Forest_scaled], axis=1)

#upload_dataset(Forest_data_new, 'Forest coverage', dataset_project)

Mammography Dataset

In [52]:
# Load the Mammography CSV and its metadata; only the data frame is printed here.
Mammography_data, Mammography_metadata = fetch_dataset('Mammography')
print(Mammography_data)

              1         2         3         4         5         6     7
0      0.230020  5.072578 -0.276061  0.832444 -0.377866  0.480322  '-1'
1      0.155491 -0.169390  0.670652 -0.859553 -0.377866 -0.945723  '-1'
2     -0.784415 -0.443654  5.674705 -0.859553 -0.377866 -0.945723  '-1'
3      0.546088  0.131415 -0.456387 -0.859553 -0.377866 -0.945723  '-1'
4     -0.102987 -0.394994 -0.140816  0.979703 -0.377866  1.013566  '-1'
...         ...       ...       ...       ...       ...       ...   ...
11178 -0.250012 -0.377300 -0.321142  1.269157  3.652984  1.092791   '1'
11179  0.281343 -0.417112 -0.366224  0.851010  2.789649  1.345700   '1'
11180  1.204988  1.763724 -0.501468  1.562408  6.489072  0.931294   '1'
11181  0.736644 -0.222474 -0.050653  1.509665  0.539269  1.315229   '1'
11182  0.177003 -0.191508 -0.501468  1.578864  7.750705  1.555951   '1'

[11183 rows x 7 columns]


In [53]:
# Base view: separate the features from the target (the target column name carries a leading space)
y_Mammography = Mammography_data[' 7']
X_Mammography = Mammography_data.drop(columns=' 7')

print(f"\nMissing values:\n{X_Mammography.isnull().sum()}")

# Column groups by dtype; the continuous subset comes from get_continuous
numerical_cols = X_Mammography.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X_Mammography.select_dtypes(include=['object']).columns.tolist()
continuous_cols = get_continuous(X_Mammography, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {Mammography_metadata['features']}")

# Needs manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
1     0
 2    0
 3    0
 4    0
 5    0
 6    0
dtype: int64

Categorical: 0, Numerical: 6, All: 6
Categorial cols :  []
Numerical cols :  ['1', ' 2', ' 3', ' 4', ' 5', ' 6']
Continuous cols :  ['1', ' 4', ' 5']


In [54]:
# Remove duplicated feature rows and keep the labels of the surviving rows
X_Mammography_clean = X_Mammography.drop_duplicates()
y_Mammography_clean = y_Mammography.loc[X_Mammography_clean.index]
print(len(X_Mammography_clean))

# Keep only rows whose share of missing features is below 35%
missing_threshold = 0.35
missing_share = X_Mammography_clean.isnull().sum(axis=1) / X_Mammography_clean.shape[1]
valid_rows = missing_share < missing_threshold
X_Mammography_clean = X_Mammography_clean.loc[valid_rows]
y_Mammography_clean = y_Mammography_clean.loc[valid_rows]
print(len(X_Mammography_clean))

#print(pd.concat([X_Mammography_clean, y_Mammography_clean], axis=1))

7848
7848


In [55]:
# Impute missing values (general_imputer is defined outside this section).
X_Mammography_clean = general_imputer(X_Mammography_clean, numerical_cols, categorical_cols)

# Renumber rows 0..n-1 on both features and target so they stay aligned.
# reset_index(drop=True) avoids creating and then dropping an 'index' column.
X_Mammography_clean_out = X_Mammography_clean.reset_index(drop=True)

# The raw labels are the quoted strings "'-1'" and "'1'". Map them explicitly to
# 0/1 and cast to int64: this avoids the pandas FutureWarning about silent
# downcasting in `replace`, and the cast fails loudly if an unexpected label
# appears (an unmapped label becomes NaN, which cannot be cast to int).
y_Mammography_clean_out = (
    y_Mammography_clean
    .reset_index(drop=True)
    .map({"'-1'": 0, "'1'": 1})
    .astype('int64')
    .to_frame()  # keep the one-column DataFrame shape the original cell produced
)

print(pd.concat([X_Mammography_clean_out, y_Mammography_clean_out], axis=1))

After imputation - Missing values: 0
             1         2         3         4         5         6   7
0     0.230020  5.072578 -0.276061  0.832444 -0.377866  0.480322   0
1     0.155491 -0.169390  0.670652 -0.859553 -0.377866 -0.945723   0
2    -0.784415 -0.443654  5.674705 -0.859553 -0.377866 -0.945723   0
3     0.546088  0.131415 -0.456387 -0.859553 -0.377866 -0.945723   0
4    -0.102987 -0.394994 -0.140816  0.979703 -0.377866  1.013566   0
...        ...       ...       ...       ...       ...       ...  ..
7843 -0.250012 -0.377300 -0.321142  1.269157  3.652984  1.092791   1
7844  0.281343 -0.417112 -0.366224  0.851010  2.789649  1.345700   1
7845  1.204988  1.763724 -0.501468  1.562408  6.489072  0.931294   1
7846  0.736644 -0.222474 -0.050653  1.509665  0.539269  1.315229   1
7847  0.177003 -0.191508 -0.501468  1.578864  7.750705  1.555951   1

[7848 rows x 7 columns]


  y_Mammography_clean_out = y_Mammography_clean_out.replace("'1'", 1)


In [56]:
# MinMax scale the features (`scaler` is created outside this section) and
# rebuild a DataFrame that carries the original feature names.
X_Mammography_scaled = pd.DataFrame(
    scaler.fit_transform(X_Mammography_clean_out, y_Mammography_clean_out),
    columns=X_Mammography_clean.columns,
)
# The target is not scaled; it is passed through under the "_scaled" name.
y_Mammography_scaled = y_Mammography_clean_out

# Sanity check: report the global and per-feature min/max after scaling.
print(f"After scaling - Min: {X_Mammography_scaled.min().min():.2f}, Max: {X_Mammography_scaled.max().max():.2f}")
print(f"Feature ranges check:\n{X_Mammography_scaled.describe().loc[['min', 'max']]}")

After scaling - Min: 0.00, Max: 1.00
Feature ranges check:
       1    2    3    4    5    6
min  0.0  0.0  0.0  0.0  0.0  0.0
max  1.0  1.0  1.0  1.0  1.0  1.0


In [57]:
# Join the scaled features and the recoded target column-wise into the final table.
Mammography_data_new = pd.concat([X_Mammography_scaled, y_Mammography_scaled], axis=1)

# Upload is currently disabled; uncomment to send the processed table via upload_dataset.
# upload_dataset(Mammography_data_new, 'Mammography', dataset_project)

Oil Dataset

In [58]:
# Load the Oil table and its metadata (fetch_dataset is defined outside this
# section), then show the frame as the cell output.
Oil_data, Oil_metadata = fetch_dataset('Oil')
Oil_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,target
0,1.0,2558.0,1506.09,456.63,90.0,6395000.0,40.88,7.89,29780.0,0.19,...,2850.00,1000.00,763.16,135.46,3.73,0.0,33243.19,65.74,7.95,1
1,2.0,22325.0,79.11,841.03,180.0,55812500.0,51.11,1.21,61900.0,0.02,...,5750.00,11500.00,9593.48,1648.80,0.60,0.0,51572.04,65.73,6.26,-1
2,3.0,115.0,1449.85,608.43,88.0,287500.0,40.42,7.34,3340.0,0.18,...,1400.00,250.00,150.00,45.13,9.33,1.0,31692.84,65.81,7.84,1
3,4.0,1201.0,1562.53,295.65,66.0,3002500.0,42.40,7.97,18030.0,0.19,...,6041.52,761.58,453.21,144.97,13.33,1.0,37696.21,65.67,8.07,1
4,5.0,312.0,950.27,440.86,37.0,780000.0,41.43,7.03,3350.0,0.17,...,1320.04,710.63,512.54,109.16,2.58,0.0,29038.17,65.66,7.35,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
932,200.0,12.0,92.42,364.42,135.0,97200.0,59.42,10.34,884.0,0.17,...,381.84,254.56,84.85,146.97,4.50,0.0,2593.50,65.85,6.39,-1
933,201.0,11.0,98.82,248.64,159.0,89100.0,59.64,10.18,831.0,0.17,...,284.60,180.00,150.00,51.96,1.90,0.0,4361.25,65.70,6.53,-1
934,202.0,14.0,25.14,428.86,24.0,113400.0,60.14,17.94,847.0,0.30,...,402.49,180.00,180.00,0.00,2.24,0.0,2153.05,65.91,6.12,-1
935,203.0,10.0,96.00,451.30,68.0,81000.0,59.90,15.01,831.0,0.25,...,402.49,180.00,90.00,73.48,4.47,0.0,2421.43,65.97,6.32,-1


In [59]:
# Base view: separate the Oil features from the 'target' column.
y_Oil = Oil_data['target']
X_Oil = Oil_data.drop(columns='target')

print(f"\nMissing values:\n{X_Oil.isna().sum()}")

# Group the feature columns by dtype; get_continuous (defined outside this
# section) picks the continuous subset of the numerical columns.
categorical_cols = list(X_Oil.select_dtypes(include='object').columns)
numerical_cols = list(X_Oil.select_dtypes(include=['float64', 'int64']).columns)
continuous_cols = get_continuous(X_Oil, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {Oil_metadata['features']}")

# Requires manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
dtype: int64

Categorical: 0, Numerical: 49, All: 49
Categorial cols :  []
Numerical cols :  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48']
Continuous cols :  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '10', '13', '16', '17', '18', '27', '28', '29', '30', '34', '35', '37', '40', '41', '42', '43', '44', '46', '47', '48']


In [60]:
# Drop duplicated feature rows. The targets must be selected with the index of
# the de-duplicated frame (X_Oil_clean), not the original X_Oil; otherwise y
# keeps the rows that were just removed from X and the two go out of sync.
X_Oil_clean = X_Oil.drop_duplicates()
y_Oil_clean = y_Oil.loc[X_Oil_clean.index]
print(len(X_Oil_clean))

# Drop rows in which 35% or more of the features are missing.
missing_threshold = 0.35
valid_rows = (X_Oil_clean.isnull().sum(axis=1) / X_Oil_clean.shape[1]) < missing_threshold
X_Oil_clean = X_Oil_clean[valid_rows]
y_Oil_clean = y_Oil_clean[valid_rows]
print(len(X_Oil_clean))

937
937


In [61]:
# Inspect the distinct target labels before recoding them.
y_Oil_clean.unique()

array([ 1, -1])

In [62]:
# Recode the target from {-1, 1} to {0, 1}: -1 becomes 0, 1 is left as is.
y_Oil_clean_out = y_Oil_clean.replace({-1: 0})

In [63]:
# Confirm that only the recoded labels remain.
y_Oil_clean_out.unique()

array([1, 0])

In [64]:
# MinMax scale the features (`scaler` is created outside this section) and
# rebuild a DataFrame that carries the original feature names.
X_Oil_scaled = scaler.fit_transform(X_Oil_clean, y_Oil_clean_out)
X_Oil_scaled = pd.DataFrame(X_Oil_scaled, columns=X_Oil_clean.columns)
# The rebuilt frame has a fresh 0..n-1 index. Give the target the same index,
# otherwise a later pd.concat(axis=1) aligns on mismatched labels and produces
# NaN / shifted rows as soon as any row was dropped during cleaning.
y_Oil_scaled = y_Oil_clean_out.reset_index(drop=True)
assert len(X_Oil_scaled) == len(y_Oil_scaled), "features and target differ in length"

# Verify scaling
print(f"After scaling - Min: {X_Oil_scaled.min().min():.2f}, Max: {X_Oil_scaled.max().max():.2f}")
print(f"Feature ranges check:\n{X_Oil_scaled.describe().loc[['min', 'max']]}")

After scaling - Min: 0.00, Max: 1.00
Feature ranges check:
       0    1    2    3    4    5    6    7    8    9  ...   39   40   41  \
min  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
max  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  1.0  1.0  1.0   

      42   43   44   45   46   47   48  
min  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
max  1.0  1.0  1.0  1.0  1.0  1.0  1.0  

[2 rows x 49 columns]


In [65]:
# Join the scaled features and the recoded target column-wise into the final table.
Oil_data_new = pd.concat([X_Oil_scaled, y_Oil_scaled], axis=1)

# Send the processed table under the name 'Oil' to the project named by dataset_project
# (upload_dataset is defined outside this section).
upload_dataset(Oil_data_new, 'Oil', dataset_project)

ClearML results page: https://app.clear.ml/projects/cef1181379804a578be0cf3d239d874f/experiments/b41468a42e4e4066bd6e90600f7cde77/output/log
ClearML dataset page: https://app.clear.ml/datasets/simple/cef1181379804a578be0cf3d239d874f/experiments/b41468a42e4e4066bd6e90600f7cde77


Could not repair dependency graph. Error is: 'NoneType' object has no attribute 'values'


Pending uploads, starting dataset upload to https://files.clear.ml
Uploading dataset changes (1 files compressed to 209.86 KiB) to https://files.clear.ml
File compression and upload completed: total size 209.86 KiB, 1 chunk(s) stored (average size 209.86 KiB)


Pima diabetes Dataset

In [66]:
# Load the Pima diabetes table and its metadata (fetch_dataset is defined outside
# this section), then show the frame as the cell output.
Pima_data, Pima_metadata = fetch_dataset('Pima_Diabetes')
Pima_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [67]:
# Base view: separate the Pima features from the 'Outcome' target column.
y_Pima = Pima_data['Outcome']
X_Pima = Pima_data.drop(columns='Outcome')

print(f"\nMissing values:\n{X_Pima.isna().sum()}")

# Group the feature columns by dtype; get_continuous (defined outside this
# section) picks the continuous subset of the numerical columns.
categorical_cols = list(X_Pima.select_dtypes(include='object').columns)
numerical_cols = list(X_Pima.select_dtypes(include=['float64', 'int64']).columns)
continuous_cols = get_continuous(X_Pima, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {Pima_metadata['features']}")

# Requires manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

Categorical: 0, Numerical: 8, All: 9
Categorial cols :  []
Numerical cols :  ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Continuous cols :  ['Glucose', 'Insulin', 'BMI', 'DiabetesPedigreeFunction']


In [68]:
# Drop duplicated feature rows. The targets must be selected with the index of
# the de-duplicated frame (X_Pima_clean), not the original X_Pima; otherwise y
# keeps the rows that were just removed from X and the two go out of sync.
X_Pima_clean = X_Pima.drop_duplicates()
y_Pima_clean = y_Pima.loc[X_Pima_clean.index]
print(len(X_Pima_clean))

# Drop rows in which 35% or more of the features are missing.
missing_threshold = 0.35
valid_rows = (X_Pima_clean.isnull().sum(axis=1) / X_Pima_clean.shape[1]) < missing_threshold
X_Pima_clean = X_Pima_clean[valid_rows]
y_Pima_clean = y_Pima_clean[valid_rows]
print(len(X_Pima_clean))

768
768


In [69]:
# 'Outcome' is already encoded as 0/1, so no recoding is needed.
# The previous version computed replace(1, 0) and then threw that result away by
# restarting from y_Pima_clean with replace(-1, 1), which is a no-op on 0/1 data;
# actually chaining the two calls would have erased the positive class.
y_Pima_clean_out = y_Pima_clean.copy()
assert set(y_Pima_clean_out.unique()) <= {0, 1}, "Outcome is expected to be binary 0/1"

In [70]:
# MinMax scale the features (`scaler` is created outside this section) and
# rebuild a DataFrame that carries the original feature names.
X_Pima_scaled = scaler.fit_transform(X_Pima_clean, y_Pima_clean_out)
X_Pima_scaled = pd.DataFrame(X_Pima_scaled, columns=X_Pima_clean.columns)
# The rebuilt frame has a fresh 0..n-1 index. Give the target the same index,
# otherwise a later pd.concat(axis=1) aligns on mismatched labels and produces
# NaN / shifted rows as soon as any row was dropped during cleaning.
y_Pima_scaled = y_Pima_clean_out.reset_index(drop=True)
assert len(X_Pima_scaled) == len(y_Pima_scaled), "features and target differ in length"

# Verify scaling
print(f"After scaling - Min: {X_Pima_scaled.min().min():.2f}, Max: {X_Pima_scaled.max().max():.2f}")
print(f"Feature ranges check:\n{X_Pima_scaled.describe().loc[['min', 'max']]}")

After scaling - Min: 0.00, Max: 1.00
Feature ranges check:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  BMI  \
min          0.0      0.0            0.0            0.0      0.0  0.0   
max          1.0      1.0            1.0            1.0      1.0  1.0   

     DiabetesPedigreeFunction  Age  
min                       0.0  0.0  
max                       1.0  1.0  


In [71]:
# Join the scaled features and the target column-wise into the final table.
Pima_data_new = pd.concat([X_Pima_scaled, y_Pima_scaled], axis=1)

# Upload is currently disabled; uncomment to send the processed table via upload_dataset.
#upload_dataset(Pima_data_new, 'Pima Diabetes', dataset_project)

Satimage Dataset

In [72]:
# Load the Satimage table and its metadata (fetch_dataset is defined outside this
# section) and print the frame as plain text.
Satimage_data, Satimage_metadata = fetch_dataset('Satimage')
print(Satimage_data)

         0      1      2     3     4      5      6     7     8      9  ...  \
0     92.0  115.0  120.0  94.0  84.0  102.0  106.0  79.0  84.0  102.0  ...   
1     84.0  102.0  106.0  79.0  84.0  102.0  102.0  83.0  80.0  102.0  ...   
2     84.0  102.0  102.0  83.0  80.0  102.0  102.0  79.0  84.0   94.0  ...   
3     80.0  102.0  102.0  79.0  84.0   94.0  102.0  79.0  80.0   94.0  ...   
4     84.0   94.0  102.0  79.0  80.0   94.0   98.0  76.0  80.0  102.0  ...   
...    ...    ...    ...   ...   ...    ...    ...   ...   ...    ...  ...   
6430  60.0   83.0   96.0  85.0  64.0   87.0  100.0  88.0  64.0   83.0  ...   
6431  64.0   79.0  100.0  85.0  56.0   71.0   96.0  85.0  56.0   68.0  ...   
6432  56.0   68.0   91.0  81.0  56.0   64.0   91.0  81.0  53.0   64.0  ...   
6433  56.0   68.0   87.0  74.0  60.0   71.0   91.0  81.0  60.0   64.0  ...   
6434  60.0   71.0   91.0  81.0  60.0   64.0  104.0  99.0  56.0   64.0  ...   

         27    28     29     30     31    32     33     34    3

In [73]:
# Base view: separate the Satimage features from the 'target' column.
y_Satimage = Satimage_data['target']
X_Satimage = Satimage_data.drop(columns='target')

print(f"\nMissing values:\n{X_Satimage.isna().sum()}")

# Group the feature columns by dtype; get_continuous (defined outside this
# section) picks the continuous subset of the numerical columns.
categorical_cols = list(X_Satimage.select_dtypes(include='object').columns)
numerical_cols = list(X_Satimage.select_dtypes(include=['float64', 'int64']).columns)
continuous_cols = get_continuous(X_Satimage, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {Satimage_metadata['features']}")

# Requires manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
dtype: int64

Categorical: 0, Numerical: 36, All: 36
Categorial cols :  []
Numerical cols :  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35']
Continuous cols :  []


In [74]:
# Inspect the distinct target labels before recoding them.
y_Satimage.unique()

array([-1,  1])

In [75]:
# Drop duplicated feature rows. The targets must be selected with the index of
# the de-duplicated frame (X_Satimage_clean), not the original X_Satimage;
# otherwise y keeps the rows that were just removed from X and the two go out of sync.
X_Satimage_clean = X_Satimage.drop_duplicates()
y_Satimage_clean = y_Satimage.loc[X_Satimage_clean.index]
print(len(X_Satimage_clean))

# Drop rows in which 35% or more of the features are missing.
missing_threshold = 0.35
valid_rows = (X_Satimage_clean.isnull().sum(axis=1) / X_Satimage_clean.shape[1]) < missing_threshold
X_Satimage_clean = X_Satimage_clean[valid_rows]
y_Satimage_clean = y_Satimage_clean[valid_rows]
print(len(X_Satimage_clean))

6435
6435


In [76]:
# Recode the target from {-1, 1} to {0, 1}: -1 becomes 0, 1 is left as is.
y_Satimage_clean_out = y_Satimage_clean.replace({-1: 0})

In [77]:
# MinMax scale the features (`scaler` is created outside this section) and
# rebuild a DataFrame that carries the original feature names.
X_Satimage_scaled = scaler.fit_transform(X_Satimage_clean, y_Satimage_clean_out)
X_Satimage_scaled = pd.DataFrame(X_Satimage_scaled, columns=X_Satimage_clean.columns)
# The rebuilt frame has a fresh 0..n-1 index. Give the target the same index,
# otherwise a later pd.concat(axis=1) aligns on mismatched labels and produces
# NaN / shifted rows as soon as any row was dropped during cleaning.
y_Satimage_scaled = y_Satimage_clean_out.reset_index(drop=True)
assert len(X_Satimage_scaled) == len(y_Satimage_scaled), "features and target differ in length"

# Verify scaling
print(f"After scaling - Min: {X_Satimage_scaled.min().min():.2f}, Max: {X_Satimage_scaled.max().max():.2f}")
print(f"Feature ranges check:\n{X_Satimage_scaled.describe().loc[['min', 'max']]}")

After scaling - Min: 0.00, Max: 1.00
Feature ranges check:
       0    1    2    3    4    5    6    7    8    9  ...   26   27   28  \
min  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
max  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  1.0  1.0  1.0   

      29   30   31   32   33   34   35  
min  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
max  1.0  1.0  1.0  1.0  1.0  1.0  1.0  

[2 rows x 36 columns]


In [78]:
# Join the scaled features and the recoded target column-wise into the final table.
Satimage_data_new = pd.concat([X_Satimage_scaled, y_Satimage_scaled], axis=1)

# Upload is currently disabled; uncomment to send the processed table via upload_dataset.
#upload_dataset(Satimage_data_new, 'Satimage', dataset_project)

Abalone dataset

In [79]:
# Load the Abalone table and its metadata (fetch_dataset is defined outside this
# section), then show the frame as the cell output.
Abalone_data, Abalone_metadata = fetch_dataset('Abalone')
Abalone_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,target
0,0.0,0.0,1.0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,-1
1,0.0,0.0,1.0,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,1
2,1.0,0.0,0.0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,-1
3,0.0,0.0,1.0,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,-1
4,0.0,1.0,0.0,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,1
...,...,...,...,...,...,...,...,...,...,...,...
4172,1.0,0.0,0.0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,-1
4173,0.0,0.0,1.0,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,-1
4174,0.0,0.0,1.0,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,-1
4175,1.0,0.0,0.0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,-1


In [80]:
# Base view: separate the Abalone features from the 'target' column.
y_Abalone = Abalone_data['target']
X_Abalone = Abalone_data.drop(columns='target')

print(f"\nMissing values:\n{X_Abalone.isna().sum()}")

# Group the feature columns by dtype; get_continuous (defined outside this
# section) picks the continuous subset of the numerical columns.
categorical_cols = list(X_Abalone.select_dtypes(include='object').columns)
numerical_cols = list(X_Abalone.select_dtypes(include=['float64', 'int64']).columns)
continuous_cols = get_continuous(X_Abalone, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {Abalone_metadata['features']}")

# Requires manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

Categorical: 0, Numerical: 10, All: 10
Categorial cols :  []
Numerical cols :  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Continuous cols :  ['6', '7', '8', '9']


In [81]:
# Drop duplicated feature rows. The targets must be selected with the index of
# the de-duplicated frame (X_Abalone_clean), not the original X_Abalone;
# otherwise y keeps the removed rows and the later column-wise concat
# re-introduces them as all-NaN feature rows.
X_Abalone_clean = X_Abalone.drop_duplicates()
y_Abalone_clean = y_Abalone.loc[X_Abalone_clean.index]
print(len(X_Abalone_clean))

4177


In [82]:
# Recode the target from {-1, 1} to {0, 1}: -1 becomes 0, 1 is left as is,
# matching the Oil, Satimage, us_crime and yeast cells of this notebook.
# The previous replace(1, 0) followed by replace(-1, 1) swapped the two classes
# and was not safe to re-run: a second execution turned every label into 0.
# A single replace(-1, 0) gives the same result however often the cell is run.
y_Abalone_clean = y_Abalone_clean.replace(-1, 0)

In [83]:
# Join the cleaned features and the recoded target column-wise into the final table
# (no scaling step is applied to this dataset in this section).
Abalone_data_new = pd.concat([X_Abalone_clean, y_Abalone_clean], axis=1)

# Upload is currently disabled; uncomment to send the processed table via upload_dataset.
#upload_dataset(Abalone_data_new, 'Abalone', dataset_project)

 Ecoli dataset

In [84]:
# Load the Ecoli table and its metadata (fetch_dataset is defined outside this
# section), then show the frame as the cell output.
Ecoli_data, Ecoli_metadata = fetch_dataset('Ecoli')
Ecoli_data

Unnamed: 0,0,1,2,3,4,5,6,target
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,-1
1,0.07,0.40,0.48,0.5,0.54,0.35,0.44,-1
2,0.56,0.40,0.48,0.5,0.49,0.37,0.46,-1
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,-1
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,-1
...,...,...,...,...,...,...,...,...
331,0.74,0.56,0.48,0.5,0.47,0.68,0.30,-1
332,0.71,0.57,0.48,0.5,0.48,0.35,0.32,-1
333,0.61,0.60,0.48,0.5,0.44,0.39,0.38,-1
334,0.59,0.61,0.48,0.5,0.42,0.42,0.37,-1


In [85]:
# Base view: separate the Ecoli features from the 'target' column.
y_Ecoli = Ecoli_data['target']
X_Ecoli = Ecoli_data.drop(columns='target')

print(f"\nMissing values:\n{X_Ecoli.isna().sum()}")

# Group the feature columns by dtype; get_continuous (defined outside this
# section) picks the continuous subset of the numerical columns.
categorical_cols = list(X_Ecoli.select_dtypes(include='object').columns)
numerical_cols = list(X_Ecoli.select_dtypes(include=['float64', 'int64']).columns)
continuous_cols = get_continuous(X_Ecoli, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}, All: {Ecoli_metadata['features']}")

# Requires manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
0    0
1    0
2    0
3    0
4    0
5    0
6    0
dtype: int64

Categorical: 0, Numerical: 7, All: 7
Categorial cols :  []
Numerical cols :  ['0', '1', '2', '3', '4', '5', '6']
Continuous cols :  ['0', '1', '4', '5', '6']


In [86]:
# Drop duplicated feature rows. The targets must be selected with the index of
# the de-duplicated frame (X_Ecoli_clean), not the original X_Ecoli;
# otherwise y keeps the removed rows and the later column-wise concat
# re-introduces them as all-NaN feature rows.
X_Ecoli_clean = X_Ecoli.drop_duplicates()
y_Ecoli_clean = y_Ecoli.loc[X_Ecoli_clean.index]
print(len(X_Ecoli_clean))

336


In [87]:
# Recode the target from {-1, 1} to {0, 1}: -1 becomes 0, 1 is left as is,
# matching the Oil, Satimage, us_crime and yeast cells of this notebook.
# The previous replace(1, 0) followed by replace(-1, 1) swapped the two classes
# and was not safe to re-run: a second execution turned every label into 0.
# A single replace(-1, 0) gives the same result however often the cell is run.
y_Ecoli_clean = y_Ecoli_clean.replace(-1, 0)

In [88]:
# Join the cleaned features and the recoded target column-wise into the final table
# (no scaling step is applied to this dataset in this section).
Ecoli_data_new = pd.concat([X_Ecoli_clean, y_Ecoli_clean], axis=1)

# Upload is currently disabled; uncomment to send the processed table via upload_dataset.
#upload_dataset(Ecoli_data_new, 'Ecoli', dataset_project)

us_crime dataset

In [89]:
# Load the us_crime table and its metadata (fetch_dataset is defined outside this
# section), then show the frame as the cell output.
US_data, US_metadata = fetch_dataset('us_crime')
US_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,target
0,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.42,0.50,0.51,0.64,0.12,0.26,0.20,0.32,-1
1,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.21,0.50,0.34,0.60,0.52,0.02,0.12,0.45,0.00,1
2,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.00,-1
3,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,0.34,0.21,...,0.19,0.30,0.73,0.64,0.65,0.02,0.39,0.28,0.00,-1
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.00,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,0.01,0.40,0.10,0.87,0.12,0.16,0.43,0.51,0.35,0.30,...,0.22,0.28,0.34,0.48,0.39,0.01,0.28,0.05,0.00,-1
1990,0.05,0.96,0.46,0.28,0.83,0.32,0.69,0.86,0.73,0.14,...,0.53,0.25,0.17,0.10,0.00,0.02,0.37,0.20,0.00,-1
1991,0.16,0.37,0.25,0.69,0.04,0.25,0.35,0.50,0.31,0.54,...,0.25,0.68,0.61,0.79,0.76,0.08,0.32,0.18,0.91,-1
1992,0.08,0.51,0.06,0.87,0.22,0.10,0.58,0.74,0.63,0.41,...,0.45,0.64,0.54,0.59,0.52,0.03,0.38,0.33,0.22,-1


In [90]:
# Base view: separate the us_crime features from the 'target' column.
y_US = US_data['target']
X_US = US_data.drop(columns='target')

print(f"\nMissing values:\n{X_US.isna().sum()}")

# Group the feature columns by dtype; get_continuous (defined outside this
# section) picks the continuous subset of the numerical columns.
categorical_cols = list(X_US.select_dtypes(include='object').columns)
numerical_cols = list(X_US.select_dtypes(include=['float64', 'int64']).columns)
continuous_cols = get_continuous(X_US, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}")

# Requires manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Length: 100, dtype: int64

Categorical: 0, Numerical: 100
Categorial cols :  []
Numerical cols :  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99']
Continuous cols :  []


In [94]:
# Recode the target from {-1, 1} to {0, 1}: -1 becomes 0, 1 is left as is.
y_US_clean = y_US.replace({-1: 0})

In [96]:
# Confirm that only the recoded labels remain.
y_US_clean.unique()

array([0, 1])

In [None]:
# Join the raw features (no de-duplication or scaling is applied to this dataset
# in this section) and the recoded target column-wise into the final table.
US_data_new = pd.concat([X_US, y_US_clean], axis=1)

# Upload is currently disabled; uncomment to send the processed table via upload_dataset.
# upload_dataset(US_data_new, 'us_crime', dataset_project)

ClearML results page: https://app.clear.ml/projects/a946a2e286114abab58e1cbe7f50cda1/experiments/184a44898be64c9d8a6343b4662e6c67/output/log
ClearML dataset page: https://app.clear.ml/datasets/simple/a946a2e286114abab58e1cbe7f50cda1/experiments/184a44898be64c9d8a6343b4662e6c67


Could not repair dependency graph. Error is: 'NoneType' object has no attribute 'values'


Pending uploads, starting dataset upload to https://files.clear.ml
Uploading dataset changes (1 files compressed to 229.24 KiB) to https://files.clear.ml
File compression and upload completed: total size 229.24 KiB, 1 chunk(s) stored (average size 229.24 KiB)


Yeast Dataset


In [101]:
# Load the yeast_ml8 table and its metadata (fetch_dataset is defined outside this
# section), then show the frame as the cell output.
yeast_data, yeast_metadata = fetch_dataset('yeast_ml8')
yeast_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,target
0,0.093700,0.139771,0.062774,0.007698,0.083873,-0.119156,0.073305,0.005510,0.027523,0.043477,...,-0.018712,-0.034711,-0.038675,-0.039102,0.017429,-0.052659,-0.042402,0.118473,0.125632,-1
1,-0.022711,-0.050504,-0.035691,-0.065434,-0.084316,-0.378560,0.038212,0.085770,0.182613,-0.055544,...,0.030594,-0.021814,0.010430,-0.013809,-0.009248,-0.027318,-0.014191,0.022783,0.123785,-1
2,-0.090407,0.021198,0.208712,0.102752,0.119315,0.041729,-0.021728,0.019603,-0.063853,-0.053756,...,0.022294,0.012583,0.002233,-0.002072,-0.010981,0.007615,-0.063378,-0.084181,-0.034402,-1
3,-0.085235,0.009540,-0.013228,0.094063,-0.013592,-0.030719,-0.116062,-0.131674,-0.165448,-0.123053,...,-0.066241,-0.046999,-0.066604,-0.055773,-0.041941,0.051066,0.004976,0.193972,0.131866,-1
4,-0.088765,-0.026743,0.002075,-0.043819,-0.005465,0.004306,-0.055865,-0.071484,-0.159025,-0.111348,...,-0.080882,0.028468,-0.073576,0.050630,0.084832,-0.019570,-0.021650,-0.068326,-0.091155,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,-0.095520,-0.067130,-0.109031,-0.067580,0.006897,-0.074921,0.081651,-0.102767,0.023421,0.030958,...,-0.025955,-0.023820,-0.027757,-0.021961,0.007489,-0.027194,-0.023020,0.021652,0.137082,-1
2413,-0.056845,-0.147821,-0.081818,-0.096280,0.126017,0.062224,0.242514,0.097460,0.035744,-0.078680,...,-0.007117,-0.025083,-0.011406,-0.017818,-0.004101,-0.046161,-0.011562,0.060844,0.137526,-1
2414,0.241600,0.127602,-0.033072,-0.125125,-0.021141,-0.070531,-0.028776,-0.044333,-0.026940,0.010247,...,-0.057654,0.006976,-0.057880,0.171701,0.045545,-0.051639,-0.038713,-0.026947,0.005620,-1
2415,0.097274,0.088109,0.161101,0.081742,0.013824,0.121709,-0.004492,0.003815,-0.037648,-0.048407,...,-0.055524,0.018662,-0.053545,-0.056904,-0.045714,-0.039205,-0.019985,0.280843,0.143382,-1


In [102]:
# Base view: separate the yeast_ml8 features from the 'target' column.
y_yeast = yeast_data['target']
X_yeast = yeast_data.drop(columns='target')

print(f"\nMissing values:\n{X_yeast.isna().sum()}")

# Group the feature columns by dtype; get_continuous (defined outside this
# section) picks the continuous subset of the numerical columns.
categorical_cols = list(X_yeast.select_dtypes(include='object').columns)
numerical_cols = list(X_yeast.select_dtypes(include=['float64', 'int64']).columns)
continuous_cols = get_continuous(X_yeast, numerical_cols)
print(f"\nCategorical: {len(categorical_cols)}, Numerical: {len(numerical_cols)}")

# Requires manual verification
print("Categorial cols : ", categorical_cols)
print("Numerical cols : ", numerical_cols)

print("Continuous cols : ", continuous_cols)


Missing values:
0      0
1      0
2      0
3      0
4      0
      ..
98     0
99     0
100    0
101    0
102    0
Length: 103, dtype: int64

Categorical: 0, Numerical: 103
Categorial cols :  []
Numerical cols :  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102']
Continuous cols :  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '2

In [103]:
# Recode the target from {-1, 1} to {0, 1}: -1 becomes 0, 1 is left as is.
y_yeast_clean = y_yeast.replace({-1: 0})

In [104]:
# Confirm that only the recoded labels remain.
y_yeast_clean.unique()

array([0, 1])

In [None]:
# Join the raw features (no de-duplication or scaling is applied to this dataset
# in this section) and the recoded target column-wise into the final table.
yeast_data_new = pd.concat([X_yeast, y_yeast_clean], axis=1)

# Send the processed table under the name 'yeast_ml8' to the project named by
# dataset_project (upload_dataset is defined outside this section).
upload_dataset(yeast_data_new, 'yeast_ml8', dataset_project)