# Демонстрация работы `DataPreprocessor`

### Используем датасет *Titanic* из *seaborn* — он содержит пропуски, категориальные и числовые столбцы, что идеально для демонстрации.

In [19]:
import pandas as pd
import seaborn as sns
from data_preprocessor import DataPreprocessor

## 1. Загрузка данных

In [20]:
df = sns.load_dataset("titanic")
print(f"Форма: {df.shape}")
df.head()

Форма: (891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Посмотрим на пропуски и типы данных столбцов

In [23]:
df.isna().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
sex              0
pclass           0
survived         0
fare             0
parch            0
sibsp            0
class            0
adult_male       0
who              0
alive            0
alone            0
dtype: int64

In [24]:
df.dtypes

survived          int64
pclass            int64
sex                 str
age             float64
sibsp             int64
parch             int64
fare            float64
embarked            str
class          category
who                 str
adult_male         bool
deck           category
embark_town         str
alive               str
alone              bool
dtype: object

## 2. Быстрый запуск через `fit_transform`

### Одной строкой: 
### - удаляем столбцы с >50 % пропусков, заполняем пропуски в числовых столбцах средним, в строковых — модой;
### - кодируем категории (one-hot);
### - нормализуем числовые столбцы методом Min-Max.

In [25]:
prep = DataPreprocessor(df)
result = prep.fit_transform(threshold=0.5, fill_strategy="mean", norm_method="minmax")
print(f"Форма результата: {result.shape}")
result.head()

Форма результата: (891, 24)


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,...,class_Second,class_Third,who_child,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
0,0.0,1.0,0.271174,0.125,0.0,0.014151,True,False,0,1,...,0,1,0,1,0,0,0,1,1,0
1,1.0,0.0,0.472229,0.125,0.0,0.139136,False,False,1,0,...,0,0,0,0,1,1,0,0,0,1
2,1.0,1.0,0.321438,0.0,0.0,0.015469,False,True,1,0,...,0,1,0,0,1,0,0,1,0,1
3,1.0,0.0,0.434531,0.125,0.0,0.103644,False,False,1,0,...,0,0,0,0,1,0,0,1,0,1
4,0.0,1.0,0.434531,0.0,0.0,0.015713,True,True,0,1,...,0,1,0,1,0,0,0,1,1,0


In [26]:
print(prep.summary())

=== DataPreprocessor summary ===
Удалённые столбцы (1): ['deck']
Заполненные столбцы: ['age', 'embarked', 'embark_town']
OHE-исходные столбцы: ['sex', 'embarked', 'class', 'who', 'embark_town', 'alive']
OHE-новые столбцы (16): ['sex_female', 'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S', 'class_First', 'class_Second', 'class_Third', 'who_child', 'who_man', 'who_woman', 'embark_town_Cherbourg', 'embark_town_Queenstown', 'embark_town_Southampton', 'alive_no', 'alive_yes']
Метод нормализации: minmax
Нормализованные столбцы: ['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']
Итоговая форма DataFrame: (891, 24)


## 3. Пошаговое применение с другими параметрами

### Удаляем столбцы с >10 % пропусков, заполняем медианой

In [27]:
prep2 = DataPreprocessor(df)

prep2.remove_missing(threshold=0.1, fill_strategy="median")
print("После remove_missing:", prep2.result.shape)
print("Удалены:", prep2.state.dropped_columns)

После remove_missing: (891, 13)
Удалены: ['age', 'deck']


### One-hot encoding

In [28]:
prep2.encode_categorical()
print("После encode_categorical:", prep2.result.shape)
print("Новые OHE-столбцы:", prep2.state.onehot_new_columns[:10], "...")

После encode_categorical: (891, 23)
Новые OHE-столбцы: ['sex_female', 'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S', 'class_First', 'class_Second', 'class_Third', 'who_child', 'who_man'] ...


### Z-score стандартизация

In [8]:
# Apply "std" (z-score) normalization through prep2 and inspect the summary
# statistics of the resulting frame.
# NOTE(review): the recorded output below looks stale — its execution count
# (8) is out of sequence, it reports 24 columns and still lists `age`, while
# the preceding steps dropped `age` and produced 23 columns. Re-run this cell
# to refresh the output.
prep2.normalize_numeric(method="std")
result2 = prep2.result
print("После normalize_numeric:", result2.shape)
result2.describe().round(3)

После normalize_numeric: (891, 24)


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,...,class_Second,class_Third,who_child,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.0,-0.0,0.0,0.0,0.0,0.0,0.352,0.648,0.189,0.086,...,0.207,0.551,0.093,0.603,0.304,0.189,0.086,0.725,0.616,0.384
std,1.001,1.001,1.001,1.001,1.001,1.001,0.478,0.478,0.391,0.281,...,0.405,0.498,0.291,0.49,0.46,0.391,0.281,0.447,0.487,0.487
min,-0.789,-1.566,-2.224,-0.475,-0.474,-0.648,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.789,-0.369,-0.566,-0.475,-0.474,-0.489,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.789,0.827,-0.105,-0.475,-0.474,-0.357,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1.267,0.827,0.433,0.433,-0.474,-0.024,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
max,1.267,0.827,3.892,6.784,6.974,9.667,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 4. Применение сохранённого pipeline к новым данным через `transform`

### Разделим исходный датасет

In [29]:
df_train = df.iloc[:700].copy()
df_test = df.iloc[700:].copy()

In [30]:
prep3 = DataPreprocessor(df_train)
train_processed = prep3.fit_transform(threshold=0.5, norm_method="minmax")

### Применяем тот же pipeline к тестовой выборке

In [31]:
test_processed = prep3.transform(df_test)

In [32]:
print(f"Train: {train_processed.shape}, Test: {test_processed.shape}")
print(f"Столбцы совпадают: {list(train_processed.columns) == list(test_processed.columns)}")

Train: (700, 24), Test: (191, 24)
Столбцы совпадают: True


In [33]:
test_processed.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,...,class_Second,class_Third,who_child,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
700,1.0,0.0,0.217666,0.125,0.0,0.444099,False,False,1,0,...,0,0,0,0,1,1,0,0,0,1
701,1.0,0.0,0.432177,0.0,0.0,0.05131,True,True,0,1,...,0,0,0,1,0,0,0,1,0,1
702,0.0,1.0,0.217666,0.0,0.166667,0.028213,False,False,1,0,...,0,1,0,0,1,1,0,0,1,0
703,0.0,1.0,0.305994,0.0,0.0,0.015111,True,True,0,1,...,0,1,0,1,0,0,1,0,1,0
704,0.0,1.0,0.318612,0.125,0.0,0.01533,True,False,0,1,...,0,1,0,1,0,0,0,1,1,0


## 5. Обработка ошибок

### Некорректный тип данных

In [15]:
try:
    DataPreprocessor([1, 2, 3])
except TypeError as e:
    print(f"TypeError: {e}")

TypeError: Ожидается pandas DataFrame, получен list.


### Некорректный threshold

In [34]:
try:
    prep.remove_missing(threshold=1.5)
except ValueError as e:
    print(f"ValueError: {e}")

ValueError: threshold должен быть в диапазоне [0, 1], получен 1.5.


### Некорректный метод нормализации

In [35]:
try:
    prep.normalize_numeric(method="max")
except ValueError as e:
    print(f"ValueError: {e}")

ValueError: method должен быть одним из ('minmax', 'std'), получен 'max'.


### Вызов `transform` без предварительного `fit_transform`

In [36]:
try:
    fresh = DataPreprocessor(df)
    fresh.transform(df)
except RuntimeError as e:
    print(f"RuntimeError: {e}")

RuntimeError: Pipeline не обучен. Сначала вызовите fit_transform().
