In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import joblib

# Notebook-wide configuration constants.
RAND = 123  # presumably the seed intended for `random_state` arguments — TODO confirm all call sites use it
K = 5  # not used in the visible cells; presumably the number of CV folds — TODO confirm

In [None]:
# Read the cleaned dataset from disk into a DataFrame.
cleaned_csv_path = r"D:\DataScience\Projects\Cardio\data\data_cleaned\data_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)

## 03 Processing_data

In [None]:
# Hold out a small sample for the final check after the MLOps stage.
# NOTE: `df` is reassigned below, so this cell is not idempotent — re-running it
# without reloading `df` removes a further 20 rows and changes every split.
df_final_check = df.sample(n=20, random_state=RAND)
final_check_index = df_final_check.index
df = df.drop(index=final_check_index)
X_final_check = df_final_check.drop('cardio', axis=1)
y_final_check = df_final_check.cardio

# Features / target for modelling (the hold-out rows are already excluded).
X = df.drop('cardio', axis=1)
y = df.cardio
print(f"X: {X.shape}")

# Split into train / valid / test (70% / 15% / 15%), stratified on the target.
# The split happens before any preprocessing is fitted, so the scaler/encoder
# below only ever sees training rows (no leakage).
# The seed comes from the RAND config constant so all splits change together.
X_train, X_valid_test, y_train, y_valid_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=RAND
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_valid_test, y_valid_test, test_size=0.5, stratify=y_valid_test, random_state=RAND
)
print(f"X_train: {X_train.shape}")

# Column groups, by how each group is treated during preprocessing
num_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']  # numeric -> standardized
bin_features = ['gender', 'smoke', 'alco', 'active']  # binary -> not encoded (not listed in the ColumnTransformer)
cat_features = ['cholesterol', 'gluc']  # non-binary categorical -> one-hot encoded

# Transformer for the numeric columns
numeric_transformer = StandardScaler()

# Transformer for the non-binary categorical columns (dense output;
# categories unseen during fit are ignored instead of raising)
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Combine both in a ColumnTransformer; every column not listed here is
# passed through unchanged and appended after the transformed ones
column_steps = [
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

data_processing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training part only, then reuse the fitted pipeline for valid/test
X_train_scale = data_processing_pipeline.fit_transform(X_train)
X_valid_scale, X_test_scale = (
    data_processing_pipeline.transform(part) for part in (X_valid, X_test)
)
print(f"X_train_scale: {X_train_scale.shape}")

# Persist the fitted pipeline so later stages can apply the same preprocessing
joblib.dump(data_processing_pipeline, r"D:\DataScience\Projects\Cardio\pipeline\preprocessing_pipeline.joblib")

# Fitted ColumnTransformer from the preprocessing pipeline
fitted_preprocessor = data_processing_pipeline.named_steps['preprocessor']

# Names of the columns produced by the OneHotEncoder
cat_features_names = fitted_preprocessor.named_transformers_['cat'] \
                                        .get_feature_names_out(cat_features).tolist()

# Build the full, ordered list of output feature names from the fitted
# transformer itself. The passthrough remainder emits every unlisted column of
# X in X's own column order, which is not guaranteed to equal `bin_features`;
# hand-concatenating the lists could silently mislabel columns.
transformer_names = {name for name, _, _ in fitted_preprocessor.transformers_}


def _strip_transformer_prefix(feature_name):
    """Drop the '<transformer>__' prefix ColumnTransformer adds to output names."""
    prefix, sep, bare_name = feature_name.partition('__')
    return bare_name if sep and prefix in transformer_names else feature_name


full_features_names = [
    _strip_transformer_prefix(name)
    for name in fitted_preprocessor.get_feature_names_out().tolist()
]

# One name per output column of the transformed matrix
assert len(full_features_names) == X_train_scale.shape[1], "feature names do not match transformed columns"