In [16]:
import os
from dotenv import load_dotenv
load_dotenv()
import psycopg
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)

TABLE_NAME = "users_churn" # таблица с данными
connection = {
    "sslmode": "require",
    "target_session_attrs": "read-write",
    "connect_timeout": 10
}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,12,7469-LKBCI,2018-10-01,NaT,Two year,No,Credit card (automatic),18.95,326.8,,...,,,,,Male,0,No,No,No,0
1,13,8091-TTVAX,2015-04-01,NaT,One year,No,Credit card (automatic),100.35,5681.1,Fiber optic,...,Yes,No,Yes,Yes,Male,0,Yes,No,Yes,0


In [18]:
from autofeat import AutoFeatClassifier  # или AutoFeatClassifier в зависимости от задачи
from sklearn.model_selection import train_test_split


cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
]
num_features = ["monthly_charges", "total_charges"]
target = "target"
features = cat_features + num_features
transformations = ('1/', 'log', 'abs', 'sqrt')

split_column = "begin_date"
test_size = 0.2
df = df.sort_values(by=[split_column])

afc = AutoFeatClassifier(
    categorical_cols=cat_features, 
    transformations=transformations, 
    feateng_steps=1, 
    n_jobs=-1)

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
) 
X_train_features = afc.fit_transform(X_train, y_train)
X_test_features = afc.transform(X_test)

In [28]:
import mlflow
from mlflow.models import infer_signature

experiment_name = "autofeat_experiment"
run_name = "autofeat_run_1"

# Создание или получение эксперимента
try:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = mlflow.create_experiment(experiment_name)
    else:
        experiment_id = experiment.experiment_id
except Exception:
    experiment_id = mlflow.create_experiment(experiment_name)

artifact_path = "afc"

with mlflow.start_run(run_name=run_name, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # Создание сигнатуры модели
    signature = infer_signature(X_train, afc.predict(X_train))

    # Пример входных данных (важно, чтобы совпадал по структуре с обучением)
    input_example = X_train_features.head(5)

    # Логирование модели с указанием имени, сигнатуры и примера
    afc_info = mlflow.sklearn.log_model(
        sk_model=afc,
        artifact_path=artifact_path,
        signature=signature,
        input_example=input_example
    )

    # Логирование параметров
    mlflow.log_params({
        "feateng_steps": 1,
        "test_size": test_size,
        "n_jobs": -1,
        "transformations": str(transformations),
        "num_original_features": len(features),
        "num_final_features": X_train_features.shape[1]
    })

    # Логирование метрик
    original_features_count = len(features)
    new_features_count = X_train_features.shape[1] - original_features_count

    mlflow.log_metrics({
        "original_features_count": original_features_count,
        "final_features_count": new_features_count,
        "generated_features_count": new_features_count
    })

    # Логирование информации о признаках
    mlflow.log_param("target_column", target)
    mlflow.log_param("split_column", split_column)

print(f"Модель залогирована в MLflow")
print(f"Run ID: {run_id}")
print(f"Model URI: {afc_info.model_uri}")

print(mlflow.get_artifact_uri())

  "dataframe_split": {
    "columns": [
      "m.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: Failed to enforce schema of data '   monthly_charges  total_charges  cat_paperless_billing_No  \
0           117.80        8684.80                       0.0   
1           104.15        7689.95                       0.0   
2            92.45        6440.25                       1.0   
3           108.05        7532.15                       0.0   
4           108.60        7690.90                       0.0   

   cat_paperless_billing_Yes  cat_payment_method_Bank transfer (automatic)  \
0                        1.0               

Модель залогирована в MLflow
Run ID: ad4001bacb1042beac526c546c520362
Model URI: models:/m-40d797e4caef442a8b2353409a6ddcb6
file:///home/mle-user/mle_projects/mle-mlflow/mlruns/0/106d8e647bec4abf81867be4f918456d/artifacts
