In [3]:
# Dependency install for autofeat, kept commented out so it is not executed
# when this cell runs. Uncomment only to (re)install into the kernel's interpreter.
# import sys
# !{sys.executable} -m pip install autofeat

Defaulting to user installation because normal site-packages is not writeable
Collecting autofeat
  Downloading autofeat-2.1.3-py3-none-any.whl (23 kB)
Collecting numba>=0.53.1
  Downloading numba-0.61.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pint<1.0,>=0.17
  Downloading Pint-0.24.4-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 KB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting sympy<2.0.0,>=1.7.1
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting llvmlite<0.45,>=0.44.0dev0
  Downloading llvmlite-0.44.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [16]:
# libs
import os
import psycopg
import pandas as pd
import mlflow
from autofeat import AutoFeatRegressor, AutoFeatClassifier
from sklearn.model_selection import train_test_split

In [35]:
# credentials

# psycopg connection options; the credential values are read from the
# environment so that no secret is stored in the notebook itself.
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

# Source table with the churn dataset
TABLE_NAME = 'users_churn'

# MLflow tracking / registry server location
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = 'churn_preprocessing_alexndem'
RUN_NAME = "preprocessing"
REGISTRY_MODEL_NAME = 'churn_model_alexndem'

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The S3 credentials must already be present in the environment.
# Re-assigning os.environ[name] = os.getenv(name) is a no-op when the variable
# is set and fails with an opaque "TypeError: str expected, not NoneType" when
# it is not, so check for presence explicitly and fail with a clear message.
missing_aws_vars = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if os.getenv(name) is None
]
if missing_aws_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_aws_vars)
    )

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

### Выгрузка данных

In [52]:
# Fetch the whole table in one query; the connection and the cursor are both
# scoped to this single `with` statement.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    # The first item of every cursor.description entry is the column name
    columns = [description[0] for description in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,gender,streaming_movies,senior_citizen,partner,dependents,multiple_lines,target
0,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,Female,No,0,Yes,No,,0
1,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,Yes,No,No,Male,No,0,No,No,No,0


### Генерация фичей

In [53]:
# Class balance of the target column
df["target"].value_counts()

target
0    5174
1    1869
Name: count, dtype: int64

In [54]:
# Columns treated as categorical inputs
cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
    'type'
]
# Columns treated as numeric inputs
num_features = ["monthly_charges", "total_charges"]
target = ['target']  # column holding the model target

# The data is ordered by this column before the (unshuffled) split
split_column = "begin_date"
test_size = 0.2

# Convert the 1/0 flag to 'Yes'/'No'. Values that are not keys of the mapping
# are passed through unchanged, so re-running this cell keeps the already
# converted column intact (a plain .map(dict) would turn it into NaN).
senior_citizen_labels = {1: 'Yes', 0: 'No'}
df['senior_citizen'] = df['senior_citizen'].map(
    lambda value: senior_citizen_labels.get(value, value)
)

# A stable sort keeps rows with equal split_column values in their current
# relative order, so sorting an already sorted frame does not move rows across
# the train/test boundary when the cell is executed again.
df = df.sort_values(by=[split_column], kind="stable")

# shuffle=False: the last `test_size` share of the sorted rows becomes the test set
X_train, X_test, y_train, y_test = train_test_split(
    df[cat_features + num_features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

In [55]:


features = cat_features + num_features

# Non-linear transformations AutoFeat may apply when generating features
transformations = ('1/', 'log', 'abs', 'sqrt')

afc = AutoFeatClassifier(
    categorical_cols=cat_features,
    transformations=transformations,
    feateng_steps=1,
    n_jobs=-1,
)

# y_train comes from df[target] with a one-element column list, i.e. it is a
# single-column DataFrame. Passing it as-is makes scikit-learn emit a
# "column-vector y was passed when a 1d array was expected" warning, so hand
# over a flattened 1-D array with the same values instead.
X_train_features = afc.fit_transform(X_train, y_train.to_numpy().ravel())
# The test set is only transformed with the features learned on the train set
X_test_features = afc.transform(X_test)

  y = column_or_1d(y, warn=True)


### Логирование autofeat

In [36]:
artifact_path = "afc"
afc_experiment_name = "churn_task_alexdem"

# get_experiment_by_name returns None when the experiment does not exist yet;
# create it in that case instead of failing with an AttributeError on None.
afc_experiment = mlflow.get_experiment_by_name(afc_experiment_name)
if afc_experiment is None:
    experiment_id = mlflow.create_experiment(afc_experiment_name)
else:
    experiment_id = afc_experiment.experiment_id

# Log the fitted AutoFeat transformer as an sklearn-flavour model artifact
with mlflow.start_run(run_name="autofeatures_churn", experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    afc_info = mlflow.sklearn.log_model(afc, artifact_path=artifact_path)

2025-06-03 22:08:22,435 INFO: Found credentials in environment variables.


### Обучение и логирование модели

In [56]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix, precision_score, recall_score, f1_score, log_loss

from catboost import CatBoostClassifier
from sklearn.metrics import mean_absolute_error, auc

In [57]:
# Train CatBoost on the AutoFeat-generated features
catboost_params = {"iterations": 300, "verbose": False}
model = CatBoostClassifier(**catboost_params)
model.fit(X_train_features, y_train)

# Hard class predictions and the second column of predict_proba for the test set
prediction = model.predict(X_test_features)
proba = model.predict_proba(X_test_features)[:, 1]

In [58]:
# For binary labels confusion_matrix(...).ravel() yields (tn, fp, fn, tp).
# err1 = false positives (type I error), err2 = false negatives (type II error).
# The previous unpacking `_, err1, _, err2` stored the true positives in err2.
_, err1, err2, _ = confusion_matrix(y_test, prediction).ravel()

# NOTE: `auc` rebinds the name imported from sklearn.metrics earlier in the
# notebook; the name is kept because the logging cell reads `auc` as a metric.
auc = roc_auc_score(y_test, proba)
logloss = log_loss(y_test, proba)

In [59]:
EXPERIMENT_NAME = "churn_task_alexdem"
RUN_NAME = "model_autofeatures"
REGISTRY_MODEL_NAME = "churn_model_alexdem_b2c"

pip_requirements = "./requirements.txt"
# NOTE(review): the signature is inferred from the raw ndarray (.values) while
# input_example is a DataFrame slice with column names - confirm this is intended.
signature = mlflow.models.infer_signature(X_test_features.values, prediction)
input_example = X_test_features[:10]
metadata = {'model_type': 'monthly'}

# get_experiment_by_name returns None when the experiment does not exist yet;
# create it in that case instead of failing with an AttributeError on None.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # Log the CatBoost model and register it as a new version in the registry
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example,
        metadata=metadata,
        await_registration_for=60,
    )
    # Metrics computed on the test split in the previous cell
    mlflow.log_metrics({'auc': auc, 'logloss': logloss, 'err1': err1, 'err2': err2})

Registered model 'churn_model_alexdem_b2c' already exists. Creating a new version of this model...
2025/06/03 22:25:45 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_alexdem_b2c, version 6
Created version '6' of model 'churn_model_alexdem_b2c'.


In [60]:
# Display the current value of run_id (set from run.info.run_id inside the logging cells)
run_id

'24522838d07d4b2380dab0fc1c731416'