In [1]:
import os
import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)

# Notebook-wide configuration.

# Location of the MLflow tracking server.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5001

# MLflow naming: experiment, run and registered model.
EXPERIMENT_NAME = "customer-churn-prediction"
RUN_NAME = "preprocessing"
REGISTRY_MODEL_NAME = "churn-classifier"

# Data source (table name or file path); the data-loading cell rebinds this name.
TABLE_NAME = "churn_data.csv"

# Point the MLflow client at the tracking server, then select the experiment.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_experiment(EXPERIMENT_NAME)

# Echo the effective settings so the captured output documents the run.
print(
    f"MLflow Tracking URI: {mlflow.get_tracking_uri()}",
    f"Experiment Name: {EXPERIMENT_NAME}",
    f"Registry Model Name: {REGISTRY_MODEL_NAME}",
    sep="\n",
)

MLflow Tracking URI: http://127.0.0.1:5001
Experiment Name: customer-churn-prediction
Registry Model Name: churn-classifier


In [2]:
import psycopg
import pandas as pd
from dotenv import load_dotenv

# PostgreSQL connection parameters are read from the project's .env file
# (replace them with your own).
load_dotenv('../.env')

TABLE_NAME = "users_churn"

# Credentials come from the environment; the fixed TLS / session options are
# merged in front of them so the resulting dict has the same keys and order.
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}
connection = {
    "sslmode": "require",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

# Load the data: fetch every row and take the column names from the cursor
# description (first field of each entry).
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    columns = [column[0] for column in cur.description]

df = pd.DataFrame(data, columns=columns)

# Report the row count, the shape and a preview of the first rows.
print(
    f"‚úÖ –ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(df)} —Å—Ç—Ä–æ–∫ –∏–∑ —Ç–∞–±–ª–∏—Ü—ã {TABLE_NAME}",
    f"–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –¥–∞–Ω–Ω—ã—Ö: {df.shape}",
    "\n–ü–µ—Ä–≤—ã–µ —Å—Ç—Ä–æ–∫–∏:",
    df.head(),
    sep="\n",
)

‚úÖ –ó–∞–≥—Ä—É–∂–µ–Ω–æ 7043 —Å—Ç—Ä–æ–∫ –∏–∑ —Ç–∞–±–ª–∏—Ü—ã users_churn
–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –¥–∞–Ω–Ω—ã—Ö: (7043, 22)

–ü–µ—Ä–≤—ã–µ —Å—Ç—Ä–æ–∫–∏:
     id customer_id begin_date   end_date            type paperless_billing  \
0  2744  9637-CDTKZ 2014-02-01        NaT        Two year               Yes   
1  2746  3946-JEWRQ 2016-03-01        NaT        One year               Yes   
2  2748  7873-CVMAW 2014-02-01        NaT        Two year                No   
3  2750  0463-WZZKO 2019-11-01        NaT  Month-to-month               Yes   
4  2752  3494-JCHRQ 2019-12-01 2020-01-01  Month-to-month               Yes   

            payment_method  monthly_charges  total_charges internet_service  \
0  Credit card (automatic)           114.10        8086.40      Fiber optic   
1  Credit card (automatic)            95.20        4563.00      Fiber optic   
2  Credit card (automatic)            88.55        6362.35              DSL   
3  Credit card (automatic)            20.75          67.10    

In [3]:
# Start from obj_df: the object-typed (categorical) columns of df.
obj_df = df.select_dtypes(include="object")

print(
    f"–ö–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏: {obj_df.columns.tolist()}",
    f"–§–æ—Ä–º–∞ obj_df: {obj_df.shape}\n",
    sep="\n",
)

# Categorical columns that will actually be encoded.
cat_columns = ["type", "payment_method", "internet_service", "gender"]

# One-hot encoder for the categorical variables:
#   drop='first'            - drop the first category of each feature to avoid
#                             the multicollinearity (dummy-variable) trap
#   categories='auto'       - categories are determined automatically
#   handle_unknown='ignore' - do not raise on categories unseen during fit
#   max_categories=10       - upper bound on output categories per feature
#   sparse_output=False     - return a dense array instead of a sparse matrix
# NOTE(review): with drop='first' an unknown category presumably encodes the
# same as the dropped one (all zeros) - confirm against the scikit-learn docs.
encoder_oh = OneHotEncoder(
    drop='first',
    categories='auto',
    handle_unknown='ignore',
    max_categories=10,
    sparse_output=False,
)

# Fit the encoder and turn the categorical data into an array.
encoded_features = encoder_oh.fit_transform(df[cat_columns])

# Wrap the array in a DataFrame; get_feature_names_out() supplies the names of
# the generated columns, and obj_df's index keeps the rows aligned.
encoded_df = pd.DataFrame(
    data=encoded_features,
    index=obj_df.index,
    columns=encoder_oh.get_feature_names_out(cat_columns),
)

# Append the encoded columns to the categorical frame, column-wise.
obj_df = pd.concat((obj_df, encoded_df), axis="columns")

print("–†–µ–∑—É–ª—å—Ç–∞—Ç:")
obj_df.head(2)

–ö–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏: ['customer_id', 'type', 'paperless_billing', 'payment_method', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'gender', 'partner', 'dependents', 'multiple_lines']
–§–æ—Ä–º–∞ obj_df: (7043, 15)

–†–µ–∑—É–ª—å—Ç–∞—Ç:


Unnamed: 0,customer_id,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,...,dependents,multiple_lines,type_One year,type_Two year,payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,internet_service_Fiber optic,internet_service_None,gender_Male
0,9637-CDTKZ,Two year,Yes,Credit card (automatic),Fiber optic,Yes,Yes,Yes,Yes,Yes,...,No,Yes,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
1,3946-JEWRQ,One year,Yes,Credit card (automatic),Fiber optic,No,No,No,No,Yes,...,No,Yes,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [4]:
from sklearn.impute import SimpleImputer

num_columns = ["monthly_charges", "total_charges"]
n_knots = 3
degree_spline = 4
n_quantiles = 100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None


def _fit_to_frame(transformer, frame, columns, names=None, skip=0):
    """Fit ``transformer`` on ``frame[columns]`` and return its output as a DataFrame.

    Args:
        transformer: scikit-learn transformer providing ``fit_transform`` and
            ``get_feature_names_out``.
        frame: source DataFrame; its index is reused so the result aligns
            with ``frame`` by label rather than by position.
        columns: names of the input columns.
        names: explicit output column names. When omitted, the names reported
            by ``get_feature_names_out(columns)`` are used.
        skip: number of leading output columns to discard (also applied to
            the generated names when ``names`` is omitted).

    Returns:
        DataFrame with the transformed features, indexed like ``frame``.
    """
    values = transformer.fit_transform(frame[columns].to_numpy())[:, skip:]
    if names is None:
        names = transformer.get_feature_names_out(columns)[skip:]
    return pd.DataFrame(values, columns=names, index=frame.index)


# Count missing values before imputing so they can be reported below.
missing_before = df[num_columns].isnull().sum()

# Handle missing values: fill them with the column median.
imputer = SimpleImputer(strategy='median')
df[num_columns] = imputer.fit_transform(df[num_columns])

# num_df - the numeric columns of df. It is taken after imputation so the base
# columns agree with the derived features computed from them.
num_df = df.select_dtypes(include=['int64', 'float64'])

print(f"–ß–∏—Å–ª–æ–≤—ã–µ –∫–æ–ª–æ–Ω–∫–∏: {num_df.columns.tolist()}")
print(f"–§–æ—Ä–º–∞ num_df: {num_df.shape}")

# Missing values before imputation.
print("\n–ü—Ä–æ–ø—É—Å–∫–∏ –≤ —á–∏—Å–ª–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–∫–∞—Ö:")
print(missing_before)
print()

print("–ü–æ—Å–ª–µ –æ–±—Ä–∞–±–æ—Ç–∫–∏ –ø—Ä–æ–ø—É—Å–∫–æ–≤:")
print(df[num_columns].isnull().sum())
print()

# The encoders keep their module-level names because later cells reuse them.
encoder_spl = SplineTransformer(n_knots=n_knots, degree=degree_spline)
encoder_q = QuantileTransformer(n_quantiles=n_quantiles, subsample=subsample)
encoder_rb = RobustScaler()
encoder_pol = PolynomialFeatures(degree=degree)
encoder_kbd = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy)

derived_frames = [
    # SplineTransformer: keep the generated feature names.
    _fit_to_frame(encoder_spl, df, num_columns),
    # QuantileTransformer: suffix the names with the number of quantiles.
    _fit_to_frame(
        encoder_q, df, num_columns,
        names=[f"{col}_q_{n_quantiles}" for col in num_columns],
    ),
    # RobustScaler
    _fit_to_frame(
        encoder_rb, df, num_columns,
        names=[f"{col}_robust" for col in num_columns],
    ),
    # PolynomialFeatures: skip the intercept and the original features, which
    # num_df already contains.
    _fit_to_frame(encoder_pol, df, num_columns, skip=1 + len(num_columns)),
    # KBinsDiscretizer
    _fit_to_frame(
        encoder_kbd, df, num_columns,
        names=[f"{col}_bin" for col in num_columns],
    ),
]
num_df = pd.concat([num_df, *derived_frames], axis=1)

print("\n–†–µ–∑—É–ª—å—Ç–∞—Ç:")
num_df.head(2)

–ß–∏—Å–ª–æ–≤—ã–µ –∫–æ–ª–æ–Ω–∫–∏: ['id', 'monthly_charges', 'total_charges', 'senior_citizen', 'target']
–§–æ—Ä–º–∞ num_df: (7043, 5)

–ü—Ä–æ–ø—É—Å–∫–∏ –≤ —á–∏—Å–ª–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–∫–∞—Ö:
monthly_charges     0
total_charges      11
dtype: int64

–ü–æ—Å–ª–µ –æ–±—Ä–∞–±–æ—Ç–∫–∏ –ø—Ä–æ–ø—É—Å–∫–æ–≤:
monthly_charges    0
total_charges      0
dtype: int64


–†–µ–∑—É–ª—å—Ç–∞—Ç:


Unnamed: 0,id,monthly_charges,total_charges,senior_citizen,target,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,...,total_charges_robust,monthly_charges^2,monthly_charges total_charges,total_charges^2,monthly_charges^3,monthly_charges^2 total_charges,monthly_charges total_charges^2,total_charges^3,monthly_charges_bin,total_charges_bin
0,2744,114.1,8086.4,0,0,0.0,3e-06,0.05935,0.502083,0.410308,...,1.976414,13018.81,922658.24,65389864.96,1485446.221,105275300.0,7460984000.0,528768600000.0,4.0,4.0
1,2746,95.2,4563.0,0,0,0.0,0.00201,0.183802,0.598345,0.212523,...,0.935335,9063.04,434397.6,20820969.0,862801.408,41354650.0,1982156000.0,95006080000.0,3.0,2.0


Применять по отдельности каждое преобразование не всегда удобно, особенно когда их очень много. Поэтому их объединяют в пайплайн или колоночные преобразования, которые можно визуализировать, запустить одной командой и сохранить. Последуйте этому примеру и объедините все энкодеры в одно преобразование.

### Задание 3
Напишите код, который объединит преобразования над числовыми колонками в ColumnTransformer, а над категориальными — в Pipeline, используя энкодеры из предыдущих заданий. Затем объедините два получившихся объекта класса одним колоночным преобразованием. После чего объедините ваш преобразованный набор данных с изначальным, а результат сохраните в переменную df.

In [5]:
from sklearn.impute import SimpleImputer

# Handle missing values before the transformations.
# NOTE(review): the imputer is fitted outside `preprocessor`, so the
# preprocessor itself performs no imputation - inputs passed to it later are
# presumably expected to be free of missing values; confirm for inference.
imputer = SimpleImputer(strategy='median')
df[num_columns] = imputer.fit_transform(df[num_columns])

# Numeric branch: every numeric encoder is applied to the same numeric columns.
numeric_transformer = ColumnTransformer(
    transformers=[
        ('spl', encoder_spl, num_columns),
        ('q', encoder_q, num_columns),
        ('rb', encoder_rb, num_columns),
        ('pol', encoder_pol, num_columns),
        ('kbd', encoder_kbd, num_columns),
    ]
)

# Categorical branch: a one-step pipeline around the one-hot encoder.
categorical_transformer = Pipeline(steps=[('encoder', encoder_oh)])

# Combined column transformation over both branches.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_columns),
        ('cat', categorical_transformer, cat_columns),
    ],
    n_jobs=-1,
)
encoded_features = preprocessor.fit_transform(df)

# Feature names after the transformations (computed once and reused).
feature_names = preprocessor.get_feature_names_out()

# Reuse df's index so the concat below aligns rows by label.
transformed_df = pd.DataFrame(encoded_features, columns=feature_names, index=df.index)

# Drop transformed columns left over from a previous execution of this cell,
# so re-running it does not append duplicate columns to df.
df = pd.concat(
    [df.drop(columns=list(feature_names), errors="ignore"), transformed_df],
    axis=1,
)

df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,num__kbd__monthly_charges,num__kbd__total_charges,cat__type_One year,cat__type_Two year,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_None,cat__gender_Male
0,2744,9637-CDTKZ,2014-02-01,NaT,Two year,Yes,Credit card (automatic),114.1,8086.4,Fiber optic,...,4.0,4.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
1,2746,3946-JEWRQ,2016-03-01,NaT,One year,Yes,Credit card (automatic),95.2,4563.0,Fiber optic,...,3.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


### Задание 4
Чтобы визуализировать получившееся общее преобразование, просто посмотрите на значение переменной preprocessor.
Объект preprocessor класса ColumnTransformer, объявленный в предыдущем задании, можно сохранить в MLflow, в директорию preprocessor — точно так же, как и модель.

In [6]:
import os
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from dotenv import load_dotenv
import shutil
import tempfile
import pickle

# Load variables from the project's .env file.
load_dotenv('../.env')

# CRITICAL: the S3 settings must be in place BEFORE any MLflow operation.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# Copy the credentials only when they are present. Assigning None to
# os.environ raises TypeError, which would pre-empt the explicit check below
# and hide its error message.
for aws_variable, dotenv_variable in (
    ("AWS_ACCESS_KEY_ID", "S3_ACCESS_KEY"),
    ("AWS_SECRET_ACCESS_KEY", "S3_SECRET_KEY"),
):
    dotenv_value = os.getenv(dotenv_variable)
    if dotenv_value:
        os.environ[aws_variable] = dotenv_value

# Report which environment variables are set, without printing the secrets.
print("–ü—Ä–æ–≤–µ—Ä–∫–∞ –ø–µ—Ä–µ–º–µ–Ω–Ω—ã—Ö –æ–∫—Ä—É–∂–µ–Ω–∏—è:")
print(f"MLFLOW_S3_ENDPOINT_URL: {os.environ.get('MLFLOW_S3_ENDPOINT_URL')}")
print(f"AWS_ACCESS_KEY_ID: {'–£—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞' if os.environ.get('AWS_ACCESS_KEY_ID') else '–ù–ï –£–°–¢–ê–ù–û–í–õ–ï–ù–ê ‚ùå'}")
print(f"AWS_SECRET_ACCESS_KEY: {'–£—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞' if os.environ.get('AWS_SECRET_ACCESS_KEY') else '–ù–ï –£–°–¢–ê–ù–û–í–õ–ï–ù–ê ‚ùå'}")

# Fail early with a clear message when any credential is missing.
if not os.environ.get('AWS_ACCESS_KEY_ID') or not os.environ.get('AWS_SECRET_ACCESS_KEY'):
    raise ValueError("‚ùå AWS credentials –ù–ï —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω—ã! –ü—Ä–æ–≤–µ—Ä—å—Ç–µ —Ñ–∞–π–ª .env")

print("‚úÖ –í—Å–µ –ø–µ—Ä–µ–º–µ–Ω–Ω—ã–µ –æ–∫—Ä—É–∂–µ–Ω–∏—è —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω—ã –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ\n")

# IMPORTANT (per the original author): re-create the MLflow client so it sees
# the new environment variables.
# NOTE(review): reloading the top-level package looks like a workaround;
# confirm it is still needed before removing it.
import importlib
importlib.reload(mlflow)
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_experiment(EXPERIMENT_NAME)

# Look the experiment up by name and keep its id for the run below.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = experiment.experiment_id
print(f"‚úÖ –ù–∞–π–¥–µ–Ω —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç '{EXPERIMENT_NAME}' —Å ID: {experiment_id}\n")

# Signature inputs: the first five rows of the raw feature columns ...
input_example = df[num_columns + cat_columns].iloc[:5]

# ... and what the fitted preprocessor produces for them.
output_example = preprocessor.transform(input_example)

# Model signature inferred from the example input/output pair.
signature = infer_signature(input_example, output_example)

print(
    "–ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è:",
    f"  Input shape: {input_example.shape}",
    f"  Output shape: {output_example.shape}",
    f"  Signature: {signature}\n",
    sep="\n",
)

# Run that stores the preprocessor: it is saved locally in MLflow's sklearn
# format and the resulting directory is uploaded with log_artifacts.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    current_run_id = run.info.run_id

    print(f"‚úÖ Run ID: {current_run_id}")
    print(f"‚úÖ Run Name: {RUN_NAME}")
    print(f"‚úÖ Artifact URI: {run.info.artifact_uri}\n")

    # Parameters describing the preprocessing configuration, logged in order.
    preprocessing_params = {
        "num_columns": str(num_columns),
        "cat_columns": str(cat_columns),
        "n_knots": n_knots,
        "degree_spline": degree_spline,
        "n_quantiles": n_quantiles,
        "polynomial_degree": degree,
        "n_bins": n_bins,
        "total_features": output_example.shape[1],
    }
    for param_name, param_value in preprocessing_params.items():
        mlflow.log_param(param_name, param_value)

    # Save the preprocessor via a local directory.
    print("–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ preprocessor –≤ MLflow —á–µ—Ä–µ–∑ log_artifact...")
    try:
        # Temporary directory that receives the serialized model.
        with tempfile.TemporaryDirectory() as tmpdir:
            model_path = os.path.join(tmpdir, "column_transformer")

            # Write the model locally with mlflow.sklearn.save_model ...
            mlflow.sklearn.save_model(
                sk_model=preprocessor,
                path=model_path,
                signature=signature,
                input_example=input_example,
            )

            # ... then log the whole directory as an artifact.
            mlflow.log_artifacts(model_path, artifact_path="column_transformer")

        print(f"\n‚úÖ Preprocessor —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω—ë–Ω —á–µ—Ä–µ–∑ log_artifacts!")

    except Exception as e:
        # Show the failure with its traceback, then let it propagate.
        print(f"\n‚ùå –û–®–ò–ë–ö–ê –ø—Ä–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–∏: {e}")
        import traceback
        traceback.print_exc()
        raise

    print(f"Artifact URI: {mlflow.get_artifact_uri()}")
    print(f"View run: http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}/#/experiments/{experiment_id}/runs/{current_run_id}")

# Keep the run id for later use.
run_id = current_run_id

# Check that the preprocessor can be loaded back from the artifact store.
print("\n" + "="*70)
print("–ü—Ä–æ–≤–µ—Ä–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ preprocessor...\n")

# Give the upload to S3 some time to finish.
# NOTE(review): the pause looks precautionary; confirm it is actually needed.
import time
print("–û–∂–∏–¥–∞–Ω–∏–µ 3 —Å–µ–∫—É–Ω–¥—ã –¥–ª—è –∑–∞–≤–µ—Ä—à–µ–Ω–∏—è –∑–∞–≥—Ä—É–∑–∫–∏ –≤ S3...")
time.sleep(3)

try:
    # Load the model from MLflow by run id and artifact path.
    model_uri = f"runs:/{run_id}/column_transformer"
    print(f"–ó–∞–≥—Ä—É–∑–∫–∞ –∏–∑: {model_uri}")

    loaded_preprocessor = mlflow.sklearn.load_model(model_uri)

    # Smoke-test the loaded preprocessor on the first three rows.
    test_data = df[num_columns + cat_columns].iloc[:3]
    test_result = loaded_preprocessor.transform(test_data)

except Exception as e:
    # Best effort: report the failure and carry on with the rest of the cell.
    print(f"\n‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –∑–∞–≥—Ä—É–∑–∫–µ: {e}")
    import traceback
    traceback.print_exc()

else:
    print(f"\n‚úÖ Preprocessor —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω –∏ —Ä–∞–±–æ—Ç–∞–µ—Ç!")
    print(f"–¢–µ—Å—Ç–æ–≤–∞—è —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–∞—Ü–∏—è: {test_data.shape} -> {test_result.shape}")

print("\n" + "="*70)
print(f"\nüìù –í–∞—à run_id:")
print(f'run_id = "{run_id}"')

# Final check: list the run's artifacts directly in S3 through boto3.
print("\n" + "="*70)
print("–§–∏–Ω–∞–ª—å–Ω–∞—è –ø—Ä–æ–≤–µ—Ä–∫–∞ S3 —á–µ—Ä–µ–∑ boto3...\n")

try:
    import boto3

    s3_client = boto3.client(
        's3',
        endpoint_url=os.environ.get('MLFLOW_S3_ENDPOINT_URL'),
        aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
        region_name='ru-central1'
    )

    bucket_name = os.getenv('S3_BUCKET_NAME')
    # Build the key prefix from the actual experiment id instead of a
    # hard-coded "5", so the check keeps working for any experiment.
    # Layout (per the artifact URI printed for the run):
    # <experiment_id>/<run_id>/artifacts/
    prefix = f"{experiment_id}/{run_id}/artifacts/"

    response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

    if 'Contents' in response:
        print(f"‚úÖ –í S3 –Ω–∞–π–¥–µ–Ω–æ {len(response['Contents'])} —Ñ–∞–π–ª–æ–≤:")
        for obj in response['Contents']:
            print(f"  - {obj['Key']} ({obj['Size']} bytes)")
    else:
        # No objects under the prefix: explain that MLflow may be using
        # local storage instead of S3.
        print(f"‚ùå –í S3 –ù–ï–¢ —Ñ–∞–π–ª–æ–≤ –ø–æ –ø—É—Ç–∏ {prefix}")
        print("\n‚ö†Ô∏è  –í–ê–ñ–ù–û: –ï—Å–ª–∏ —Ñ–∞–π–ª–æ–≤ –Ω–µ—Ç –≤ S3, –Ω–æ –∑–∞–≥—Ä—É–∑–∫–∞ —Ä–∞–±–æ—Ç–∞–µ—Ç,")
        print("  –∑–Ω–∞—á–∏—Ç MLflow –∏—Å–ø–æ–ª—å–∑—É–µ—Ç –ª–æ–∫–∞–ª—å–Ω–æ–µ —Ö—Ä–∞–Ω–∏–ª–∏—â–µ.")
        print("  –ê—Ä—Ç–µ—Ñ–∞–∫—Ç—ã —Å–æ—Ö—Ä–∞–Ω—è—é—Ç—Å—è, –Ω–æ –Ω–µ –≤ S3.")

except Exception as e:
    # Best effort: the S3 listing is only a diagnostic, so report and continue.
    print(f"‚ö†Ô∏è  –ù–µ —É–¥–∞–ª–æ—Å—å –ø—Ä–æ–≤–µ—Ä–∏—Ç—å S3: {e}")
    import traceback
    traceback.print_exc()

–ü—Ä–æ–≤–µ—Ä–∫–∞ –ø–µ—Ä–µ–º–µ–Ω–Ω—ã—Ö –æ–∫—Ä—É–∂–µ–Ω–∏—è:
MLFLOW_S3_ENDPOINT_URL: https://storage.yandexcloud.net
AWS_ACCESS_KEY_ID: –£—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞
AWS_SECRET_ACCESS_KEY: –£—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞
‚úÖ –í—Å–µ –ø–µ—Ä–µ–º–µ–Ω–Ω—ã–µ –æ–∫—Ä—É–∂–µ–Ω–∏—è —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω—ã –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ

‚úÖ –ù–∞–π–¥–µ–Ω —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç 'customer-churn-prediction' —Å ID: 5

–ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è:
  Input shape: (5, 6)
  Output shape: (5, 36)
  Signature: inputs: 
  ['monthly_charges': double (required), 'total_charges': double (required), 'type': string (required), 'payment_method': string (required), 'internet_service': string (optional), 'gender': string (required)]
outputs: 
  [Tensor('float64', (-1, 36))]
params: 
  None


‚úÖ Run ID: 303d5cf950a544f9980fe5311ad3b804
‚úÖ Run Name: preprocessing
‚úÖ Artifact URI: s3://s3-student-mle-20250507-39f5f3ff21-freetrack/5/303d5cf950a544f9980fe5311ad3b804/artifacts





–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ preprocessor –≤ MLflow —á–µ—Ä–µ–∑ log_artifact...

‚úÖ Preprocessor —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω—ë–Ω —á–µ—Ä–µ–∑ log_artifacts!
Artifact URI: s3://s3-student-mle-20250507-39f5f3ff21-freetrack/5/303d5cf950a544f9980fe5311ad3b804/artifacts
View run: http://127.0.0.1:5001/#/experiments/5/runs/303d5cf950a544f9980fe5311ad3b804
üèÉ View run preprocessing at: http://127.0.0.1:5001/#/experiments/5/runs/303d5cf950a544f9980fe5311ad3b804
üß™ View experiment at: http://127.0.0.1:5001/#/experiments/5

–ü—Ä–æ–≤–µ—Ä–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ preprocessor...

–û–∂–∏–¥–∞–Ω–∏–µ 3 —Å–µ–∫—É–Ω–¥—ã –¥–ª—è –∑–∞–≤–µ—Ä—à–µ–Ω–∏—è –∑–∞–≥—Ä—É–∑–∫–∏ –≤ S3...
–ó–∞–≥—Ä—É–∑–∫–∞ –∏–∑: runs:/303d5cf950a544f9980fe5311ad3b804/column_transformer


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


‚úÖ Preprocessor —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω –∏ —Ä–∞–±–æ—Ç–∞–µ—Ç!
–¢–µ—Å—Ç–æ–≤–∞—è —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–∞—Ü–∏—è: (3, 6) -> (3, 36)


üìù –í–∞—à run_id:
run_id = "303d5cf950a544f9980fe5311ad3b804"

–§–∏–Ω–∞–ª—å–Ω–∞—è –ø—Ä–æ–≤–µ—Ä–∫–∞ S3 —á–µ—Ä–µ–∑ boto3...

‚úÖ –í S3 –Ω–∞–π–¥–µ–Ω–æ 7 —Ñ–∞–π–ª–æ–≤:
  - 5/303d5cf950a544f9980fe5311ad3b804/artifacts/column_transformer/MLmodel (1031 bytes)
  - 5/303d5cf950a544f9980fe5311ad3b804/artifacts/column_transformer/conda.yaml (209 bytes)
  - 5/303d5cf950a544f9980fe5311ad3b804/artifacts/column_transformer/input_example.json (495 bytes)
  - 5/303d5cf950a544f9980fe5311ad3b804/artifacts/column_transformer/model.pkl (13080 bytes)
  - 5/303d5cf950a544f9980fe5311ad3b804/artifacts/column_transformer/python_env.yaml (120 bytes)
  - 5/303d5cf950a544f9980fe5311ad3b804/artifacts/column_transformer/requirements.txt (94 bytes)
  - 5/303d5cf950a544f9980fe5311ad3b804/artifacts/column_transformer/serving_input_example.json (890 bytes)


### Задание 5
Теперь обучите новую модель с новыми признаками, полученными при помощи ваших энкодеров. Посмотрите, как изменилось качество модели на валидационной или тестовой выборке. Зарегистрируйте полученную версию модели.

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Data preparation banner.
print(
    "="*70,
    "–û–ë–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ò –° –ù–û–í–´–ú–ò –ü–†–ò–ó–ù–ê–ö–ê–ú–ò",
    "="*70,
    sep="\n",
)

# Target variable as a NumPy array.
y = df['target'].to_numpy()

# Feature matrix produced by the already-fitted preprocessor.
# NOTE(review): the preprocessor was fitted on the full frame before this
# split, so test rows influenced the fitted statistics - confirm acceptable.
X_transformed = preprocessor.transform(df[num_columns + cat_columns])

print(
    f"\nüìä –†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –¥–∞–Ω–Ω—ã—Ö:",
    f"  –ò—Å—Ö–æ–¥–Ω—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏: {len(num_columns + cat_columns)}",
    f"  –ü–æ—Å–ª–µ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è: {X_transformed.shape[1]}",
    f"  –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–∏–º–µ—Ä–æ–≤: {X_transformed.shape[0]}",
    f"  –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–ª–∞—Å—Å–æ–≤: {np.bincount(y)}",
    sep="\n",
)

# Stratified 80/20 train/test split with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, **split_options)

print(
    f"\nüì¶ –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö:",
    f"  Train: {X_train.shape[0]} –ø—Ä–∏–º–µ—Ä–æ–≤",
    f"  Test: {X_test.shape[0]} –ø—Ä–∏–º–µ—Ä–æ–≤",
    sep="\n",
)

# Model training.
print(f"\nü§ñ –û–±—É—á–µ–Ω–∏–µ RandomForestClassifier...")

# Hyperparameters of the random forest, collected in one place.
rf_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

print(f"‚úÖ –ú–æ–¥–µ–ª—å –æ–±—É—á–µ–Ω–∞!")

# Predictions: hard labels for both splits, positive-class probability for test.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)[:, 1]

# Metrics: the label-based scorers are shared by both splits; ROC AUC needs
# probabilities and is reported for the test split only.
label_scorers = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
}
train_metrics = {
    name: scorer(y_train, y_train_pred) for name, scorer in label_scorers.items()
}
test_metrics = {
    name: scorer(y_test, y_test_pred) for name, scorer in label_scorers.items()
}
test_metrics["roc_auc"] = roc_auc_score(y_test, y_test_proba)

# Print both metric tables, train first, with four decimals.
for metrics_header, metrics_table in (
    (f"\nüìà –ú–µ—Ç—Ä–∏–∫–∏ –Ω–∞ –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–µ:", train_metrics),
    (f"\nüìà –ú–µ—Ç—Ä–∏–∫–∏ –Ω–∞ —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–µ:", test_metrics),
):
    print(metrics_header)
    for metric_name, metric_value in metrics_table.items():
        print(f"  {metric_name}: {metric_value:.4f}")

print(f"\n‚úÖ –ú–æ–¥–µ–ª—å –≥–æ—Ç–æ–≤–∞ –∫ —Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏–∏!")

–û–ë–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ò –° –ù–û–í–´–ú–ò –ü–†–ò–ó–ù–ê–ö–ê–ú–ò

üìä –†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –¥–∞–Ω–Ω—ã—Ö:
  –ò—Å—Ö–æ–¥–Ω—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏: 6
  –ü–æ—Å–ª–µ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è: 36
  –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–∏–º–µ—Ä–æ–≤: 7043
  –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–ª–∞—Å—Å–æ–≤: [5174 1869]

üì¶ –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö:
  Train: 5634 –ø—Ä–∏–º–µ—Ä–æ–≤
  Test: 1409 –ø—Ä–∏–º–µ—Ä–æ–≤

ü§ñ –û–±—É—á–µ–Ω–∏–µ RandomForestClassifier...
‚úÖ –ú–æ–¥–µ–ª—å –æ–±—É—á–µ–Ω–∞!

üìà –ú–µ—Ç—Ä–∏–∫–∏ –Ω–∞ –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–µ:
  accuracy: 0.8683
  precision: 0.8365
  recall: 0.6261
  f1: 0.7161

üìà –ú–µ—Ç—Ä–∏–∫–∏ –Ω–∞ —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–µ:
  accuracy: 0.7991
  precision: 0.6730
  recall: 0.4733
  f1: 0.5557
  roc_auc: 0.8356

‚úÖ –ú–æ–¥–µ–ª—å –≥–æ—Ç–æ–≤–∞ –∫ —Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏–∏!


In [8]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import tempfile
import os
import boto3
import traceback

# Make sure the S3-related environment variables are set.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# Copy each credential only when it is present. Assigning None to os.environ
# raises an opaque TypeError, so a missing value is reported explicitly
# instead (unless the AWS_* variable is already set in the environment).
for aws_variable, dotenv_variable in (
    ("AWS_ACCESS_KEY_ID", "S3_ACCESS_KEY"),
    ("AWS_SECRET_ACCESS_KEY", "S3_SECRET_KEY"),
):
    dotenv_value = os.getenv(dotenv_variable)
    if dotenv_value:
        os.environ[aws_variable] = dotenv_value
    elif not os.environ.get(aws_variable):
        raise ValueError(
            f"{dotenv_variable} is not set; cannot configure {aws_variable} for S3 access"
        )

# MLflow setup.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_experiment(EXPERIMENT_NAME)

print("="*70)
print("–õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ò –†–ï–ì–ò–°–¢–†–ê–¶–ò–Ø –ú–û–î–ï–õ–ò –í MLFLOW")
print("="*70)

# Input example for the signature: the first five transformed test rows and
# the model's predictions for them.
input_example = X_test[:5]
signature = infer_signature(input_example, model.predict(input_example))

# Start a new run for the model.
with mlflow.start_run(run_name="model_training_with_features") as run:
    model_run_id = run.info.run_id

    print(f"\n‚úÖ Run ID: {model_run_id}")
    print(f"‚úÖ Artifact URI: {run.info.artifact_uri}\n")

    # Log the model parameters. The values are read from the fitted estimator
    # rather than repeated as literals, so the logged hyperparameters cannot
    # drift from the model that was actually trained.
    model_params = model.get_params()
    mlflow.log_param("model_type", type(model).__name__)
    for param_name in ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf"):
        mlflow.log_param(param_name, model_params[param_name])
    mlflow.log_param("n_features", X_transformed.shape[1])

    # Log the metrics, prefixed by the split they were computed on.
    print("üìä –õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –º–µ—Ç—Ä–∏–∫...")
    for metric_name, metric_value in train_metrics.items():
        mlflow.log_metric(f"train_{metric_name}", metric_value)

    for metric_name, metric_value in test_metrics.items():
        mlflow.log_metric(f"test_{metric_name}", metric_value)

    print("‚úÖ –ú–µ—Ç—Ä–∏–∫–∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω—ã")

    # Save the model through log_artifacts (same approach as the preprocessor).
    print("\nüíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ –≤ MLflow...")

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            model_path = os.path.join(tmpdir, REGISTRY_MODEL_NAME)

            # Save the model locally in MLflow's sklearn format.
            mlflow.sklearn.save_model(
                sk_model=model,
                path=model_path,
                signature=signature,
                input_example=input_example
            )

            # Log the directory as an artifact of the run.
            mlflow.log_artifacts(model_path, artifact_path=REGISTRY_MODEL_NAME)

        print(f"‚úÖ –ú–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞!")

    except Exception as e:
        # Show the failure with its traceback, then let it propagate.
        print(f"‚ùå –û–®–ò–ë–ö–ê –ø—Ä–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–∏ –º–æ–¥–µ–ª–∏: {e}")
        traceback.print_exc()
        raise

# The experiment id comes from the run itself, so this link does not depend on
# a variable defined in another cell.
print(f"\nüîó View run: http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}/#/experiments/{run.info.experiment_id}/runs/{model_run_id}")

# Register the model in the Model Registry.
print(f"\n{'='*70}")
print("–†–ï–ì–ò–°–¢–†–ê–¶–ò–Ø –ú–û–î–ï–õ–ò –í MODEL REGISTRY")
print("="*70)

model_uri = f"runs:/{model_run_id}/{REGISTRY_MODEL_NAME}"

print(f"\nüìù –†–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏—è –º–æ–¥–µ–ª–∏...")
print(f"  Model URI: {model_uri}")
print(f"  Model Name: {REGISTRY_MODEL_NAME}")

try:
    # Register the logged artifact as a new version of the named model.
    model_version = mlflow.register_model(
        model_uri=model_uri,
        name=REGISTRY_MODEL_NAME,
        tags={"features": "transformed", "preprocessor": "column_transformer"}
    )

    print(f"\n‚úÖ –ú–æ–¥–µ–ª—å –∑–∞—Ä–µ–≥–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∞!")
    print(f"  Model Name: {model_version.name}")
    print(f"  Version: {model_version.version}")
    print(f"  Run ID: {model_version.run_id}")

    # Keep the registration details for the task summary.
    model_version_id = model_version.version
    model_registred_name = model_version.name
    model_run_id_final = model_version.run_id

except Exception as e:
    print(f"\n‚ùå –û–®–ò–ë–ö–ê –ø—Ä–∏ —Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏–∏ –º–æ–¥–µ–ª–∏: {e}")
    traceback.print_exc()
    # Re-raise: the summary at the end of the cell reads the three variables
    # assigned above, so swallowing the error here would only resurface later
    # as an unrelated NameError.
    raise

# S3 check: list the objects stored under this run's artifact prefix.
print(f"\n{'='*70}")
print("–ü–†–û–í–ï–†–ö–ê –°–û–•–†–ê–ù–ï–ù–ò–Ø –í S3")
print("="*70)

import time
# Short pause before listing the bucket.
print("\n–û–∂–∏–¥–∞–Ω–∏–µ 3 —Å–µ–∫—É–Ω–¥—ã...")
time.sleep(3)

try:
    # Endpoint and credentials are read from the environment.
    s3_client = boto3.client(
        's3',
        endpoint_url=os.getenv('MLFLOW_S3_ENDPOINT_URL'),
        aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
        region_name='ru-central1'
    )

    bucket_name = os.getenv('S3_BUCKET_NAME')
    # Build the prefix from the actual experiment id instead of a hard-coded "5",
    # so the check keeps working when the experiment (or tracking server) changes.
    prefix = f"{experiment.experiment_id}/{model_run_id}/artifacts/"

    # NOTE: a single list_objects_v2 call returns at most 1000 keys; enough for
    # one model directory, but the count below is capped at that page size.
    response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

    if 'Contents' in response:
        print(f"\n‚úÖ –í S3 –Ω–∞–π–¥–µ–Ω–æ {len(response['Contents'])} —Ñ–∞–π–ª–æ–≤:")
        for obj in response['Contents'][:10]:  # show only the first 10 objects
            print(f"  - {obj['Key']} ({obj['Size']} bytes)")
        if len(response['Contents']) > 10:
            print(f"  ... –∏ –µ—â—ë {len(response['Contents']) - 10} —Ñ–∞–π–ª–æ–≤")
    else:
        print(f"\n‚ö†Ô∏è  –í S3 –ø–æ–∫–∞ –Ω–µ—Ç —Ñ–∞–π–ª–æ–≤ (–æ–Ω–∏ –º–æ–≥—É—Ç –±—ã—Ç—å –≤ –ø—Ä–æ—Ü–µ—Å—Å–µ –∑–∞–≥—Ä—É–∑–∫–∏)")

except Exception as e:
    # Deliberately best-effort: this step is only a diagnostic listing, so a
    # failure here is reported but does not abort the cell.
    print(f"\n‚ö†Ô∏è  –û—à–∏–±–∫–∞ –ø—Ä–æ–≤–µ—Ä–∫–∏ S3: {e}")

# Final summary for task 5: banner followed by the registered version number,
# the registry model name and the run id, emitted with a single print call.
print(
    f"\n{'='*70}",
    "–ò–¢–û–ì–û–í–ê–Ø –ò–ù–§–û–†–ú–ê–¶–ò–Ø –î–õ–Ø –ó–ê–î–ê–ù–ò–Ø 5",
    "="*70,
    f"\nmodel_version_id = {model_version_id}",
    f'model_registred_name = "{model_registred_name}"',
    f'run_id = "{model_run_id_final}"',
    sep="\n",
)

–õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ò –†–ï–ì–ò–°–¢–†–ê–¶–ò–Ø –ú–û–î–ï–õ–ò –í MLFLOW

‚úÖ Run ID: 83f3cab20d5f4055b65f801167e942f2
‚úÖ Artifact URI: s3://s3-student-mle-20250507-39f5f3ff21-freetrack/5/83f3cab20d5f4055b65f801167e942f2/artifacts

üìä –õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –º–µ—Ç—Ä–∏–∫...
‚úÖ –ú–µ—Ç—Ä–∏–∫–∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω—ã

üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ –≤ MLflow...
‚úÖ –ú–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞!
üèÉ View run model_training_with_features at: http://127.0.0.1:5001/#/experiments/5/runs/83f3cab20d5f4055b65f801167e942f2
üß™ View experiment at: http://127.0.0.1:5001/#/experiments/5

üîó View run: http://127.0.0.1:5001/#/experiments/5/runs/83f3cab20d5f4055b65f801167e942f2

–†–ï–ì–ò–°–¢–†–ê–¶–ò–Ø –ú–û–î–ï–õ–ò –í MODEL REGISTRY

üìù –†–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏—è –º–æ–¥–µ–ª–∏...
  Model URI: runs:/83f3cab20d5f4055b65f801167e942f2/churn-classifier
  Model Name: churn-classifier


Successfully registered model 'churn-classifier'.
2025/12/05 14:30:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: churn-classifier, version 1
Created version '1' of model 'churn-classifier'.



‚úÖ –ú–æ–¥–µ–ª—å –∑–∞—Ä–µ–≥–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∞!
  Model Name: churn-classifier
  Version: 1
  Run ID: 83f3cab20d5f4055b65f801167e942f2

–ü–†–û–í–ï–†–ö–ê –°–û–•–†–ê–ù–ï–ù–ò–Ø –í S3

–û–∂–∏–¥–∞–Ω–∏–µ 3 —Å–µ–∫—É–Ω–¥—ã...

‚úÖ –í S3 –Ω–∞–π–¥–µ–Ω–æ 7 —Ñ–∞–π–ª–æ–≤:
  - 5/83f3cab20d5f4055b65f801167e942f2/artifacts/churn-classifier/MLmodel (886 bytes)
  - 5/83f3cab20d5f4055b65f801167e942f2/artifacts/churn-classifier/conda.yaml (209 bytes)
  - 5/83f3cab20d5f4055b65f801167e942f2/artifacts/churn-classifier/input_example.json (2477 bytes)
  - 5/83f3cab20d5f4055b65f801167e942f2/artifacts/churn-classifier/model.pkl (4201880 bytes)
  - 5/83f3cab20d5f4055b65f801167e942f2/artifacts/churn-classifier/python_env.yaml (120 bytes)
  - 5/83f3cab20d5f4055b65f801167e942f2/artifacts/churn-classifier/requirements.txt (94 bytes)
  - 5/83f3cab20d5f4055b65f801167e942f2/artifacts/churn-classifier/serving_input_example.json (3627 bytes)

–ò–¢–û–ì–û–í–ê–Ø –ò–ù–§–û–†–ú–ê–¶–ò–Ø –î–õ–Ø –ó–ê–î–ê–ù–ò–Ø 5

model_version

In [9]:
# Task 5: final values for the assignment check.
# The three names are bound explicitly in this cell; their values come from
# the model registration step earlier in the notebook.

model_version_id = model_version_id  # version number of the registered model
model_registred_name = model_registred_name  # name of the registered model
run_id = model_run_id_final  # run_id of the run in which the model was registered

print(
    "‚úÖ –ó–Ω–∞—á–µ–Ω–∏—è –¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏ –∑–∞–¥–∞–Ω–∏—è:",
    f"model_version_id = {model_version_id}",
    f'model_registred_name = "{model_registred_name}"',
    f'run_id = "{run_id}"',
    sep="\n",
)

‚úÖ –ó–Ω–∞—á–µ–Ω–∏—è –¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏ –∑–∞–¥–∞–Ω–∏—è:
model_version_id = 1
model_registred_name = "churn-classifier"
run_id = "83f3cab20d5f4055b65f801167e942f2"
