In [36]:
import os

import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)
import psycopg
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split



# Name of the Postgres table that holds the cleaned churn data.
TABLE_NAME =  'clean_users_churn'

# Host and port of the MLflow tracking server; the same address is also used
# as the model-registry URI further down.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# MLflow experiment name, run name, and the name under which the model is
# registered in the model registry.
EXPERIMENT_NAME = 'real_churn_Andrey'
RUN_NAME = "preprocessing" 
REGISTRY_MODEL_NAME = "real_churn_model_Andrey"

In [27]:

# S3-compatible endpoint of the Yandex Cloud bucket that MLflow talks to.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The bucket credentials must already be present in the process environment
# (e.g. exported from .env before the kernel starts). Copying os.getenv(...)
# back into os.environ is a no-op when the variable is set and raises an
# opaque TypeError when it is not (os.environ only accepts str), so validate
# explicitly and fail with a readable message instead.
missing_s3_vars = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.getenv(name)
]
if missing_s3_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_s3_vars)
    )

# Tracking server and model registry live at the same address.
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_registry_uri(tracking_uri)

In [2]:
# Fixed connection options; the credentials read from the environment are
# merged in below.
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    param: os.environ[env_var]
    for param, env_var in (
        ("host", "DB_DESTINATION_HOST"),
        ("port", "DB_DESTINATION_PORT"),
        ("dbname", "DB_DESTINATION_NAME"),
        ("user", "DB_DESTINATION_USER"),
        ("password", "DB_DESTINATION_PASSWORD"),
    )
}
# Every credential must be a non-empty string.
assert all(var_value != "" for var_value in postgres_credentials.values())

connection.update(postgres_credentials)

# Name of the table that stores our data.
TABLE_NAME = "clean_users_churn"

# Both context managers are released when the block exits, even on error,
# so neither the connection nor the cursor is leaked.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    # Pull the whole table.
    cur.execute(f"SELECT * FROM {TABLE_NAME}")

    # All rows returned by the query.
    data = cur.fetchall()

    # Column names are the first element of each cursor description entry.
    columns = [col[0] for col in cur.description]

# Wrap rows and column names in a DataFrame for further work with pandas.
df = pd.DataFrame(data, columns=columns)

In [3]:
# Display the table loaded from Postgres as the cell output.
df

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,14,5918-VUKWP,2017-06-01,NaT,One year,No,Bank transfer (automatic),20.55,654.55,Fiber optic,...,No,No,No,No,Female,0,No,No,No,0
1,15,1744-JHKYS,2017-04-01,NaT,Month-to-month,No,Electronic check,24.70,780.20,Fiber optic,...,No,No,No,No,Female,0,Yes,No,Yes,0
2,16,2984-RGEYA,2014-05-01,NaT,Two year,No,Bank transfer (automatic),19.75,1375.40,Fiber optic,...,No,No,No,No,Female,0,Yes,Yes,No,0
3,17,9680-NIAUV,2014-02-01,NaT,Two year,No,Credit card (automatic),109.70,8129.30,Fiber optic,...,Yes,No,Yes,Yes,Female,0,Yes,Yes,Yes,0
4,18,2146-EGVDT,2015-03-01,NaT,Two year,No,Credit card (automatic),19.30,1192.70,Fiber optic,...,No,No,No,No,Male,0,Yes,Yes,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7014,7015,6502-MJQAE,2019-11-01,2019-12-01,Month-to-month,Yes,Electronic check,69.60,69.60,Fiber optic,...,No,No,No,No,Male,0,No,No,No,1
7015,7016,6257-DTAYD,2014-03-01,NaT,Two year,Yes,Credit card (automatic),104.15,7365.30,Fiber optic,...,No,Yes,Yes,Yes,Male,0,Yes,No,Yes,0
7016,7017,4616-ULAOA,2014-09-01,NaT,Two year,Yes,Credit card (automatic),110.80,7245.90,Fiber optic,...,Yes,Yes,Yes,Yes,Female,0,Yes,Yes,Yes,0
7017,7018,7693-LCKZL,2019-05-01,2019-10-01,Month-to-month,Yes,Electronic check,80.15,385.00,Fiber optic,...,No,No,No,No,Male,0,Yes,Yes,Yes,1


In [4]:
# Keep only the object-dtype (string-like) columns of the loaded table.
obj_df = df.select_dtypes(include=["object"])

In [5]:
# Display the object-dtype subset as the cell output.
obj_df

Unnamed: 0,customer_id,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,partner,dependents,multiple_lines
0,5918-VUKWP,One year,No,Bank transfer (automatic),Fiber optic,No,No,No,No,No,No,Female,No,No,No
1,1744-JHKYS,Month-to-month,No,Electronic check,Fiber optic,No,No,No,No,No,No,Female,Yes,No,Yes
2,2984-RGEYA,Two year,No,Bank transfer (automatic),Fiber optic,No,No,No,No,No,No,Female,Yes,Yes,No
3,9680-NIAUV,Two year,No,Credit card (automatic),Fiber optic,Yes,Yes,Yes,No,Yes,Yes,Female,Yes,Yes,Yes
4,2146-EGVDT,Two year,No,Credit card (automatic),Fiber optic,No,No,No,No,No,No,Male,Yes,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7014,6502-MJQAE,Month-to-month,Yes,Electronic check,Fiber optic,No,No,No,No,No,No,Male,No,No,No
7015,6257-DTAYD,Two year,Yes,Credit card (automatic),Fiber optic,No,Yes,No,Yes,Yes,Yes,Male,Yes,No,Yes
7016,4616-ULAOA,Two year,Yes,Credit card (automatic),Fiber optic,No,Yes,Yes,Yes,Yes,Yes,Female,Yes,Yes,Yes
7017,7693-LCKZL,Month-to-month,Yes,Electronic check,Fiber optic,No,Yes,No,No,No,No,Male,Yes,Yes,Yes


In [6]:
# Categorical columns that are passed to the one-hot encoder; this is a subset
# of the object-dtype columns, not all of them.
cat_columns = ["type", "payment_method", "internet_service", "gender"]

In [7]:
# One-hot encode the categorical columns. The first level of each feature is
# dropped, at most 10 categories are kept per feature, and the output is dense.
encoder_oh = OneHotEncoder(
    categories='auto',
    drop='first',
    handle_unknown='ignore',
    max_categories=10,
    sparse_output=False,
)
encoded_features = encoder_oh.fit_transform(df[cat_columns].to_numpy())

# The encoder is fitted on a bare numpy array, so it has no column names of
# its own. Pass them explicitly to get "type_One year"-style names instead of
# the anonymous "x0_One year", matching the numeric encoder cells.
# Reusing df's index keeps the column-wise concat aligned row by row.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder_oh.get_feature_names_out(cat_columns),
    index=df.index,
)

# Rebuild obj_df from df rather than appending to the previous obj_df, so
# re-running this cell does not pile up duplicate encoded columns.
obj_df = pd.concat([df.select_dtypes(include="object"), encoded_df], axis=1)


In [8]:
# Numeric columns that every numeric encoder below is fitted on.
num_columns = ["monthly_charges", "total_charges"]
# Base frame; the encoder cells below concatenate their derived features to it.
num_df = df[num_columns]

# SplineTransformer settings.
n_knots = 3
degree_spline = 4
# QuantileTransformer setting.
n_quantiles=100
# PolynomialFeatures degree.
degree = 3
# KBinsDiscretizer settings.
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None


In [9]:
# Spline basis expansion of the numeric columns.
encoder_spl = SplineTransformer(degree=degree_spline, n_knots=n_knots)
encoded_features = encoder_spl.fit_transform(df[num_columns].to_numpy())

# Name the basis columns after the source columns and append them to num_df.
encoded_df = pd.DataFrame(encoded_features, columns=encoder_spl.get_feature_names_out(num_columns))
num_df = pd.concat([num_df, encoded_df], axis="columns")


In [10]:
# Display the numeric frame with the spline features appended.
num_df

Unnamed: 0,monthly_charges,total_charges,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,total_charges_sp_4,total_charges_sp_5
0,20.55,654.55,0.034546,0.434971,0.480648,0.049834,1.828756e-07,0.000000e+00,0.022088,0.381092,0.524849,0.071952,1.930988e-05,0.000000
1,24.70,780.20,0.024051,0.391047,0.517404,0.067486,1.131054e-05,0.000000e+00,0.019235,0.365307,0.536000,0.079418,3.972697e-05,0.000000
2,19.75,1375.40,0.036910,0.443198,0.473023,0.046869,3.308335e-08,0.000000e+00,0.009277,0.291028,0.577428,0.121867,4.003513e-04,0.000000
3,109.70,8129.30,0.000000,0.000044,0.080590,0.537616,3.629201e-01,1.882926e-02,0.000000,0.000011,0.067449,0.517339,3.911318e-01,0.024069
4,19.30,1192.70,0.038292,0.447781,0.468667,0.045260,7.943313e-09,0.000000e+00,0.011773,0.313568,0.566849,0.107586,2.244701e-04,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7014,69.60,69.60,0.000000,0.038136,0.447273,0.469154,4.543660e-02,9.567869e-09,0.039747,0.452438,0.464160,0.043655,7.872068e-10,0.000000
7015,104.15,7365.30,0.000000,0.000297,0.114096,0.572020,3.030312e-01,1.055559e-02,0.000000,0.000358,0.118878,0.575441,2.955746e-01,0.009748
7016,110.80,7245.90,0.000000,0.000026,0.074848,0.529357,3.748469e-01,2.092213e-02,0.000000,0.000507,0.128659,0.581534,2.810079e-01,0.008293
7017,80.15,385.00,0.000000,0.014508,0.334725,0.555308,9.533974e-02,1.203783e-04,0.029268,0.414584,0.498516,0.057630,2.125726e-06,0.000000


In [11]:
# QuantileTransformer: map each numeric column onto its empirical quantiles.
encoder_q = QuantileTransformer(n_quantiles=n_quantiles)
encoded_features = encoder_q.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder_q.get_feature_names_out(num_columns),
)
# Suffix the column names so they do not collide with the raw columns.
encoded_df = encoded_df.set_axis(
    [f"{col}_q_{n_quantiles}" for col in num_columns], axis="columns"
)
num_df = pd.concat([num_df, encoded_df], axis="columns")


In [12]:

# RobustScaler: scale the numeric columns with RobustScaler's default settings.
encoder_rb = RobustScaler()
encoded_features = encoder_rb.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder_rb.get_feature_names_out(num_columns),
)
# Suffix the column names so they do not collide with the raw columns.
encoded_df = encoded_df.set_axis(
    [f"{col}_robust" for col in num_columns], axis="columns"
)
num_df = pd.concat([num_df, encoded_df], axis="columns")

In [13]:

# PolynomialFeatures: powers and cross-terms of the numeric columns.
encoder_pol = PolynomialFeatures(degree=degree)
encoded_features = encoder_pol.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder_pol.get_feature_names_out(num_columns),
)
# Skip the bias column and the degree-1 terms (the raw columns already in
# num_df); keep only the higher-order terms.
first_new_term = 1 + len(num_columns)
encoded_df = encoded_df.iloc[:, first_new_term:]
num_df = pd.concat([num_df, encoded_df], axis="columns")

In [14]:

# KBinsDiscretizer: bucket each numeric column into n_bins bins.
encoder_kbd = KBinsDiscretizer(
    n_bins=n_bins,
    encode=encode,
    strategy=strategy,
    subsample=subsample,
)
encoded_features = encoder_kbd.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder_kbd.get_feature_names_out(num_columns),
)
# Suffix the column names so they do not collide with the raw columns.
encoded_df = encoded_df.set_axis(
    [f"{col}_bin" for col in num_columns], axis="columns"
)
num_df = pd.concat([num_df, encoded_df], axis="columns")

In [15]:
# Display the numeric frame with all derived features appended.
num_df

Unnamed: 0,monthly_charges,total_charges,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,...,total_charges_robust,monthly_charges^2,monthly_charges total_charges,total_charges^2,monthly_charges^3,monthly_charges^2 total_charges,monthly_charges total_charges^2,total_charges^3,monthly_charges_bin,total_charges_bin
0,20.55,654.55,0.034546,0.434971,0.480648,0.049834,1.828756e-07,0.000000e+00,0.022088,0.381092,...,-0.222809,422.3025,13451.0025,4.284357e+05,8.678316e+03,2.764181e+05,8.804354e+06,2.804326e+08,0.0,0.0
1,24.70,780.20,0.024051,0.391047,0.517404,0.067486,1.131054e-05,0.000000e+00,0.019235,0.365307,...,-0.185762,610.0900,19270.9400,6.087120e+05,1.506922e+04,4.759922e+05,1.503519e+07,4.749171e+08,0.0,0.0
2,19.75,1375.40,0.036910,0.443198,0.473023,0.046869,3.308335e-08,0.000000e+00,0.009277,0.291028,...,-0.010275,390.0625,27164.1500,1.891725e+06,7.703734e+03,5.364920e+05,3.736157e+07,2.601879e+09,0.0,0.0
3,109.70,8129.30,0.000000,0.000044,0.080590,0.537616,3.629201e-01,1.882926e-02,0.000000,0.000011,...,1.981027,12034.0900,891784.2100,6.608552e+07,1.320140e+06,9.782873e+07,7.249581e+09,5.372290e+11,4.0,4.0
4,19.30,1192.70,0.038292,0.447781,0.468667,0.045260,7.943313e-09,0.000000e+00,0.011773,0.313568,...,-0.064142,372.4900,23019.1100,1.422533e+06,7.189057e+03,4.442688e+05,2.745489e+07,1.696655e+09,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7014,69.60,69.60,0.000000,0.038136,0.447273,0.469154,4.543660e-02,9.567869e-09,0.039747,0.452438,...,-0.395274,4844.1600,4844.1600,4.844160e+03,3.371535e+05,3.371535e+05,3.371535e+05,3.371535e+05,2.0,0.0
7015,104.15,7365.30,0.000000,0.000297,0.114096,0.572020,3.030312e-01,1.055559e-02,0.000000,0.000358,...,1.755771,10847.2225,767095.9950,5.424764e+07,1.129738e+06,7.989305e+07,5.649892e+09,3.995502e+11,4.0,4.0
7016,110.80,7245.90,0.000000,0.000026,0.074848,0.529357,3.748469e-01,2.092213e-02,0.000000,0.000507,...,1.720568,12276.6400,802845.7200,5.250307e+07,1.360252e+06,8.895531e+07,5.817340e+09,3.804320e+11,4.0,4.0
7017,80.15,385.00,0.000000,0.014508,0.334725,0.555308,9.533974e-02,1.203783e-04,0.029268,0.414584,...,-0.302282,6424.0225,30857.7500,1.482250e+05,5.148854e+05,2.473249e+06,1.188023e+07,5.706662e+07,3.0,0.0


In [24]:
# Apply every numeric encoder to the same numeric columns; ColumnTransformer
# stacks their outputs side by side.
numeric_transformer = ColumnTransformer(
    transformers=[
        (step_name, encoder, num_columns)
        for step_name, encoder in (
            ('spl', encoder_spl),
            ('q', encoder_q),
            ('rb', encoder_rb),
            ('pol', encoder_pol),
            ('kbd', encoder_kbd),
        )
    ]
)

# Single-step pipeline that wraps the one-hot encoder.
categorical_transformer = Pipeline(steps=[('encoder', encoder_oh)])



In [22]:
# Full preprocessing step: numeric encoders on num_columns, one-hot encoding on
# cat_columns, every other column dropped.
# The cell used to define `preprocessor` twice, and the second, shorter
# definition silently replaced the first. Only the effective configuration is
# kept here. NOTE(review): the discarded definition set
# verbose_feature_names_out=False; add it back if unprefixed feature names
# were the intent.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_columns),
        ('cat', categorical_transformer, cat_columns),
    ],
    n_jobs=-1,
    remainder='drop',
)

encoded_features = preprocessor.fit_transform(df)

# Reuse df's index so the column-wise concat below lines rows up one-to-one.
transformed_df = pd.DataFrame(
    encoded_features,
    columns=preprocessor.get_feature_names_out(),
    index=df.index,
)

# Drop any transformed columns left over from a previous run of this cell
# before appending, so re-running does not duplicate columns in df.
df = pd.concat(
    [df.drop(columns=transformed_df.columns, errors="ignore"), transformed_df],
    axis=1,
)

In [34]:
# End-to-end model: preprocessing followed by a random forest with default
# hyperparameters.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier()),
    ]
)

In [37]:
# Feed the pipeline only the raw columns it actually consumes. Passing the
# whole frame also put `target` and every derived column into X; the
# preprocessor drops them, but they still ended up in the MLflow signature
# and input example built from X_test.
feature_columns = num_columns + cat_columns
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df["target"],
    test_size=0.3,
    random_state=3,
)
model.fit(X_train, y_train)

In [40]:
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Hard class labels, used by the threshold-based metrics.
prediction = model.predict(X_test)
# Predicted probability of the positive class (target == 1). classes_ is
# sorted, so with 0/1 labels the positive class is column 1.
probas = model.predict_proba(X_test)[:, 1]

# With normalize='all' the flattened matrix is (tn, fp, fn, tp) as shares of
# all test samples:
# err1 — type I error (false positive share)
# err2 — type II error (false negative share)
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
# AUC and log loss are defined on probabilities, not on hard labels; the
# previous version scored log loss on 0/1 predictions and skipped AUC.
auc = roc_auc_score(y_test, probas)
logloss = log_loss(y_test, probas)

# Collect all metrics in one dictionary for logging.
metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}


In [41]:

# Artefacts attached to the logged model: requirements file, input/output
# signature inferred from the test data, a small input sample, and metadata.
pip_requirements = "requirements.txt"
signature = mlflow.models.infer_signature(model_input=X_test, model_output=prediction)
input_example = X_test.head(10)
metadata = dict(model_type='churn_month', preprocessing_version='0.0.1')

  inputs = _infer_schema(model_input) if model_input is not None else None


In [42]:

# get_experiment_by_name returns None for an unknown experiment, so check for
# None before reading .experiment_id; otherwise the "create if missing"
# branch can never run.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Unique identifier of this run, used below to look the run up again.
    run_id = run.info.run_id

    # Log the fitted pipeline and register it as a new model version.
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example,
        metadata=metadata,
        await_registration_for=60,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
    )

    mlflow.log_metrics(metrics)

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
# Fetch the run again by its ID to inspect its final state.
run = mlflow.get_run(run_id)

# Leaving the `with` block should have ended the run; this assertion is an
# automatic check that it finished successfully.
assert run.info.status == 'FINISHED'


Registered model 'real_churn_model_Andrey' already exists. Creating a new version of this model...
2024/04/07 13:24:34 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: real_churn_model_Andrey, version 3
Created version '3' of model 'real_churn_model_Andrey'.


In [43]:
# Display the ID of the MLflow run that was just logged.
run_id

'21628b848ad643b4a1b93bbc7613bb22'

In [None]:
# get_experiment_by_name returns None for an unknown experiment, so check for
# None before reading .experiment_id, and create the experiment if needed.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    # NOTE(review): this logs the bare preprocessor under the same registered
    # model name as the classifier, with a signature inferred from the
    # classifier's predictions. Confirm this is intended; a separate registry
    # name and a signature built from the transformer output look more
    # appropriate.
    model_info = mlflow.sklearn.log_model(
        sk_model=preprocessor,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example,
        metadata=metadata,
        await_registration_for=60,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
    )