In [3]:
import os
import mlflow
import pandas as pd
import psycopg
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix, log_loss
from catboost import CatBoostRegressor, CatBoostClassifier

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [4]:
os.environ["DB_DESTINATION_HOST"] = os.getenv("DB_DESTINATION_HOST")
os.environ["DB_DESTINATION_PORT"] = os.getenv("DB_DESTINATION_PORT")
os.environ["DB_DESTINATION_NAME"] = os.getenv("DB_DESTINATION_NAME")
os.environ["DB_DESTINATION_USER"] = os.getenv("DB_DESTINATION_USER")
os.environ["DB_DESTINATION_PASSWORD"] = os.getenv("DB_DESTINATION_PASSWORD")

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" #endpoint бакета от YandexCloud
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") # получаем id ключа бакета, к которому подключён MLFlow, из .env
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") # получаем ключ бакета, к которому подключён MLFlow, из .env

# определяем глобальные переменные
# поднимаем MLflow локально
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000


registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)

# название тестового эксперимента и запуска (run) внутри него
EXPERIMENT_NAME = "real_churn_Andrey"
RUN_NAME = "real_churn_run"
REGISTRY_MODEL_NAME = "real_churn_model_Andrey"


In [5]:
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.environ["DB_DESTINATION_HOST"], 
    "port": os.environ["DB_DESTINATION_PORT"],
    "dbname": os.environ["DB_DESTINATION_NAME"],
    "user": os.environ["DB_DESTINATION_USER"],
    "password": os.environ["DB_DESTINATION_PASSWORD"],
}
assert all([var_value != "" for var_value in list(postgres_credentials.values())])

connection.update(postgres_credentials)

# определим название таблицы, в которой хранятся наши данные.
TABLE_NAME = "clean_users_churn"

# эта конструкция создаёт контекстное управление для соединения с базой данных 
# оператор with гарантирует, что соединение будет корректно закрыто после выполнения всех операций 
# закрыто оно будет даже в случае ошибки, чтобы не допустить "утечку памяти"
with psycopg.connect(**connection) as conn:

# создаёт объект курсора для выполнения запросов к базе данных
# с помощью метода execute() выполняется SQL-запрос для выборки данных из таблицы TABLE_NAME
    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
                
                # извлекаем все строки, полученные в результате выполнения запроса
        data = cur.fetchall()

                # получает список имён столбцов из объекта курсора
        columns = [col[0] for col in cur.description]

# создаёт объект DataFrame из полученных данных и имён столбцов. 
# это позволяет удобно работать с данными в Python, используя библиотеку Pandas.
df = pd.DataFrame(data, columns=columns)

In [6]:
df

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,14,5918-VUKWP,2017-06-01,NaT,One year,No,Bank transfer (automatic),20.55,654.55,Fiber optic,...,No,No,No,No,Female,0,No,No,No,0
1,15,1744-JHKYS,2017-04-01,NaT,Month-to-month,No,Electronic check,24.70,780.20,Fiber optic,...,No,No,No,No,Female,0,Yes,No,Yes,0
2,16,2984-RGEYA,2014-05-01,NaT,Two year,No,Bank transfer (automatic),19.75,1375.40,Fiber optic,...,No,No,No,No,Female,0,Yes,Yes,No,0
3,17,9680-NIAUV,2014-02-01,NaT,Two year,No,Credit card (automatic),109.70,8129.30,Fiber optic,...,Yes,No,Yes,Yes,Female,0,Yes,Yes,Yes,0
4,18,2146-EGVDT,2015-03-01,NaT,Two year,No,Credit card (automatic),19.30,1192.70,Fiber optic,...,No,No,No,No,Male,0,Yes,Yes,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7014,7015,6502-MJQAE,2019-11-01,2019-12-01,Month-to-month,Yes,Electronic check,69.60,69.60,Fiber optic,...,No,No,No,No,Male,0,No,No,No,1
7015,7016,6257-DTAYD,2014-03-01,NaT,Two year,Yes,Credit card (automatic),104.15,7365.30,Fiber optic,...,No,Yes,Yes,Yes,Male,0,Yes,No,Yes,0
7016,7017,4616-ULAOA,2014-09-01,NaT,Two year,Yes,Credit card (automatic),110.80,7245.90,Fiber optic,...,Yes,Yes,Yes,Yes,Female,0,Yes,Yes,Yes,0
7017,7018,7693-LCKZL,2019-05-01,2019-10-01,Month-to-month,Yes,Electronic check,80.15,385.00,Fiber optic,...,No,No,No,No,Male,0,Yes,Yes,Yes,1


In [7]:
#import joblib
#with open('model_churn/fitted_model.pkl', 'rb') as f:
#        model=joblib.load(f)

In [8]:
with open("columns.txt", "w", encoding="utf-8") as fio:
    fio.write(','.join(df.columns.values.tolist()))
   

In [9]:
cat_features = df.select_dtypes(include='object')
potential_binary_features = cat_features.nunique() == 2
binary_cat_features = cat_features[potential_binary_features[potential_binary_features].index]
other_cat_features = cat_features[potential_binary_features[~potential_binary_features].index]
num_features = df.select_dtypes(['float'])

preprocessor = ColumnTransformer(
        [
        ('binary', OneHotEncoder(), binary_cat_features.columns.tolist()),
        ('cat', CatBoostEncoder(return_df=False), other_cat_features.columns.tolist()),
        ('num', StandardScaler(), num_features.columns.tolist())
        ],
        remainder='drop',
        verbose_feature_names_out=False
    )


df_trans = preprocessor.fit_transform(df, df['target']) 

In [10]:
#df = df.drop(columns=["is_apartment", "has_elevator"])
X_train, X_test, y_train, y_test = train_test_split(df_trans, df["target"], test_size=0.3, random_state=3)

In [11]:
model = CatBoostClassifier()
model.fit(X_train, y_train)
prediction = model.predict(X_test)

Learning rate set to 0.02033
0:	learn: 0.6791140	total: 48.6ms	remaining: 48.5s
1:	learn: 0.6660682	total: 49.8ms	remaining: 24.9s
2:	learn: 0.6544251	total: 51.1ms	remaining: 17s
3:	learn: 0.6433733	total: 52.3ms	remaining: 13s
4:	learn: 0.6330659	total: 53.5ms	remaining: 10.7s
5:	learn: 0.6235688	total: 54.7ms	remaining: 9.07s
6:	learn: 0.6144955	total: 56ms	remaining: 7.95s
7:	learn: 0.6043269	total: 57.3ms	remaining: 7.11s
8:	learn: 0.5963296	total: 58.6ms	remaining: 6.45s
9:	learn: 0.5882368	total: 59.7ms	remaining: 5.91s
10:	learn: 0.5814931	total: 60.9ms	remaining: 5.48s
11:	learn: 0.5749679	total: 62.1ms	remaining: 5.12s
12:	learn: 0.5677809	total: 63.4ms	remaining: 4.81s
13:	learn: 0.5608318	total: 64.6ms	remaining: 4.55s
14:	learn: 0.5548484	total: 65.8ms	remaining: 4.32s
15:	learn: 0.5480879	total: 67ms	remaining: 4.12s
16:	learn: 0.5417553	total: 68.3ms	remaining: 3.95s
17:	learn: 0.5357738	total: 69.5ms	remaining: 3.79s
18:	learn: 0.5300746	total: 70.8ms	remaining: 3.65s
1

In [12]:

from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix, log_loss
# импортируйте необходимые вам модули

# заведите словарь со всеми метриками
metrics = {}

# посчитайте метрики из модуля sklearn.metrics
# err_1 — ошибка первого рода
# err_2 — ошибка второго рода
_, err1, err2, _ = confusion_matrix(y_test, prediction,normalize='all').ravel()
#auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
#metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss


In [13]:
"""
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
metrics={}

metrics["mae"] = mean_absolute_error(y_test, prediction)
metrics["mape"] = mean_absolute_percentage_error(y_test, prediction)
"""

'\nfrom sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error\nmetrics={}\n\nmetrics["mae"] = mean_absolute_error(y_test, prediction)\nmetrics["mape"] = mean_absolute_percentage_error(y_test, prediction)\n'

In [14]:

pip_requirements = "requirements.txt"
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]
metadata =  {'model_type': 'churn_month'}

In [15]:



experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

if not experiment_id:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME) # ваш код здесь

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id # ваш код здесь
    
    model_info = mlflow.catboost.log_model( 
			cb_model=model,
            pip_requirements=pip_requirements,
            signature=signature,
            input_example=input_example,
            metadata=metadata,
            #code_path=code_paths,
            await_registration_for=60,
            artifact_path="models",
            registered_model_name=REGISTRY_MODEL_NAME)

    mlflow.log_metrics(metrics) 
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
# получаем данные о запуске эксперимента по его уникальному идентификатору
run = mlflow.get_run(run_id) # ваш код здесь


# проверяем, что статус запуска эксперимента изменён на 'FINISHED'
# это утверждение (assert) можно использовать для автоматической проверки того, 
# что эксперимент был завершён успешно
assert (run.info.status =='FINISHED')# ваш код здесь

# удаляем файлы 'columns.txt' и 'users_churn.csv' из файловой системы,
# чтобы очистить рабочую среду после логирования артефактов
os.remove('columns.txt') # ваш код здесь
#os.remove('users_churn.csv') # ваш код здесьcsv

Registered model 'real_churn_model_Andrey' already exists. Creating a new version of this model...
2024/03/29 16:26:21 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: real_churn_model_Andrey, version 2
Created version '2' of model 'real_churn_model_Andrey'.


In [16]:
loaded_model = mlflow.catboost.load_model(model_uri=model_info.model_uri)
model_predictions = loaded_model.predict(X_test)

#assert model_predictions.dtype == int

print(model_predictions[:10])

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

[0 0 1 0 0 1 0 0 0 1]


In [17]:
client = mlflow.MlflowClient(tracking_uri=tracking_uri, registry_uri=registry_uri)


models = client.search_model_versions(
    filter_string=f"name = '{REGISTRY_MODEL_NAME}'"
)
print(f"Model info:\n {models}")

model_name_1 = models[-1].name
model_version_1 = models[-1].version
model_stage_1 = models[-1].current_stage

model_name_2 = models[-2].name
model_version_2 = models[-2].version
model_stage_2 = models[-2].current_stage


print(f"Текущий stage модели 1: {model_stage_1}")
print(f"Текущий stage модели 2: {model_stage_2}")

# поменяйте статус каждой модели
client.transition_model_version_stage(
    model_name_1, model_version_1, 'production')
client.transition_model_version_stage(
    model_name_2, model_version_2, 'staging')

# переимнуйте модель в реестре
client.rename_registered_model(
    name = REGISTRY_MODEL_NAME, new_name = f'{REGISTRY_MODEL_NAME}_b2c')

Model info:
 [<ModelVersion: aliases=[], creation_timestamp=1711718780497, current_stage='None', description='', last_updated_timestamp=1711718780497, name='real_churn_model_Andrey', run_id='4b949d984942429c955db21f07f65485', run_link='', source='s3://s3-student-mle-20240228-2fd44f5a96/4/4b949d984942429c955db21f07f65485/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>, <ModelVersion: aliases=[], creation_timestamp=1711718085444, current_stage='None', description='', last_updated_timestamp=1711718085444, name='real_churn_model_Andrey', run_id='b97b26920b84410cb7bda513bd88da4f', run_link='', source='s3://s3-student-mle-20240228-2fd44f5a96/4/b97b26920b84410cb7bda513bd88da4f/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>]
Текущий stage модели 1: None
Текущий stage модели 2: None


RestException: RESOURCE_ALREADY_EXISTS: Registered Model (name=real_churn_model_Andrey_b2c) already exists. Error: (raised as a result of Query-invoked autoflush; consider using a session.no_autoflush block if this flush is occurring prematurely)
(psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "registered_model_pk"
DETAIL:  Key (name)=(real_churn_model_Andrey_b2c) already exists.

[SQL: UPDATE registered_models SET name=%(name)s WHERE registered_models.name = %(registered_models_name)s]
[parameters: {'name': 'real_churn_model_Andrey_b2c', 'registered_models_name': 'real_churn_model_Andrey'}]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [18]:
print(f"Текущий stage модели 1: {model_stage_1}")
print(f"Текущий stage модели 2: {model_stage_2}")

Текущий stage модели 1: None
Текущий stage модели 2: None


In [20]:
EXPERIMENT_NAME = "real_churn_Andrey"
experiment_id = client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

In [34]:
experiment_runs = mlflow.search_runs(
    experiment_ids=[experiment_id],
).sort_values(by="start_time", ascending=False) 
experiment_runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.precision,metrics.f1,metrics.logloss,metrics.recall,metrics.err1,metrics.err2,tags.mlflow.source.name,tags.mlflow.log-model.history,tags.mlflow.user,tags.mlflow.source.git.commit,tags.mlflow.runName,tags.mlflow.source.type
0,4b949d984942429c955db21f07f65485,4,FINISHED,s3://s3-student-mle-20240228-2fd44f5a96/4/4b94...,2024-03-29 13:26:12.248000+00:00,2024-03-29 13:26:21.986000+00:00,0.621212,0.567194,7.496258,0.521818,0.083096,0.124881,/home/andrey/work/MLE/mle-mlflow/.venv_mlflow/...,"[{""run_id"": ""4b949d984942429c955db21f07f65485""...",andrey,f9fee667dcf070cccb9686a1e443a7d487279b2e,real_churn_run,LOCAL
1,b97b26920b84410cb7bda513bd88da4f,4,FINISHED,s3://s3-student-mle-20240228-2fd44f5a96/4/b97b...,2024-03-29 13:14:38.166000+00:00,2024-03-29 13:14:47.382000+00:00,0.621212,0.567194,7.496258,0.521818,0.083096,0.124881,/home/andrey/work/MLE/mle-mlflow/.venv_mlflow/...,"[{""run_id"": ""b97b26920b84410cb7bda513bd88da4f""...",andrey,97606f259f3836476c28ba5dbee9187602029cb0,real_churn_run,LOCAL
2,4616792e05cf4bea9eb54b80389a247d,4,FINISHED,s3://s3-student-mle-20240228-2fd44f5a96/4/4616...,2024-03-29 12:58:55.597000+00:00,2024-03-29 12:59:13.866000+00:00,0.621212,0.567194,7.496258,0.521818,0.083096,0.124881,/home/andrey/work/MLE/mle-mlflow/.venv_mlflow/...,"[{""run_id"": ""4616792e05cf4bea9eb54b80389a247d""...",andrey,97606f259f3836476c28ba5dbee9187602029cb0,real_churn_run,LOCAL
3,f1108dab83ba4257b792daa42ea0aafc,4,FINISHED,s3://s3-student-mle-20240228-2fd44f5a96/4/f110...,2024-03-29 12:46:36.731000+00:00,2024-03-29 12:47:26.831000+00:00,0.65,0.570533,7.03416,0.50838,0.069801,0.12963,/home/andrey/work/MLE/mle-mlflow/.venv_mlflow/...,"[{""run_id"": ""f1108dab83ba4257b792daa42ea0aafc""...",andrey,97606f259f3836476c28ba5dbee9187602029cb0,real_churn_run,LOCAL
4,8d3bd5cdc8ba4dd3b2df51185993ef7c,4,FINISHED,s3://s3-student-mle-20240228-2fd44f5a96/4/8d3b...,2024-03-29 12:10:05.604000+00:00,2024-03-29 12:10:20.887000+00:00,0.255532,0.407049,26.776019,1.0,0.742877,0.254986,/home/andrey/work/MLE/mle-mlflow/.venv_mlflow/...,"[{""run_id"": ""8d3bd5cdc8ba4dd3b2df51185993ef7c""...",andrey,97606f259f3836476c28ba5dbee9187602029cb0,real_churn_run,LOCAL
5,2e4250fb78604ec19abd93c40b581f51,4,FAILED,s3://s3-student-mle-20240228-2fd44f5a96/4/2e42...,2024-03-29 12:05:24.460000+00:00,2024-03-29 12:05:26.314000+00:00,,,,,,,/home/andrey/work/MLE/mle-mlflow/.venv_mlflow/...,,andrey,97606f259f3836476c28ba5dbee9187602029cb0,real_churn_run,LOCAL
6,355e39b25e3b4798a33dd5a86bbb71ca,4,FAILED,s3://s3-student-mle-20240228-2fd44f5a96/4/355e...,2024-03-29 12:04:46.089000+00:00,2024-03-29 12:04:47.582000+00:00,,,,,,,/home/andrey/work/MLE/mle-mlflow/.venv_mlflow/...,,andrey,97606f259f3836476c28ba5dbee9187602029cb0,real_churn_run,LOCAL
7,1648c4c2f9e042989ec051cead75b4ab,4,FAILED,s3://s3-student-mle-20240228-2fd44f5a96/4/1648...,2024-03-29 12:04:32.556000+00:00,2024-03-29 12:04:33.914000+00:00,,,,,,,/home/andrey/work/MLE/mle-mlflow/.venv_mlflow/...,,andrey,97606f259f3836476c28ba5dbee9187602029cb0,real_churn_run,LOCAL
8,4670a483ca2544de96ea188f2d10461a,4,FAILED,s3://s3-student-mle-20240228-2fd44f5a96/4/4670...,2024-03-29 11:57:26.663000+00:00,2024-03-29 11:57:27.378000+00:00,,,,,,,/home/andrey/work/MLE/mle-mlflow/.venv_mlflow/...,,andrey,97606f259f3836476c28ba5dbee9187602029cb0,real_churn_run,LOCAL


In [53]:
print( '"'+ '","'.join([col for col in experiment_runs.columns if 'metric' in col])+'"')
runs = experiment_runs[[
	"run_id", "start_time", "metrics.precision","metrics.f1","metrics.logloss","metrics.recall","metrics.err1","metrics.err2"
	
]].dropna()
runs

"metrics.precision","metrics.f1","metrics.logloss","metrics.recall","metrics.err1","metrics.err2"


Unnamed: 0,run_id,start_time,metrics.precision,metrics.f1,metrics.logloss,metrics.recall,metrics.err1,metrics.err2
0,4b949d984942429c955db21f07f65485,2024-03-29 13:26:12.248000+00:00,0.621212,0.567194,7.496258,0.521818,0.083096,0.124881
1,b97b26920b84410cb7bda513bd88da4f,2024-03-29 13:14:38.166000+00:00,0.621212,0.567194,7.496258,0.521818,0.083096,0.124881
2,4616792e05cf4bea9eb54b80389a247d,2024-03-29 12:58:55.597000+00:00,0.621212,0.567194,7.496258,0.521818,0.083096,0.124881
3,f1108dab83ba4257b792daa42ea0aafc,2024-03-29 12:46:36.731000+00:00,0.65,0.570533,7.03416,0.50838,0.069801,0.12963
4,8d3bd5cdc8ba4dd3b2df51185993ef7c,2024-03-29 12:10:05.604000+00:00,0.255532,0.407049,26.776019,1.0,0.742877,0.254986


'"metrics.precision","metrics.f1","metrics.logloss","metrics.recall","metrics.err1","metrics.err2"'