In [None]:
import os
import sys
from dotenv import load_dotenv

# Make the parent directory importable. The path is resolved to an absolute
# one immediately: a relative sys.path entry is interpreted against whatever
# the working directory happens to be when an import is attempted, and this
# notebook changes its working directory (`%cd ..`) right after this cell.
sys.path.append(os.path.abspath('../'))
# Load variables from the .env file one level above the current working
# directory; MLFLOW_TRACKING_URI is read from the environment later on.
load_dotenv("../.env")

In [None]:
# Move the working directory one level up so that the relative paths used
# below ("data/...", "log/...") resolve from there.
# NOTE: this is not idempotent - re-running the cell moves up one more level.
%cd ..

## Read data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Location of the preprocessed training table, relative to the working directory.
train_dataset = "data/train_preprocess.csv"
# Read the whole CSV into a DataFrame using pandas' default parsing options.
train_df = pd.read_csv(filepath_or_buffer=train_dataset)

In [None]:
# Preview the first rows of the loaded table.
train_df.head()

In [None]:
# Print column names, dtypes, non-null counts and memory usage.
train_df.info()

In [None]:
# Target vector: the "label" column.
y = train_df["label"]
# Feature matrix: every remaining column (train_df itself is not modified).
X = train_df.drop(columns="label")

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

## Train model

In [None]:
import os
import json
import mlflow
import datetime
from catboost import CatBoostClassifier

In [None]:
# Tracking server address comes from the environment (None when the variable
# is not set).
remote_server_uri = os.environ.get("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(remote_server_uri)
# All runs of this notebook are grouped under one experiment.
mlflow.set_experiment("pochta-task")

In [None]:
# Columns handed to CatBoost as categorical features (order preserved).
cat_features = [
    "type", "priority", "class", "is_return",
    "mailtype", "mailctg", "directctg", "postmark",
    "is_wrong_sndr_name", "is_wrong_rcpn_name",
    "is_wrong_phone_number", "is_wrong_address",
    "oper_type", "oper_attr",
]

In [None]:
# --- Training hyperparameters ---
ITERATIONS = 20
RANDOM_SEED = 42
LEARNING_RATE = 0.5

# --- Run identity ---
# NOTE(review): 'catbost' looks like a typo for 'catboost'; it is left as is
# because the string determines MLflow run names and on-disk log paths.
MODEL_NAME = 'catbost'
# Wall-clock time of this cell's execution, formatted month-day-year-hour-minute-second.
TIMESTAMP = f"{datetime.datetime.now():%m-%d-%Y-%H-%M-%S}"

# --- Output locations, all inside a per-run directory under log/ ---
LOG_DIR = f'log/{MODEL_NAME}-{TIMESTAMP}'
PLOT_FILE = LOG_DIR + '/plot'
SNAPSHOT_FILE = LOG_DIR + '/snapshot'
MODELS_PATH = LOG_DIR + '/models'

In [None]:
# Create the per-run log directory and its models subdirectory.
# exist_ok=True keeps the cell safe to re-run and avoids the race between a
# separate existence check and the creation call.
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(MODELS_PATH, exist_ok=True)

In [None]:
# Open an MLflow run. It is not used as a context manager, so it stays active
# across the following cells until mlflow.end_run() is called at the end of
# the notebook.
mlflow.start_run()
# Name the run "<MODEL_NAME>-<TIMESTAMP>" via the mlflow.runName tag.
mlflow.set_tag("mlflow.runName", f"{MODEL_NAME}-{TIMESTAMP}")

In [None]:
# Hyperparameters recorded with the MLflow run. The keys mirror the
# CatBoostClassifier arguments these constants are passed to, so the logged
# values can be matched to the model configuration. (The learning rate and
# the seed were previously logged under unrelated "num_epochs" and
# "batch_size" names.)
params = {
    "iterations": ITERATIONS,
    "learning_rate": LEARNING_RATE,
    "random_seed": RANDOM_SEED,
}
mlflow.log_params(params)

In [None]:
# Build the classifier from the run constants. Training output goes to
# LOG_DIR; a later cell reads catboost_training.json from that directory.
clf = CatBoostClassifier(
    train_dir=LOG_DIR,
    snapshot_file=SNAPSHOT_FILE,
    custom_loss=['AUC', 'Recall', 'Precision', 'Accuracy'],
    learning_rate=LEARNING_RATE,
    iterations=ITERATIONS,
    random_seed=RANDOM_SEED,
)

# Fit on the training split and evaluate on the held-out validation split,
# printing progress while training.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
    verbose=True,
)

In [None]:
# Metric history returned by the fitted model.
logs = clf.get_evals_result()
# Best metric values returned by the fitted model.
best_score = clf.get_best_score()

In [None]:
# Load the training log stored as JSON in the run's log directory.
with open(f'{LOG_DIR}/catboost_training.json') as f:
    catboost_training = json.loads(f.read())

In [None]:
# Metric names from the log's "meta" section, in file order. They are later
# paired positionally with each iteration's "learn" / "test" value lists.
train_metric = [entry["name"] for entry in catboost_training["meta"]["learn_metrics"]]
test_metric = [entry["name"] for entry in catboost_training["meta"]["test_metrics"]]

In [None]:
# Replay the per-iteration metrics from the training log into MLflow.
# Each iteration is logged with an explicit `step`, so MLflow stores one point
# per iteration and can plot the curves against the iteration number. Without
# it every call is recorded at the same default step.
for i, iteration in enumerate(catboost_training["iterations"]):
    epoch = i + 1  # 1-based, matches the "Epoch" metric below
    log = {"Epoch": epoch}
    # Metric names and values are paired by position.
    log.update(
        (f"Learn_{name}", value)
        for name, value in zip(train_metric, iteration["learn"])
    )
    log.update(
        (f"Test_{name}", value)
        for name, value in zip(test_metric, iteration["test"])
    )
    mlflow.log_metrics(log, step=epoch)

# Attach the run's log directory to the MLflow run as an artifact.
mlflow.log_artifact(LOG_DIR)

In [None]:
# Save the fitted model to MODELS_PATH through MLflow's CatBoost flavor.
# NOTE(review): MODELS_PATH was already created as a directory earlier in the
# notebook; some MLflow versions may refuse to save into an existing path -
# confirm against the installed version.
mlflow.catboost.save_model(clf, MODELS_PATH)
# Log the same model to the active MLflow run. MODELS_PATH is passed as the
# second positional argument - presumably the artifact path inside the run;
# verify that a local directory path is the intended value here.
mlflow.catboost.log_model(clf, MODELS_PATH)

In [None]:
# Close the MLflow run opened earlier with mlflow.start_run(). If an earlier
# cell fails, this cell still has to be run before a new run can be started.
mlflow.end_run()