# –ò–º–ø–æ—Ä—Ç –±–∏–±–ª–∏–æ—Ç–µ–∫

In [1]:
import mlflow
from mlflow.tracking import MlflowClient

import os

import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from transformers import pipeline
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

from datasets import Dataset

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import *

import joblib
import cloudpickle
import pickle

import nltk
from nltk.tokenize import word_tokenize

nltk.download('stopwords')

from nltk.corpus import stopwords
from functools import lru_cache

from pymorphy3 import MorphAnalyzer
import re
import emoji
import torch

  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Smart\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–π

In [None]:
from omegaconf import OmegaConf
import os

def load_config(config_name):
    """–ó–∞–≥—Ä—É–∑–∫–∞ –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–∏ –¥–ª—è –∫–æ–Ω–∫—Ä–µ—Ç–Ω–æ–≥–æ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞"""
    config_path = f"configs/{config_name}.yml"
    
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"–ö–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–æ–Ω–Ω—ã–π —Ñ–∞–π–ª {config_path} –Ω–µ –Ω–∞–π–¥–µ–Ω")
    
    cfg = OmegaConf.load(config_path)
    
    if 'MLFLOW_TRACKING_URI' in os.environ:
        cfg.mlflow.tracking_uri = os.environ['MLFLOW_TRACKING_URI']
    
    return cfg

# –≠–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç—ã —Å –¥–∞—Ç–∞—Å–µ—Ç–∞–º–∏

## –ü–µ—Ä–≤—ã–π —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç (–õ–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è, —É–¥–∞–ª–µ–Ω–∏–µ –∑–Ω–∞–∫–æ–≤ –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏)

In [None]:
#!   mlflow server --host 127.0.0.1 --port 8080

In [None]:
cfg = load_config("base")

mlflow.set_tracking_uri(cfg.mlflow.tracking_uri)

print(f"Tracking URI: {cfg.mlflow.tracking_uri}")

Tracking URI: http://127.0.0.1:8080


In [None]:
cfg = load_config("preprocess_first")

analyzer = MorphAnalyzer(lang='ru')

stop_words = nltk.corpus.stopwords.words('russian')
stop_words_cleaned = [
    w for w in stop_words
    if w not in cfg.preprocess.keep_words
]

@lru_cache(maxsize=cfg.preprocess.lru_cache_size)
def lemmatization(text):
    return analyzer.parse(text)[0].normal_form

def preprocess_text(text):
    text = re.sub(cfg.preprocess.regex.remove_newlines, " ", text)

    text = re.sub(cfg.preprocess.regex.fix_mistyped_n, r"\1", text)

    text = re.sub(cfg.preprocess.regex.remove_symbols, "", text)

    text = re.sub(cfg.preprocess.regex.collapse_spaces, " ", text).strip()

    tokens = nltk.word_tokenize(text)
    result = []

    for token in tokens:
        if token not in stop_words_cleaned:
            result.append(lemmatization(token))

    return " ".join(result)

test_text = "–í—Å–µ–º –ø—Ä–∏–≤–µ—Ç! –ö–∞–∫–æ–µ –∂–µ –Ω–µ–ø—Ä–∏—è—Ç–Ω–æ–µ –º–µ—Å—Ç–æ, –Ω–µ—Ç?\n"
processed = preprocess_text(test_text)
print(f"–ò—Å—Ö–æ–¥–Ω—ã–π: {test_text}")
print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π: {processed}")

–ò—Å—Ö–æ–¥–Ω—ã–π: –í—Å–µ–º –ø—Ä–∏–≤–µ—Ç! –ö–∞–∫–æ–µ –∂–µ –Ω–µ–ø—Ä–∏—è—Ç–Ω–æ–µ –º–µ—Å—Ç–æ, –Ω–µ—Ç?

–û–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π: –≤–µ—Å—å –ø—Ä–∏–≤–µ—Ç –∫–∞–∫–æ–π –Ω–µ–ø—Ä–∏—è—Ç–Ω—ã–π –º–µ—Å—Ç–æ –Ω–µ—Ç


In [None]:
with mlflow.start_run(run_name="First dataset"):

    mlflow.set_tag("Dataset_version", cfg.mlflow.dataset_version)

    annotation_dfs = [
        pd.read_json(path) for path in cfg.preprocess.input_files
    ]

    df_annotations = pd.concat(annotation_dfs)
    
    df = pd.DataFrame(columns=["span", "label"])

    for mark in df_annotations['aspect_sentiment']:
        for entry in mark:
            span = entry['text']
            label = entry['labels'][0]
            df.loc[len(df)] = [span, label]

    df['span'] = df['span'].apply(preprocess_text)

    display(df)

    df.to_csv(cfg.preprocess.output.dataset_csv, index=False)

    mlflow.log_artifact(cfg.preprocess.output.dataset_csv, "datasets")

    with open(cfg.preprocess.output.preprocess_pickle, "wb") as f:
        cloudpickle.dump(preprocess_text, f)

    mlflow.log_artifact(cfg.preprocess.output.preprocess_pickle, "functions")
    

Unnamed: 0,span,label
0,–≤–∫—É—Å —Ä–µ–∞–ª—å–Ω–æ –æ—Ç–ª–∏—á–Ω—ã–π,–í–ö–£–°_POSITIVE
1,—è –µ—Å—Ç—å —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ –≥–æ–¥ 2 –Ω–∞–∑–∞–¥ —Å–æ–≤–µ—Ç–æ–≤–∞—Ç...,O
2,—Ö–æ—Ç–µ—Ç—å—Å—è –∫—É–ø–∏—Ç—å –æ—á–µ–Ω—å –ø–æ—Å–ª–µ–¥–Ω–∏–π –≤—Ä–µ–º—è —Å—Ç–∞—Ç—å –æ—á...,O
3,–≤–∫—É—Å –∏–º–µ—Ç—å –∫–∞–∂–¥—ã–π 2 –ø–∞—á–∫–∞,O
4,–≤–∫—É—Å –∫–æ—Ç–æ—Ä—ã–π —Å—Ç–∞—Ç—å –Ω–∞–º–Ω–æ–≥–æ –æ—Å—Ç—Ä—ã–π –æ–±—ã—á–Ω—ã–π,–í–ö–£–°_NEGATIVE
...,...,...
2503,—è—Ä–∫–æ –∫—Ä–∞—Å–∏–≤–æ —Ñ–∏–æ–ª–µ—Ç–æ–≤—ã–π —Ü–≤–µ—Ç –ª—é–±–∏—Ç—å –Ω–æ–≤–æ–≥–æ–¥–Ω–∏–π...,–ü–ê–ß–ö–ê_POSITIVE
2504,—á–∏–ø—Å—ã –¥–æ–≤–æ–ª—å–Ω–æ –∂–∏—Ä–Ω—ã–π –Ω–µ –∫—Ä–∏—Ç–∏—á–Ω–æ –∑–∞–º–µ—Ç–Ω–æ,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2505,—Å–∞–º —á–∏–ø—Å—ã –Ω–µ –∏–¥–µ–∞–ª—å–Ω–æ –∫—Ä—É–≥–ª—ã–π,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2506,–∫—Ä–∞—Å–∏–≤—ã–π –∑–æ–ª–æ—Ç–∏—Å—Ç—ã–π –Ω–µ—Ç –ø—Ä–∏–≥–æ—Ä–µ–ª—ã–π –ø–æ–ª–æ–º–∞—Ç—å –º–∏...,–¢–ï–ö–°–¢–£–†–ê_POSITIVE


üèÉ View run First dataset at: http://127.0.0.1:8080/#/experiments/0/runs/8690112afeac4c7faf0867c5d30d2dd9
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


## –í—Ç–æ—Ä–æ–π —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç (–ü—Ä–æ—Å—Ç–∞—è –æ—á–∏—Å—Ç–∫–∞ –æ—Ç –∑–Ω–∞–∫–æ–≤ –ø—Ä–µ–ø–∏–Ω–∞–Ω–∏—è –∏ –∏–∑–º–µ–Ω–µ–Ω–∏–µ —ç–º–æ–¥–∑–∏)

In [None]:
from functools import partial

cfg = load_config('preprocess_second')

def clean_text_only(text, cfg=None):
    """
    –û—á–∏—Å—Ç–∫–∞ —Ç–µ–∫—Å—Ç–∞ —Å –ø–æ–¥–¥–µ—Ä–∂–∫–æ–π –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–∏
    
    Args:
        text: –í—Ö–æ–¥–Ω–æ–π —Ç–µ–∫—Å—Ç
        cfg: –ö–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—è (–æ–ø—Ü–∏–æ–Ω–∞–ª—å–Ω–æ)
    """
    if not isinstance(text, str):
        return ""
    
    lowercase = getattr(cfg, 'clean_only', {}).get('lowercase', True) if cfg else True
    replace_emoji = getattr(cfg, 'clean_only', {}).get('replace_emoji', True) if cfg else True
    remove_punctuation = getattr(cfg, 'clean_only', {}).get('remove_punctuation', True) if cfg else True
    remove_special_chars = getattr(cfg, 'clean_only', {}).get('remove_special_chars', True) if cfg else True
    collapse_spaces = getattr(cfg, 'clean_only', {}).get('collapse_spaces', True) if cfg else True
    
    if lowercase:
        text = text.lower()
    
    if replace_emoji:
        text = emoji.demojize(text, delimiters=(" ", " "))
    
    if remove_special_chars:
        text = re.sub(r'[\n\r\t]', ' ', text)
    
    if remove_punctuation:
        text = re.sub(r'[^\w\s]', ' ', text)
    
    if collapse_spaces:
        text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def clean_text_only_legacy(text):
    """–õ–µ–≥–∞—Å–∏ –≤–µ—Ä—Å–∏—è –¥–ª—è –æ–±—Ä–∞—Ç–Ω–æ–π —Å–æ–≤–º–µ—Å—Ç–∏–º–æ—Å—Ç–∏"""
    return clean_text_only(text)


with mlflow.start_run(run_name='Second dataset'):
    
    mlflow.set_tag("Dataset_version", cfg.mlflow.dataset_version)

    annotation_dfs = [
        pd.read_json(path) for path in cfg.preprocess.input_files
    ]

    df_annotations = pd.concat(annotation_dfs)
    
    df = pd.DataFrame(columns=["span", "label"])

    for mark in df_annotations['aspect_sentiment']:
        for entry in mark:
            span = entry['text']
            label = entry['labels'][0]
            df.loc[len(df)] = [span, label]

    df['span'] = df['span'].apply(lambda x: clean_text_only(x, cfg))
    
    display(df)

    df.to_csv(cfg.preprocess.output.dataset_csv, index=False)

    mlflow.log_artifact(cfg.preprocess.output.dataset_csv, "datasets")

    clean_text_with_config = partial(clean_text_only, cfg=cfg)
    with open(cfg.preprocess.output.preprocess_pickle, "wb") as f:
        cloudpickle.dump(clean_text_with_config, f)

    mlflow.log_artifact(cfg.preprocess.output.preprocess_pickle, "functions")

Unnamed: 0,span,label
0,–≤–∫—É—Å –±—ã–ª —Ä–µ–∞–ª—å–Ω–æ –æ—Ç–ª–∏—á–Ω—ã–π,–í–ö–£–°_POSITIVE
1,—è –µ–ª —ç—Ç–∏ —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ –µ—â–µ –≥–æ–¥–∞ 2 –Ω–∞–∑–∞–¥ —Å–æ...,O
2,—Ö–æ—Ç–µ–ª–æ—Å—å –∫—É–ø–∏—Ç—å –æ—á–µ–Ω—å –º–Ω–æ–≥–æ –Ω–æ –≤ –ø–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä–µ...,O
3,—Å–µ–π—á–∞—Å —Ç–∞–∫–æ–π –≤–∫—É—Å –∏–º–µ–µ—Ç –∫–∞–∂–¥–∞—è 2 –ø–∞—á–∫–∞,O
4,–≤–∫—É—Å –∫–æ—Ç–æ—Ä—ã—Ö —Å—Ç–∞–ª –Ω–∞–º–Ω–æ–≥–æ –æ—Å—Ç—Ä–µ–µ –æ–±—ã—á–Ω–æ–≥–æ,–í–ö–£–°_NEGATIVE
...,...,...
2503,—è—Ä–∫–æ –∫—Ä–∞—Å–∏–≤–æ —Ñ–∏–æ–ª–µ—Ç–æ–≤—ã–π —Ü–≤–µ—Ç —è –ª—é–±–ª—é –Ω–æ–≤–æ–≥–æ–¥–Ω–µ...,–ü–ê–ß–ö–ê_POSITIVE
2504,—á–∏–ø—Å—ã –¥–æ–≤–æ–ª—å–Ω–æ –∂–∏—Ä–Ω—ã–µ –Ω–µ –∫—Ä–∏—Ç–∏—á–Ω–æ –Ω–æ –∑–∞–º–µ—Ç–Ω–æ,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2505,—Å–∞–º–∏ —á–∏–ø—Å—ã –Ω–µ –∏–¥–µ–∞–ª—å–Ω–æ –∫—Ä—É–≥–ª—ã–µ,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2506,–∫—Ä–∞—Å–∏–≤—ã–µ –∑–æ–ª–æ—Ç–∏—Å—Ç—ã–µ –Ω–µ—Ç –ø—Ä–∏–≥–æ—Ä–µ–ª—ã—Ö –ø–æ–ª–æ–º–∞–Ω–Ω—ã—Ö ...,–¢–ï–ö–°–¢–£–†–ê_POSITIVE


üèÉ View run Second dataset at: http://127.0.0.1:8080/#/experiments/0/runs/ba68188717234d8ea8f01745d4816ea8
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


## –¢—Ä–µ—Ç–∏–π —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç (–ê—É–≥–º–µ–Ω—Ç–∞—Ü–∏—è –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –º–∏–Ω–æ—Ä–Ω—ã—Ö –∫–ª–∞—Å—Å–æ–≤)

In [8]:
with mlflow.start_run(run_name="Third dataset"):

    cfg = load_config("preprocess_third")

    mlflow.set_tag("Dataset_version", cfg.mlflow.dataset_version)


    annotation_dfs = [pd.read_json(path) for path in cfg.preprocess.input_files]
    df_annotations = pd.concat(annotation_dfs, ignore_index=True)


    df = pd.DataFrame(columns=["span", "label"])

    for mark in df_annotations['aspect_sentiment']:
        for entry in mark:
            span = entry['text']
            label = entry['labels'][0]
            df.loc[len(df)] = [span, label]


    class_counts = df['label'].value_counts()
    minority_classes = class_counts[class_counts < class_counts.mean()].index.tolist()

    print(f"Minority classes: {minority_classes}")


    import random

    synonyms = cfg.preprocess.augmentation.synonyms
    n_variants = cfg.preprocess.augmentation.n_variants

    def simple_augmentation(text):
        augmented = []
        for _ in range(n_variants):
            words = text.split()
            new_words = []
            for w in words:
                wl = w.lower()
                if wl in synonyms and random.random() > 0.7:
                    new_words.append(random.choice(synonyms[wl]))
                else:
                    new_words.append(w)
            augmented.append(" ".join(new_words))
        return augmented


    augmented_data = []
    for label in minority_classes:
        samples = df[df['label'] == label]
        for _, row in samples.iterrows():
            for aug_text in simple_augmentation(row["span"]):
                augmented_data.append({"span": aug_text, "label": label})

    df_augmented = pd.DataFrame(augmented_data)


    df_extended = pd.concat([df, df_augmented], ignore_index=True)


    output_path = cfg.preprocess.output.dataset_csv
    df_extended.to_csv(output_path, index=False)

    mlflow.log_artifact(output_path, "datasets")

    print(f"–°–æ—Ö—Ä–∞–Ω–µ–Ω –¥–∞—Ç–∞—Å–µ—Ç —Å –∞—É–≥–º–µ–Ω—Ç–∞—Ü–∏–µ–π: {output_path}")


Minority classes: ['–ü–ê–ß–ö–ê_POSITIVE', '–¢–ï–ö–°–¢–£–†–ê_NEUTRAL', '–ü–ê–ß–ö–ê_NEUTRAL', '–¢–ï–ö–°–¢–£–†–ê_NEGATIVE', '–ü–ê–ß–ö–ê_NEGATIVE']
–°–æ—Ö—Ä–∞–Ω–µ–Ω –¥–∞—Ç–∞—Å–µ—Ç —Å –∞—É–≥–º–µ–Ω—Ç–∞—Ü–∏–µ–π: third_experiment_dataset.csv
üèÉ View run Third dataset at: http://127.0.0.1:8080/#/experiments/0/runs/327f7f34ba914fce83ca6c0e787e8f15
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


# –≠–∫—Å–ø–µ—Ä–µ–º–µ–Ω—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ —Å –º–æ–¥–µ–ª—è–º–∏

## –ü–µ—Ä–≤—ã–π —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç (–ü—Ä–æ—Å—Ç–∞—è –º–æ–¥–µ–ª—å)

–ë—É–¥–µ–º –æ–±—É—á–∞—Ç—å –ø—Ä–æ—Å—Ç—É—é –º–æ–¥–µ–ª—å: "–ù–∞–∏–≤–Ω—ã–π –ë–∞–π–µ—Å–æ–≤—Å–∫–∏–π –ö–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä" –∏–∑ `Sklearn`, –±—É–¥–µ–º –ø—Ä–æ–≤–æ–¥–∏—Ç—å —Ç–µ—Å—Ç –Ω–∞ 3 –≤–µ—Ä—Å–∏—è—Ö –¥–∞—Ç–∞—Å–µ—Ç–∞ –∏ –ø–æ–π–º–µ–º, –∫–∞–∫–∞—è –º–æ–¥–µ–ª—å –ª—É—á—à–µ —Å–µ–±—è –ø–æ–∫–∞–∂–µ—Ç –ø—Ä–∏ —Ä–∞–±–æ—Ç–µ —Å —Ç–µ–º –∏–ª–∏ –∏–Ω—ã–º –¥–∞—Ç–∞—Å–µ—Ç–æ–º

### –ü–µ—Ä–≤—ã–π –¥–∞—Ç–∞—Å–µ—Ç

In [None]:
cfg = load_config("naive_bayes_first")

with mlflow.start_run(run_name='first_model_experiment'):

    mlflow.set_tag('NaiveBayes', cfg.model.version)

    # =====================================================================================================================================
    #                                         –°–ß–ò–¢–´–í–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê
    # =====================================================================================================================================

    client = MlflowClient()

    dataset_runs = client.search_runs(
        experiment_ids=[cfg.mlflow.experiment_id],
        filter_string=f"tags.mlflow.runName = '{cfg.data.source_run}'",
        order_by=['attributes.end_time desc']
    )

    first_dataset_latest_run = dataset_runs[0]
    first_dataset_latest_run_id = first_dataset_latest_run.info.run_id

    # –ò—Å–ø–æ–ª—å–∑—É–µ–º –ø—É—Ç–∏ –∏–∑ –∫–æ–Ω—Ñ–∏–≥–∞
    full_dataset_path = f"{cfg.data.dataset_path}/{cfg.data.dataset_file}"
    
    try:
        dataframe_path = client.download_artifacts(first_dataset_latest_run_id, full_dataset_path)
        df = pd.read_csv(dataframe_path)
        print(f"‚úÖ –£—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω: {full_dataset_path}")
    except Exception as e:
        print(f"‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å {full_dataset_path}: {e}")

        try:
            dataframe_path = client.download_artifacts(first_dataset_latest_run_id, cfg.data.dataset_file)
            df = pd.read_csv(dataframe_path)
            print(f"‚úÖ –£—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω: {cfg.data.dataset_file}")
        except:
            raise ValueError(f"–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –¥–∞—Ç–∞—Å–µ—Ç: {full_dataset_path}")
    
    display(df)

    # =====================================================================================================================================
    #                                              –í–ï–ö–¢–û–†–ò–ó–ê–¶–ò–Ø –¢–ï–ö–°–¢–ê
    # =====================================================================================================================================

    vectorizer = TfidfVectorizer(
        lowercase=cfg.vectorizer.lowercase,
        analyzer=cfg.vectorizer.analyzer,
        max_features=cfg.vectorizer.max_features,
        ngram_range=tuple(cfg.vectorizer.ngram_range),
        min_df=cfg.vectorizer.min_df,
        max_df=cfg.vectorizer.max_df
    )
    
    encoder = LabelEncoder()

    # =====================================================================================================================================
    #                                         –†–ê–ó–ë–ò–ï–ù–ò–ï –ù–ê TRAIN/TEST
    # =====================================================================================================================================

    df_train, df_test = train_test_split(
        df, 
        test_size=cfg.training.test_size, 
        random_state=cfg.training.random_state, 
        stratify=df['label']
    )

    X_train = df_train['span']
    X_train = vectorizer.fit_transform(X_train)

    y_train = df_train['label']
    y_train = encoder.fit_transform(y_train)

    X_test = df_test['span']
    X_test = vectorizer.transform(X_test)

    y_test = df_test['label']
    y_test = encoder.transform(y_test)

    # =====================================================================================================================================
    #                                         –û–ë–£–ß–ï–ù–ò–ï –ò –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ò
    # =====================================================================================================================================


    mlflow.sklearn.autolog(disable=True)
    
    model = MultinomialNB()

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)

    print(f'f1-score —Ä–∞–≤–µ–Ω {f1:.4f}')
    print(f'accuracy-score —Ä–∞–≤–µ–Ω {accuracy:.4f}')

    joblib.dump(model, cfg.model.artifacts.model)
    joblib.dump(vectorizer, cfg.model.artifacts.vectorizer)
    joblib.dump(encoder, cfg.model.artifacts.encoder)
    
    mlflow.log_artifact(cfg.model.artifacts.model, 'models')
    mlflow.log_artifact(cfg.model.artifacts.vectorizer, 'models')
    mlflow.log_artifact(cfg.model.artifacts.encoder, 'models')
    
    mlflow.log_metrics({
        'f1_score': f1,
        'accuracy': accuracy
    })

    mlflow.log_params({
        'vectorizer_max_features': cfg.vectorizer.max_features,
        'vectorizer_ngram_range': str(cfg.vectorizer.ngram_range),
        'test_size': cfg.training.test_size
    })

    print("–ú–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!")

Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  5.60it/s]


‚úÖ –£—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω: datasets/first_experiment_dataset.csv


Unnamed: 0,span,label
0,–≤–∫—É—Å —Ä–µ–∞–ª—å–Ω–æ –æ—Ç–ª–∏—á–Ω—ã–π,–í–ö–£–°_POSITIVE
1,—è –µ—Å—Ç—å —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ –≥–æ–¥ 2 –Ω–∞–∑–∞–¥ —Å–æ–≤–µ—Ç–æ–≤–∞—Ç...,O
2,—Ö–æ—Ç–µ—Ç—å—Å—è –∫—É–ø–∏—Ç—å –æ—á–µ–Ω—å –ø–æ—Å–ª–µ–¥–Ω–∏–π –≤—Ä–µ–º—è —Å—Ç–∞—Ç—å –æ—á...,O
3,–≤–∫—É—Å –∏–º–µ—Ç—å –∫–∞–∂–¥—ã–π 2 –ø–∞—á–∫–∞,O
4,–≤–∫—É—Å –∫–æ—Ç–æ—Ä—ã–π —Å—Ç–∞—Ç—å –Ω–∞–º–Ω–æ–≥–æ –æ—Å—Ç—Ä—ã–π –æ–±—ã—á–Ω—ã–π,–í–ö–£–°_NEGATIVE
...,...,...
2503,—è—Ä–∫–æ –∫—Ä–∞—Å–∏–≤–æ —Ñ–∏–æ–ª–µ—Ç–æ–≤—ã–π —Ü–≤–µ—Ç –ª—é–±–∏—Ç—å –Ω–æ–≤–æ–≥–æ–¥–Ω–∏–π...,–ü–ê–ß–ö–ê_POSITIVE
2504,—á–∏–ø—Å—ã –¥–æ–≤–æ–ª—å–Ω–æ –∂–∏—Ä–Ω—ã–π –Ω–µ –∫—Ä–∏—Ç–∏—á–Ω–æ –∑–∞–º–µ—Ç–Ω–æ,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2505,—Å–∞–º —á–∏–ø—Å—ã –Ω–µ –∏–¥–µ–∞–ª—å–Ω–æ –∫—Ä—É–≥–ª—ã–π,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2506,–∫—Ä–∞—Å–∏–≤—ã–π –∑–æ–ª–æ—Ç–∏—Å—Ç—ã–π –Ω–µ—Ç –ø—Ä–∏–≥–æ—Ä–µ–ª—ã–π –ø–æ–ª–æ–º–∞—Ç—å –º–∏...,–¢–ï–ö–°–¢–£–†–ê_POSITIVE


f1-score —Ä–∞–≤–µ–Ω 0.3869
accuracy-score —Ä–∞–≤–µ–Ω 0.4422
–ú–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!
üèÉ View run first_model_experiment at: http://127.0.0.1:8080/#/experiments/0/runs/64117c4a566b416fb2c5adeafb5de204
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


#### –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤

In [None]:
cfg_inference = load_config("inference_bayes_first")

client = MlflowClient()

# ===========================================================================================
# –ü–û–õ–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================


latest_run_model = client.search_runs(
    experiment_ids=[cfg_inference.mlflow.experiment_id],
    filter_string=f'tags.mlflow.runName = "{cfg_inference.model.run_name}"',
    order_by=['attributes.end_time desc']
)

latest_run_model_id = latest_run_model[0].info.run_id
print(f"Run ID –º–æ–¥–µ–ª–∏: {latest_run_model_id}")


vectorizer_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.vectorizer}"
bayes_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.bayes}" 
encoder_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.encoder}"

try:
    vectorizer_file = client.download_artifacts(latest_run_model_id, vectorizer_path)
    bayes_file = client.download_artifacts(latest_run_model_id, bayes_path)
    encoder_file = client.download_artifacts(latest_run_model_id, encoder_path)
    print("‚úÖ –ú–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")
except Exception as e:
    print(f"‚ùå –û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –º–æ–¥–µ–ª–µ–π: {e}")

    try:
        vectorizer_file = client.download_artifacts(latest_run_model_id, cfg_inference.model.vectorizer)
        bayes_file = client.download_artifacts(latest_run_model_id, cfg_inference.model.bayes)
        encoder_file = client.download_artifacts(latest_run_model_id, cfg_inference.model.encoder)
        print("‚úÖ –ú–æ–¥–µ–ª–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã (fallback)")
    except:
        raise ValueError("–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –º–æ–¥–µ–ª–∏")

# ===========================================================================================
# –ü–û–õ–£–ß–ï–ù–ò–ï –§–£–ù–ö–¶–ò–ò –ü–†–ï–ü–†–û–¶–ï–°–°–ò–ù–ì–ê
# ===========================================================================================


latest_run_dataset = client.search_runs(
    experiment_ids=[cfg_inference.mlflow.experiment_id],
    filter_string=f'tags.mlflow.runName = "{cfg_inference.preprocess.run_name}"',
    order_by=['attributes.end_time desc']
)

latest_run_dataset_id = latest_run_dataset[0].info.run_id

try:
    art_loc = client.download_artifacts(latest_run_dataset_id, cfg_inference.preprocess.artifact_path)
    print("‚úÖ –§—É–Ω–∫—Ü–∏—è –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥–∞ –∑–∞–≥—Ä—É–∂–µ–Ω–∞")
except Exception as e:
    print(f"‚ùå –û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ —Ñ—É–Ω–∫—Ü–∏–∏: {e}")
    raise

# ===========================================================================================
# –ó–ê–ì–†–£–ó–ö–ê –ò –ò–°–ü–û–õ–¨–ó–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================


vectorizer = joblib.load(vectorizer_file)
bayes = joblib.load(bayes_file)
encoder = joblib.load(encoder_file)


with open(art_loc, 'rb') as f:
    preprocess_func = cloudpickle.load(f)

# ===========================================================================================
# –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ï
# ===========================================================================================

text = cfg_inference.test_text

preprocessed_text = preprocess_func(text)
done_text = vectorizer.transform([preprocessed_text])
label = bayes.predict(done_text)

print('=' * 100)
print(f'–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "{text}"')
print(f'–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç: "{preprocessed_text}"')
print(f'–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {encoder.inverse_transform(label)[0]}')

Run ID –º–æ–¥–µ–ª–∏: 64117c4a566b416fb2c5adeafb5de204


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  8.27it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  6.86it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 20.86it/s]


‚úÖ –ú–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 29.66it/s]

‚úÖ –§—É–Ω–∫—Ü–∏—è –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥–∞ –∑–∞–≥—Ä—É–∂–µ–Ω–∞
–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "–ú–µ–Ω—è –ø—Ä–∏–≤–ª–µ–∫–ª–æ —Ç–∞–∫–æ–µ —Å–æ—á–µ—Ç–∞–Ω–∏–µ"
–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç: "—è –ø—Ä–∏–≤–ª–µ—á—å —Ç–∞–∫–æ–π —Å–æ—á–µ—Ç–∞–Ω–∏–µ"
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: –í–ö–£–°_POSITIVE





### –í—Ç–æ—Ä–æ–π –¥–∞—Ç–∞—Å–µ—Ç

In [11]:

cfg = load_config("naive_bayes_second")

with mlflow.start_run(run_name='second_model_experiment'):

    mlflow.set_tag('NaiveBayes', cfg.model.version)

    # =====================================================================================================================================
    #                                         –°–ß–ò–¢–´–í–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê
    # =====================================================================================================================================

    client = MlflowClient()

    dataset_runs = client.search_runs(
        experiment_ids=[cfg.mlflow.experiment_id],
        filter_string=f"tags.mlflow.runName = '{cfg.data.source_run}'",
        order_by=['attributes.end_time desc']
    )

    first_dataset_latest_run = dataset_runs[0]
    first_dataset_latest_run_id = first_dataset_latest_run.info.run_id


    try:
        dataframe_path = client.download_artifacts(first_dataset_latest_run_id, "datasets/second_experiment_dataset.csv")
        df = pd.read_csv(dataframe_path)
        print("‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –Ω–∞–ø—Ä—è–º—É—é")
    except:
        try:
            files = client.list_artifacts(first_dataset_latest_run_id, cfg.data.dataset_path)
            dataframe = files[0].path
            dataframe_path = client.download_artifacts(first_dataset_latest_run_id, dataframe)
            df = pd.read_csv(dataframe_path)
            print("‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω —á–µ—Ä–µ–∑ list_artifacts")
        except:
            raise ValueError("–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –¥–∞—Ç–∞—Å–µ—Ç")

    display(df)

    # =====================================================================================================================================
    #                                              –í–ï–ö–¢–û–†–ò–ó–ê–¶–ò–Ø –¢–ï–ö–°–¢–ê
    # =====================================================================================================================================

    vectorizer = TfidfVectorizer(
        lowercase=cfg.vectorizer.lowercase,
        analyzer=cfg.vectorizer.analyzer,
        max_features=cfg.vectorizer.max_features,
        ngram_range=tuple(cfg.vectorizer.ngram_range),
        min_df=cfg.vectorizer.min_df,
        max_df=cfg.vectorizer.max_df
    )
    
    encoder = LabelEncoder()

    # =====================================================================================================================================
    #                                         –†–ê–ó–ë–ò–ï–ù–ò–ï –ù–ê TRAIN/TEST
    # =====================================================================================================================================

    df_train, df_test = train_test_split(
        df, 
        test_size=cfg.training.test_size, 
        random_state=cfg.training.random_state, 
        stratify=df['label']
    )

    X_train = df_train['span']
    X_train = vectorizer.fit_transform(X_train)

    y_train = df_train['label']
    y_train = encoder.fit_transform(y_train)

    X_test = df_test['span']
    X_test = vectorizer.transform(X_test)

    y_test = df_test['label']
    y_test = encoder.transform(y_test)

    # =====================================================================================================================================
    #                                         –û–ë–£–ß–ï–ù–ò–ï –ò –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ò
    # =====================================================================================================================================

    mlflow.sklearn.autolog(disable=True)

    model = MultinomialNB()

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)

    print(f'f1-score —Ä–∞–≤–µ–Ω {f1:.4f}')
    print(f'accuracy-score —Ä–∞–≤–µ–Ω {accuracy:.4f}')

    joblib.dump(model, cfg.model.artifacts.model)
    joblib.dump(vectorizer, cfg.model.artifacts.vectorizer)
    joblib.dump(encoder, cfg.model.artifacts.encoder)
    
    mlflow.log_artifact(cfg.model.artifacts.model, 'models')
    mlflow.log_artifact(cfg.model.artifacts.vectorizer, 'models')
    mlflow.log_artifact(cfg.model.artifacts.encoder, 'models')
    
    mlflow.log_metrics({
        'f1_score': f1,
        'accuracy': accuracy
    })

    mlflow.log_params({
        'vectorizer_max_features': cfg.vectorizer.max_features,
        'vectorizer_ngram_range': str(cfg.vectorizer.ngram_range),
        'test_size': cfg.training.test_size
    })

    print("–í—Ç–æ—Ä–∞—è –º–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!")

Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  7.38it/s]


‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –Ω–∞–ø—Ä—è–º—É—é


Unnamed: 0,span,label
0,–≤–∫—É—Å –±—ã–ª —Ä–µ–∞–ª—å–Ω–æ –æ—Ç–ª–∏—á–Ω—ã–π,–í–ö–£–°_POSITIVE
1,—è –µ–ª —ç—Ç–∏ —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ –µ—â–µ –≥–æ–¥–∞ 2 –Ω–∞–∑–∞–¥ —Å–æ...,O
2,—Ö–æ—Ç–µ–ª–æ—Å—å –∫—É–ø–∏—Ç—å –æ—á–µ–Ω—å –º–Ω–æ–≥–æ –Ω–æ –≤ –ø–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä–µ...,O
3,—Å–µ–π—á–∞—Å —Ç–∞–∫–æ–π –≤–∫—É—Å –∏–º–µ–µ—Ç –∫–∞–∂–¥–∞—è 2 –ø–∞—á–∫–∞,O
4,–≤–∫—É—Å –∫–æ—Ç–æ—Ä—ã—Ö —Å—Ç–∞–ª –Ω–∞–º–Ω–æ–≥–æ –æ—Å—Ç—Ä–µ–µ –æ–±—ã—á–Ω–æ–≥–æ,–í–ö–£–°_NEGATIVE
...,...,...
2503,—è—Ä–∫–æ –∫—Ä–∞—Å–∏–≤–æ —Ñ–∏–æ–ª–µ—Ç–æ–≤—ã–π —Ü–≤–µ—Ç —è –ª—é–±–ª—é –Ω–æ–≤–æ–≥–æ–¥–Ω–µ...,–ü–ê–ß–ö–ê_POSITIVE
2504,—á–∏–ø—Å—ã –¥–æ–≤–æ–ª—å–Ω–æ –∂–∏—Ä–Ω—ã–µ –Ω–µ –∫—Ä–∏—Ç–∏—á–Ω–æ –Ω–æ –∑–∞–º–µ—Ç–Ω–æ,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2505,—Å–∞–º–∏ —á–∏–ø—Å—ã –Ω–µ –∏–¥–µ–∞–ª—å–Ω–æ –∫—Ä—É–≥–ª—ã–µ,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2506,–∫—Ä–∞—Å–∏–≤—ã–µ –∑–æ–ª–æ—Ç–∏—Å—Ç—ã–µ –Ω–µ—Ç –ø—Ä–∏–≥–æ—Ä–µ–ª—ã—Ö –ø–æ–ª–æ–º–∞–Ω–Ω—ã—Ö ...,–¢–ï–ö–°–¢–£–†–ê_POSITIVE


f1-score —Ä–∞–≤–µ–Ω 0.3675
accuracy-score —Ä–∞–≤–µ–Ω 0.4223
–í—Ç–æ—Ä–∞—è –º–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!
üèÉ View run second_model_experiment at: http://127.0.0.1:8080/#/experiments/0/runs/2414d58064a146a69e694625b0041c82
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


#### –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤

In [None]:
cfg_inference = load_config("inference_bayes_second")

client = MlflowClient()

# ===========================================================================================
# –ü–û–õ–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================


latest_run_model = client.search_runs(
    experiment_ids=[cfg_inference.mlflow.experiment_id],
    filter_string=f'tags.mlflow.runName = "{cfg_inference.model.run_name}"', 
    order_by=['attributes.end_time desc']
)

if not latest_run_model:
    raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg_inference.model.run_name}")

latest_run_model_id = latest_run_model[0].info.run_id
print(f"Run ID –º–æ–¥–µ–ª–∏: {latest_run_model_id}")


vectorizer_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.vectorizer}"
bayes_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.bayes}"
encoder_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.encoder}"

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...")
try:
    vectorizer_file = client.download_artifacts(latest_run_model_id, vectorizer_path)
    bayes_file = client.download_artifacts(latest_run_model_id, bayes_path)
    encoder_file = client.download_artifacts(latest_run_model_id, encoder_path)
    print("–ú–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")
except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –º–æ–¥–µ–ª–µ–π: {e}")
    raise

# ===========================================================================================
# –ü–û–õ–£–ß–ï–ù–ò–ï –§–£–ù–ö–¶–ò–ò –ü–†–ï–ü–†–û–¶–ï–°–°–ò–ù–ì–ê
# ===========================================================================================


latest_run_dataset = client.search_runs(
    experiment_ids=[cfg_inference.mlflow.experiment_id],
    filter_string=f'tags.mlflow.runName = "{cfg_inference.preprocess.run_name}"',
    order_by=['attributes.end_time desc']
)

if not latest_run_dataset:
    raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg_inference.preprocess.run_name}")

latest_run_dataset_id = latest_run_dataset[0].info.run_id

print("–ó–∞–≥—Ä—É–∂–∞–µ–º —Ñ—É–Ω–∫—Ü–∏—é –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥–∞...")
try:
    art_loc = client.download_artifacts(latest_run_dataset_id, cfg_inference.preprocess.artifact_path)
    print("–§—É–Ω–∫—Ü–∏—è –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥–∞ –∑–∞–≥—Ä—É–∂–µ–Ω–∞")
except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ —Ñ—É–Ω–∫—Ü–∏–∏: {e}")
    raise

# ===========================================================================================
# –ó–ê–ì–†–£–ó–ö–ê –ò –ò–°–ü–û–õ–¨–ó–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...")
vectorizer = joblib.load(vectorizer_file)
bayes = joblib.load(bayes_file)
encoder = joblib.load(encoder_file)

print("–ó–∞–≥—Ä—É–∂–∞–µ–º —Ñ—É–Ω–∫—Ü–∏—é –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥–∞...")
with open(art_loc, 'rb') as f:
    preprocess_func = cloudpickle.load(f)

# ===========================================================================================
# –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ï
# ===========================================================================================

text = cfg_inference.test_text

print("–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...")
preprocessed_text = preprocess_func(text)
done_text = vectorizer.transform([preprocessed_text])
label = bayes.predict(done_text)

print('=' * 100)
print(f'–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "{text}"')
print(f'–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç: "{preprocessed_text}"')
print(f'–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {encoder.inverse_transform(label)[0]}')

Run ID –º–æ–¥–µ–ª–∏: 2414d58064a146a69e694625b0041c82
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  7.57it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  8.45it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 22.97it/s]


‚úÖ –ú–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∂–∞–µ–º —Ñ—É–Ω–∫—Ü–∏—é –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥–∞...


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 43.05it/s]

‚úÖ –§—É–Ω–∫—Ü–∏—è –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥–∞ –∑–∞–≥—Ä—É–∂–µ–Ω–∞
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...
–ó–∞–≥—Ä—É–∂–∞–µ–º —Ñ—É–Ω–∫—Ü–∏—é –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥–∞...
–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...
–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "–ú–µ–Ω—è –Ω–µ –ø—Ä–∏–≤–ª–µ–∫–ª–æ —Ç–∞–∫–æ–µ —Å–æ—á–µ—Ç–∞–Ω–∏–µ üò≠"
–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç: "–º–µ–Ω—è –Ω–µ –ø—Ä–∏–≤–ª–µ–∫–ª–æ —Ç–∞–∫–æ–µ —Å–æ—á–µ—Ç–∞–Ω–∏–µ loudly_crying_face"
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: –í–ö–£–°_NEGATIVE





### –¢—Ä–µ—Ç–∏–π –¥–∞—Ç–∞—Å–µ—Ç

In [None]:
cfg = load_config("naive_bayes_third")

with mlflow.start_run(run_name='third_model_experiment'):

    mlflow.set_tag('NaiveBayes', cfg.model.version)

    # =====================================================================================================================================
    #                                         –°–ß–ò–¢–´–í–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê
    # =====================================================================================================================================

    client = MlflowClient()

    dataset_runs = client.search_runs(
        experiment_ids=[cfg.mlflow.experiment_id],
        filter_string=f"tags.mlflow.runName = '{cfg.data.source_run}'",
        order_by=['attributes.end_time desc']
    )

    if not dataset_runs:
        raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg.data.source_run}")

    dataset_run = dataset_runs[0]
    dataset_run_id = dataset_run.info.run_id
    print(f"–ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: {dataset_run_id}")


    try:
        dataframe_path = client.download_artifacts(dataset_run_id, f"datasets/{cfg.data.dataset_file}")
        df = pd.read_csv(dataframe_path)
        print("–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –Ω–∞–ø—Ä—è–º—É—é")
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {e}")
        alternative_paths = [
            cfg.data.dataset_file,
            f"artifacts/datasets/{cfg.data.dataset_file}",
            "third_experiment_dataset.csv"
        ]
        
        for path in alternative_paths:
            try:
                dataframe_path = client.download_artifacts(dataset_run_id, path)
                df = pd.read_csv(dataframe_path)
                print(f"–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: {path}")
                break
            except:
                continue
        else:
            raise ValueError("–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –¥–∞—Ç–∞—Å–µ—Ç")

    display(df)

    # =====================================================================================================================================
    #                                              –í–ï–ö–¢–û–†–ò–ó–ê–¶–ò–Ø –¢–ï–ö–°–¢–ê
    # =====================================================================================================================================

    vectorizer = TfidfVectorizer(
        lowercase=cfg.vectorizer.lowercase,
        analyzer=cfg.vectorizer.analyzer,
        max_features=cfg.vectorizer.max_features,
        ngram_range=tuple(cfg.vectorizer.ngram_range),
        min_df=cfg.vectorizer.min_df,
        max_df=cfg.vectorizer.max_df
    )
    
    encoder = LabelEncoder()

    # =====================================================================================================================================
    #                                         –†–ê–ó–ë–ò–ï–ù–ò–ï –ù–ê TRAIN/TEST
    # =====================================================================================================================================

    df_train, df_test = train_test_split(
        df, 
        test_size=cfg.training.test_size, 
        random_state=cfg.training.random_state, 
        stratify=df['label']
    )

    X_train = df_train['span']
    X_train = vectorizer.fit_transform(X_train)

    y_train = df_train['label']
    y_train = encoder.fit_transform(y_train)

    X_test = df_test['span']
    X_test = vectorizer.transform(X_test)

    y_test = df_test['label']
    y_test = encoder.transform(y_test)

    # =====================================================================================================================================
    #                                         –û–ë–£–ß–ï–ù–ò–ï –ò –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ò
    # =====================================================================================================================================

    mlflow.sklearn.autolog(disable=True)

    model = MultinomialNB()

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)

    print(f'f1-score —Ä–∞–≤–µ–Ω {f1:.4f}')
    print(f'accuracy-score —Ä–∞–≤–µ–Ω {accuracy:.4f}')

    joblib.dump(model, cfg.model.artifacts.model)
    joblib.dump(vectorizer, cfg.model.artifacts.vectorizer)
    joblib.dump(encoder, cfg.model.artifacts.encoder)
    
    mlflow.log_artifact(cfg.model.artifacts.model, 'models')
    mlflow.log_artifact(cfg.model.artifacts.vectorizer, 'models')
    mlflow.log_artifact(cfg.model.artifacts.encoder, 'models')
    
    mlflow.log_metrics({
        'f1_score': f1,
        'accuracy': accuracy
    })

    mlflow.log_params({
        'vectorizer_max_features': cfg.vectorizer.max_features,
        'vectorizer_ngram_range': str(cfg.vectorizer.ngram_range),
        'test_size': cfg.training.test_size
    })

    print("–¢—Ä–µ—Ç—å—è –º–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!")

‚úÖ –ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: 327f7f34ba914fce83ca6c0e787e8f15


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  5.73it/s]


‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –Ω–∞–ø—Ä—è–º—É—é


Unnamed: 0,span,label
0,–≤–∫—É—Å –±—ã–ª —Ä–µ–∞–ª—å–Ω–æ –æ—Ç–ª–∏—á–Ω—ã–π,–í–ö–£–°_POSITIVE
1,"–Ø –µ–ª —ç—Ç–∏ —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ, –µ—â–µ –≥–æ–¥–∞ 2 –Ω–∞–∑–∞–¥ —Å...",O
2,"—Ö–æ—Ç–µ–ª–æ—Å—å –∫—É–ø–∏—Ç—å –æ—á–µ–Ω—å –º–Ω–æ–≥–æ, –Ω–æ –≤ –ø–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä...",O
3,—Å–µ–π—á–∞—Å —Ç–∞–∫–æ–π –≤–∫—É—Å –∏–º–µ–µ—Ç –∫–∞–∂–¥–∞—è 2 –ø–∞—á–∫–∞,O
4,–≤–∫—É—Å –∫–æ—Ç–æ—Ä—ã—Ö —Å—Ç–∞–ª –ù–ê–ú–ù–û–ì–û –æ—Å—Ç—Ä–µ–µ –æ–±—ã—á–Ω–æ–≥–æ,–í–ö–£–°_NEGATIVE
...,...,...
4015,"—Å —á–∏–ø—Å–∞–º–∏ –≤—Å–µ–≥–¥–∞ —Å—Ç–æ–∏—Ç –ø–æ–º–Ω–∏—Ç—å, —á—Ç–æ –¥–≤–µ —Ç—Ä–µ—Ç–∏ ...",–ü–ê–ß–ö–ê_NEGATIVE
4016,"–ü–ª–∞—Å—Ç–∏–∫–æ–≤–∞—è –∫—Ä—ã—à–∫–∞ –ø–æ–ø–∞–ª–∞—Å—å –æ—á–µ–Ω—å —Ç—É–≥–∞—è, –µ–ª–µ-–µ...",–ü–ê–ß–ö–ê_NEGATIVE
4017,"–ü–ª–∞—Å—Ç–∏–∫–æ–≤–∞—è –∫—Ä—ã—à–∫–∞ –ø–æ–ø–∞–ª–∞—Å—å –æ—á–µ–Ω—å —Ç—É–≥–∞—è, –µ–ª–µ-–µ...",–ü–ê–ß–ö–ê_NEGATIVE
4018,–ù–∞–ø–æ–ª–Ω–µ–∏–Ω–µ –ø–∞—á–∫–∏ —Ç–∏–ø–∏—á–Ω–æ–µ –¥–ª—è –¥–∞–Ω–Ω–æ–≥–æ –ø—Ä–æ–∏–∑–≤–æ–¥...,–ü–ê–ß–ö–ê_NEGATIVE


f1-score —Ä–∞–≤–µ–Ω 0.5517
accuracy-score —Ä–∞–≤–µ–Ω 0.5958
–¢—Ä–µ—Ç—å—è –º–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!
üèÉ View run third_model_experiment at: http://127.0.0.1:8080/#/experiments/0/runs/b5cc3981bcdd47d2ae00e73f194e41d0
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


#### –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ—ã

In [None]:
cfg_inference = load_config("inference_bayes_third")

client = MlflowClient()

# ===========================================================================================
# –ü–û–õ–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================


latest_run_model = client.search_runs(
    experiment_ids=[cfg_inference.mlflow.experiment_id],
    filter_string=f'tags.mlflow.runName = "{cfg_inference.model.run_name}"',
    order_by=['attributes.end_time desc']
)

if not latest_run_model:
    raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg_inference.model.run_name}")

latest_run_model_id = latest_run_model[0].info.run_id
print(f"Run ID –º–æ–¥–µ–ª–∏: {latest_run_model_id}")


vectorizer_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.vectorizer}"
bayes_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.bayes}"
encoder_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.encoder}"

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...")
try:
    vectorizer_file = client.download_artifacts(latest_run_model_id, vectorizer_path)
    bayes_file = client.download_artifacts(latest_run_model_id, bayes_path)
    encoder_file = client.download_artifacts(latest_run_model_id, encoder_path)
    print("–ú–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")
except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –º–æ–¥–µ–ª–µ–π: {e}")
    raise


# ===========================================================================================
# –ó–ê–ì–†–£–ó–ö–ê –ò –ò–°–ü–û–õ–¨–ó–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...")
vectorizer = joblib.load(vectorizer_file)
bayes = joblib.load(bayes_file)
encoder = joblib.load(encoder_file)

# ===========================================================================================
# –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ï
# ===========================================================================================

text = cfg_inference.test_text

print("–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...")
done_text = vectorizer.transform([text])
label = bayes.predict(done_text)

print('=' * 100)
print(f'–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "{text}"')
print(f'–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç: "{preprocessed_text}"')
print(f'–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {encoder.inverse_transform(label)[0]}')
print('=' * 100)

Run ID –º–æ–¥–µ–ª–∏: b5cc3981bcdd47d2ae00e73f194e41d0
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  8.34it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  7.17it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 22.70it/s]


‚úÖ –ú–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...
–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...
–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "–ú–µ–Ω—è –ø—Ä–∏–≤–ª–µ–∫–ª–æ —Ç–∞–∫–æ–µ —Å–æ—á–µ—Ç–∞–Ω–∏–µ"
–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç: "–º–µ–Ω—è –Ω–µ –ø—Ä–∏–≤–ª–µ–∫–ª–æ —Ç–∞–∫–æ–µ —Å–æ—á–µ—Ç–∞–Ω–∏–µ loudly_crying_face"
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: –ü–ê–ß–ö–ê_POSITIVE


### –ò—Ç–æ–≥

In [15]:
results = pd.DataFrame({
    '–í–µ—Ä—Å–∏—è': ['–í–µ—Ä—Å–∏—è 1', '–í–µ—Ä—Å–∏—è 2', '–í–µ—Ä—Å–∏—è 3'],
    'F1-Score': [0.363707, 0.344306, 0.344139],
    'Accuracy': [0.413043, 0.391304, 0.391304],
    '–û–ø–∏—Å–∞–Ω–∏–µ': ['–õ–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è, —É–¥–∞–ª–µ–Ω–∏–µ –∑–Ω–∞–∫–æ–≤ –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏', '–û—á–∏—Å—Ç–∫–∞ –æ—Ç –∑–Ω–∞–∫–æ–≤ –ø—Ä–µ–ø–∏–Ω–∞–Ω–∏—è –∏ emoji to text', '–ù–∏–∫–∞–∫–æ–π –æ—á–∏—Å—Ç–∫–∏']
})

results

Unnamed: 0,–í–µ—Ä—Å–∏—è,F1-Score,Accuracy,–û–ø–∏—Å–∞–Ω–∏–µ
0,–í–µ—Ä—Å–∏—è 1,0.363707,0.413043,"–õ–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è, —É–¥–∞–ª–µ–Ω–∏–µ –∑–Ω–∞–∫–æ–≤ –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏"
1,–í–µ—Ä—Å–∏—è 2,0.344306,0.391304,–û—á–∏—Å—Ç–∫–∞ –æ—Ç –∑–Ω–∞–∫–æ–≤ –ø—Ä–µ–ø–∏–Ω–∞–Ω–∏—è –∏ emoji to text
2,–í–µ—Ä—Å–∏—è 3,0.344139,0.391304,–ù–∏–∫–∞–∫–æ–π –æ—á–∏—Å—Ç–∫–∏


## –í—Ç–æ—Ä–æ–π —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç (Keras)

### –ü–µ—Ä–≤—ã–π –¥–∞—Ç–∞—Å–µ—Ç

In [None]:

cfg = load_config("neural_network_first")

with mlflow.start_run(run_name='first_experiment_neural_network'):
    
    mlflow.set_tag('LSTM', cfg.model.version)

    # =====================================================================================================================================
    #                                         –°–ß–ò–¢–´–í–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê
    # =====================================================================================================================================

    client = MlflowClient()

    dataset_runs = client.search_runs(
        experiment_ids=[cfg.mlflow.experiment_id],
        filter_string=f"tags.mlflow.runName = '{cfg.data.source_run}'",
        order_by=['attributes.end_time desc']
    )

    if not dataset_runs:
        raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg.data.source_run}")

    dataset_run = dataset_runs[0]
    dataset_run_id = dataset_run.info.run_id
    print(f"‚úÖ –ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: {dataset_run_id}")

    try:
        dataframe_path = client.download_artifacts(dataset_run_id, f"{cfg.data.dataset_path}/{cfg.data.dataset_file}")
        df = pd.read_csv(dataframe_path)
        print("–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –Ω–∞–ø—Ä—è–º—É—é")
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {e}")
        try:
            files = client.list_artifacts(dataset_run_id, cfg.data.dataset_path)
            dataframe = files[0].path
            dataframe_path = client.download_artifacts(dataset_run_id, dataframe)
            df = pd.read_csv(dataframe_path)
            print("–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω —á–µ—Ä–µ–∑ list_artifacts")
        except Exception as e2:
            print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ —á–µ—Ä–µ–∑ list_artifacts: {e2}")
            alternative_paths = [
                cfg.data.dataset_file,
                f"artifacts/{cfg.data.dataset_path}/{cfg.data.dataset_file}",
                "First_version.csv"
            ]
            
            for path in alternative_paths:
                try:
                    dataframe_path = client.download_artifacts(dataset_run_id, path)
                    df = pd.read_csv(dataframe_path)
                    print(f"‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: {path}")
                    break
                except:
                    continue
            else:
                raise ValueError("–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –¥–∞—Ç–∞—Å–µ—Ç")

    display(df)
    
    # =====================================================================================================================================
    #                                              –í–ï–ö–¢–û–†–ò–ó–ê–¶–ò–Ø –¢–ï–ö–°–¢–ê
    # =====================================================================================================================================

    mlflow.tensorflow.autolog()

    tokenizer = Tokenizer(
        num_words=cfg.tokenizer.num_words,
        oov_token=cfg.tokenizer.oov_token,
        filters=cfg.tokenizer.filters,
        lower=cfg.tokenizer.lower,
        split=cfg.tokenizer.split,
        char_level=cfg.tokenizer.char_level
    )

    tokenizer.fit_on_texts(df['span'])

    df_train, df_temp = train_test_split(
        df, 
        test_size=cfg.training.test_size, 
        random_state=cfg.training.random_state, 
        stratify=df['label']
    )

    df_test, df_val = train_test_split(
        df_temp, 
        test_size=cfg.training.val_size, 
        random_state=cfg.training.random_state, 
        stratify=df_temp['label']
    )

    X_train_vec = tokenizer.texts_to_sequences(df_train['span'])
    X_test_vec = tokenizer.texts_to_sequences(df_test['span'])
    X_val_vec = tokenizer.texts_to_sequences(df_val['span'])

    max_len_text = 0
    for i in df['span']:
        max_len_text = max(max_len_text, len(i.split(' ')))
    print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ —Ç–µ–∫—Å—Ç–∞: {max_len_text}")


    X_train_pad = pad_sequences(X_train_vec, max_len_text, padding='post', truncating='post')
    X_test_pad = pad_sequences(X_test_vec, max_len_text, padding='post', truncating='post')
    X_val_pad = pad_sequences(X_val_vec, max_len_text, padding='post', truncating='post')

    # =====================================================================================================================================
    #                                              –ü–û–î–ì–û–¢–û–í–ö–ê –¢–ê–†–ì–ï–¢–û–í
    # =====================================================================================================================================

    encoder = LabelEncoder()

    y_train = encoder.fit_transform(df_train['label'])
    y_test = encoder.transform(df_test['label'])
    y_val = encoder.transform(df_val['label'])

    # =====================================================================================================================================
    #                                              –°–û–ó–î–ê–ù–ò–ï –ú–û–î–ï–õ–ò
    # =====================================================================================================================================

    model = Sequential()

    model.add(Embedding(
        input_dim=cfg.tokenizer.num_words, 
        output_dim=cfg.model.embedding_dim, 
        input_length=max_len_text
    ))

    model.add(Bidirectional(LSTM(
        cfg.model.lstm_units, 
        dropout=0.2, 
        recurrent_dropout=0.3
    )))

    model.add(Dropout(cfg.model.dropout_rate))

    model.add(Dense(cfg.model.dense_units, activation='softmax'))

    model.compile(
        optimizer='adam', 
        loss='sparse_categorical_crossentropy', 
        metrics=['accuracy']
    )

    model.fit(
        X_train_pad, 
        y_train, 
        batch_size=cfg.training.batch_size, 
        epochs=cfg.training.epochs, 
        validation_data=(X_val_pad, y_val)
    )

    test_loss, test_accuracy = model.evaluate(X_test_pad, y_test)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test Loss: {test_loss:.4f}")

    model.summary()

    # =====================================================================================================================================
    #                                         –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ò
    # =====================================================================================================================================

    pred_proba = model.predict(X_test_pad)
    pred_class = np.argmax(pred_proba, axis=1)
    
    f1 = f1_score(pred_class, y_test, average='weighted')
    accuracy = accuracy_score(pred_class, y_test)

    print(f'f1-score —É –º–æ–¥–µ–ª–∏ —Ä–∞–≤–µ–Ω {f1:.4f}')
    print(f'accuracy —É –º–æ–¥–µ–ª–∏ —Ä–∞–≤–µ–Ω {accuracy:.4f}')

    mlflow.log_metric('f1_score', f1)
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('test_accuracy', test_accuracy)
    mlflow.log_metric('test_loss', test_loss)

    model.save("LSTM_ver_1.keras")

    with open(cfg.model.artifacts.tokenizer, 'wb') as f:
        cloudpickle.dump(tokenizer, f)
    
    with open(cfg.model.artifacts.encoder, 'wb') as f:
        cloudpickle.dump(encoder, f)

    mlflow.log_artifact(cfg.model.artifacts.model, 'models')
    mlflow.log_artifact(cfg.model.artifacts.tokenizer, 'models') 
    mlflow.log_artifact(cfg.model.artifacts.encoder, 'models')

    mlflow.log_params({
        'num_words': cfg.tokenizer.num_words,
        'embedding_dim': cfg.model.embedding_dim,
        'lstm_units': cfg.model.lstm_units,
        'dropout_rate': cfg.model.dropout_rate,
        'batch_size': cfg.training.batch_size,
        'epochs': cfg.training.epochs
    })

    print("–ù–µ–π—Ä–æ–Ω–Ω–∞—è —Å–µ—Ç—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!")

‚úÖ –ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: 8690112afeac4c7faf0867c5d30d2dd9


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  5.66it/s]


‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –Ω–∞–ø—Ä—è–º—É—é


Unnamed: 0,span,label
0,–≤–∫—É—Å —Ä–µ–∞–ª—å–Ω–æ –æ—Ç–ª–∏—á–Ω—ã–π,–í–ö–£–°_POSITIVE
1,—è –µ—Å—Ç—å —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ –≥–æ–¥ 2 –Ω–∞–∑–∞–¥ —Å–æ–≤–µ—Ç–æ–≤–∞—Ç...,O
2,—Ö–æ—Ç–µ—Ç—å—Å—è –∫—É–ø–∏—Ç—å –æ—á–µ–Ω—å –ø–æ—Å–ª–µ–¥–Ω–∏–π –≤—Ä–µ–º—è —Å—Ç–∞—Ç—å –æ—á...,O
3,–≤–∫—É—Å –∏–º–µ—Ç—å –∫–∞–∂–¥—ã–π 2 –ø–∞—á–∫–∞,O
4,–≤–∫—É—Å –∫–æ—Ç–æ—Ä—ã–π —Å—Ç–∞—Ç—å –Ω–∞–º–Ω–æ–≥–æ –æ—Å—Ç—Ä—ã–π –æ–±—ã—á–Ω—ã–π,–í–ö–£–°_NEGATIVE
...,...,...
2503,—è—Ä–∫–æ –∫—Ä–∞—Å–∏–≤–æ —Ñ–∏–æ–ª–µ—Ç–æ–≤—ã–π —Ü–≤–µ—Ç –ª—é–±–∏—Ç—å –Ω–æ–≤–æ–≥–æ–¥–Ω–∏–π...,–ü–ê–ß–ö–ê_POSITIVE
2504,—á–∏–ø—Å—ã –¥–æ–≤–æ–ª—å–Ω–æ –∂–∏—Ä–Ω—ã–π –Ω–µ –∫—Ä–∏—Ç–∏—á–Ω–æ –∑–∞–º–µ—Ç–Ω–æ,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2505,—Å–∞–º —á–∏–ø—Å—ã –Ω–µ –∏–¥–µ–∞–ª—å–Ω–æ –∫—Ä—É–≥–ª—ã–π,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2506,–∫—Ä–∞—Å–∏–≤—ã–π –∑–æ–ª–æ—Ç–∏—Å—Ç—ã–π –Ω–µ—Ç –ø—Ä–∏–≥–æ—Ä–µ–ª—ã–π –ø–æ–ª–æ–º–∞—Ç—å –º–∏...,–¢–ï–ö–°–¢–£–†–ê_POSITIVE


–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ —Ç–µ–∫—Å—Ç–∞: 73




Epoch 1/20
[1m126/126[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m32s[0m 144ms/step - accuracy: 0.1924 - loss: 2.1970 - val_accuracy: 0.2032 - val_loss: 2.1249
Epoch 2/20
[1m126/126[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m17s[0m 116ms/step - accuracy: 0.2832 - loss: 1.9663 - val_accuracy: 0.3187 - val_loss: 1.8309
Epoch 3/20
[1m126/126[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m16s[0m 126ms/step - accuracy: 0.4526 - loss: 1.5831 - val_accuracy: 0.4183 - val_loss: 1.6108
Epoch 4/20
[1m126/126[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m17s[0m 132ms/step - accuracy: 0.5718 - loss: 1.2474 - val_accuracy: 0.4542 - val_loss: 1.5475
Epoch 5/20
[1m126/126[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m16s[0m 130ms/step - accuracy: 0.6849 - loss: 0.9882 - val_accuracy: 0

<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
'


[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.4701 - loss: 2.8788
Test Accuracy: 0.4701
Test Loss: 2.8788


[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m4s[0m 270ms/step
f1-score —É –º–æ–¥–µ–ª–∏ —Ä–∞–≤–µ–Ω 0.4730
accuracy —É –º–æ–¥–µ–ª–∏ —Ä–∞–≤–µ–Ω 0.4701
‚úÖ –ù–µ–π—Ä–æ–Ω–Ω–∞—è —Å–µ—Ç—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!
üèÉ View run first_experiment_neural_network at: http://127.0.0.1:8080/#/experiments/0/runs/bd1c61c3551940db8aa3011106254c09
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


#### –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤

In [None]:
cfg_inference = load_config("inference_neural_network")

client = MlflowClient()

# ===========================================================================================
# –ü–û–õ–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

latest_run_model = client.search_runs(
    experiment_ids=[cfg_inference.mlflow.experiment_id],
    filter_string=f'tags.mlflow.runName = "{cfg_inference.model.run_name}"',
    order_by=['attributes.end_time desc']
)

if not latest_run_model:
    raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg_inference.model.run_name}")

latest_run_model_id = latest_run_model[0].info.run_id
print(f"Run ID –º–æ–¥–µ–ª–∏: {latest_run_model_id}")

model_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.model_file}"
tokenizer_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.tokenizer}"
encoder_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.encoder}"

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...")
try:
    model_file = client.download_artifacts(latest_run_model_id, model_path)
    tokenizer_file = client.download_artifacts(latest_run_model_id, tokenizer_path)
    encoder_file = client.download_artifacts(latest_run_model_id, encoder_path)
    print("–ú–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")
except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –º–æ–¥–µ–ª–µ–π: {e}")
    raise

# ===========================================================================================
# –ó–ê–ì–†–£–ó–ö–ê –ò –ò–°–ü–û–õ–¨–ó–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...")
model_keras = tf.keras.models.load_model(model_file)
encoder = joblib.load(encoder_file)
tokenizer = joblib.load(tokenizer_file)

print("–í—Å–µ –º–æ–¥–µ–ª–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –ø–∞–º—è—Ç—å")

# ===========================================================================================
# –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ï
# ===========================================================================================

text = cfg_inference.test_text

print("–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...")

tokenized_text = tokenizer.texts_to_sequences([text])

max_len = model_keras.input_shape[1]
print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏: {max_len}")

padded_text = pad_sequences(tokenized_text, maxlen=max_len, padding='post', truncating='post')

pred = model_keras.predict(padded_text)

predicted_class_ind = np.argmax(pred, axis=1)
predicted_class = encoder.inverse_transform(predicted_class_ind)

print('=' * 100)
print(f'–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "{text}"')
print(f'–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {predicted_class[0]}')
print(f'–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏ –ø–æ –∫–ª–∞—Å—Å–∞–º:')
for i, prob in enumerate(pred[0]):
    class_name = encoder.inverse_transform([i])[0]
    print(f'  {class_name}: {prob:.4f}')
print('=' * 100)

Run ID –º–æ–¥–µ–ª–∏: bd1c61c3551940db8aa3011106254c09
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.19it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  5.73it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 11.59it/s]


‚úÖ –ú–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...
‚úÖ –í—Å–µ –º–æ–¥–µ–ª–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –ø–∞–º—è—Ç—å
–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...
–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏: 73
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2s[0m 2s/step
–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "–í —Ü–µ–ª–æ–º, —á–∏–ø—Å—ã –ê—à–∞–Ω –ö—Ä–∞—Å–Ω–∞—è –ø—Ç–∏—Ü–∞ –ë–∞—Ä–±–µ–∫—é –≤–ø–æ–ª–Ω–µ —Å—ä–µ–¥–æ–±–Ω—ã–µ"
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: –í–ö–£–°_NEUTRAL
–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏ –ø–æ –∫–ª–∞—Å—Å–∞–º:
  O: 0.0332
  –í–ö–£–°_NEGATIVE: 0.0001
  –í–ö–£–°_NEUTRAL: 0.9660
  –í–ö–£–°_POSITIVE: 0.0002
  –ü–ê–ß–ö–ê_NEGATIVE: 0.0001
  –ü–ê–ß–ö–ê_NEUTRAL: 0.0002
  –ü–ê–ß–ö–ê_POSITIVE: 0.0000
  –¢–ï–ö–°–¢–£–†–ê_NEGATIVE: 0.0000
  –¢–ï–ö–°–¢–£–†–ê_NEUTRAL: 0.0001
  –¢–ï–ö–°–¢–£–†–ê_POSITIVE: 0.0002


### –í—Ç–æ—Ä–æ–π –¥–∞—Ç–∞—Å–µ—Ç

In [None]:

cfg = load_config("neural_network_second")

with mlflow.start_run(run_name='second_experiment_neural_network'):
    
    mlflow.set_tag('CNN', cfg.model.version)

    # =====================================================================================================================================
    #                                         –°–ß–ò–¢–´–í–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê
    # =====================================================================================================================================

    client = MlflowClient()

    dataset_runs = client.search_runs(
        experiment_ids=[cfg.mlflow.experiment_id],
        filter_string=f"tags.mlflow.runName = '{cfg.data.source_run}'",
        order_by=['attributes.end_time desc']
    )

    if not dataset_runs:
        raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg.data.source_run}")

    dataset_run = dataset_runs[0]
    dataset_run_id = dataset_run.info.run_id
    print(f"‚úÖ –ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: {dataset_run_id}")

    try:
        dataset_path = f"{cfg.data.dataset_path}/{cfg.data.dataset_file}"
        art = client.download_artifacts(dataset_run_id, dataset_path)
        df = pd.read_csv(art)
        print(f"–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: {dataset_path}")
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {e}")
        alternative_paths = [
            cfg.data.dataset_file,
            f"artifacts/{cfg.data.dataset_path}/{cfg.data.dataset_file}",
            "Second_version.csv"
        ]
        
        for path in alternative_paths:
            try:
                art = client.download_artifacts(dataset_run_id, path)
                df = pd.read_csv(art)
                print(f"‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: {path}")
                break
            except:
                continue
        else:
            raise ValueError("–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –¥–∞—Ç–∞—Å–µ—Ç")

    display(df)

    # =====================================================================================================================================
    #                                         –¢–û–ö–ï–ù–ò–ó–ê–¶–ò–Ø –¢–ï–ö–°–¢–ê
    # =====================================================================================================================================

    mlflow.tensorflow.autolog()

    tokenizer_config = {k: v for k, v in cfg.tokenizer.items() if v is not None}
    tokenizer = Tokenizer(**tokenizer_config)
    
    tokenizer.fit_on_texts(df['span'])

    df_train, df_temp = train_test_split(
        df, 
        test_size=cfg.training.test_size, 
        random_state=cfg.training.random_state
    )
    
    df_test, df_val = train_test_split(
        df_temp, 
        test_size=cfg.training.val_size, 
        random_state=cfg.training.random_state
    )

    X_train_vec = tokenizer.texts_to_sequences(df_train['span'])
    X_test_vec = tokenizer.texts_to_sequences(df_test['span'])
    X_val_vec = tokenizer.texts_to_sequences(df_val['span'])

    X_train_pad = pad_sequences(X_train_vec, maxlen=cfg.training.max_sequence_length, padding='post', truncating='post')
    X_test_pad = pad_sequences(X_test_vec, maxlen=cfg.training.max_sequence_length, padding='post', truncating='post')
    X_val_pad = pad_sequences(X_val_vec, maxlen=cfg.training.max_sequence_length, padding='post', truncating='post')

    # =====================================================================================================================================
    #                                         –ü–û–î–ì–û–¢–û–í–ö–ê –¢–ê–†–ì–ï–¢–û–í
    # =====================================================================================================================================

    encoder = LabelEncoder()

    y_train = encoder.fit_transform(df_train['label'])
    y_test = encoder.transform(df_test['label'])
    y_val = encoder.transform(df_val['label'])

    # =====================================================================================================================================
    #                                         –°–û–ó–î–ê–ù–ò–ï –ú–û–î–ï–õ–ò CNN
    # =====================================================================================================================================

    model = Sequential()

    model.add(Embedding(
        input_dim=cfg.tokenizer.num_words, 
        output_dim=cfg.model.embedding_dim, 
        input_length=cfg.training.max_sequence_length
    ))

    model.add(Conv1D(
        filters=cfg.model.conv_filters,
        kernel_size=cfg.model.conv_kernel_size,
        activation=cfg.model.conv_activation
    ))

    model.add(Dropout(cfg.model.dropout_rate_1))

    model.add(GlobalMaxPooling1D())

    model.add(Dense(cfg.model.dense_units_1, activation=cfg.model.dense_activation_1))

    model.add(Dropout(cfg.model.dropout_rate_2))

    model.add(Dense(cfg.model.dense_units_2, activation=cfg.model.dense_activation_2))

    model.compile(
        loss="sparse_categorical_crossentropy", 
        optimizer='adam', 
        metrics=['accuracy']
    )

    model.fit(
        X_train_pad, 
        y_train, 
        batch_size=cfg.training.batch_size, 
        epochs=cfg.training.epochs, 
        validation_data=(X_val_pad, y_val)
    )

    model.summary()

    test_loss, test_accuracy = model.evaluate(X_test_pad, y_test)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test Loss: {test_loss:.4f}")

    # =====================================================================================================================================
    #                                         –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ò
    # =====================================================================================================================================

    pred = model.predict(X_test_pad)
    pred_class = np.argmax(pred, axis=1)

    f1 = f1_score(pred_class, y_test, average='weighted')
    accuracy = accuracy_score(pred_class, y_test)

    print(f'f1-score —É –º–æ–¥–µ–ª–∏ —Ä–∞–≤–µ–Ω {f1:.4f}')
    print(f'accuracy —É –º–æ–¥–µ–ª–∏ —Ä–∞–≤–µ–Ω {accuracy:.4f}')

    mlflow.log_metric('f1_score', f1)
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('test_accuracy', test_accuracy)
    mlflow.log_metric('test_loss', test_loss)

    model.save(cfg.model.artifacts.model)

    joblib.dump(tokenizer, cfg.model.artifacts.tokenizer)
    joblib.dump(encoder, cfg.model.artifacts.encoder)

    mlflow.log_artifact(cfg.model.artifacts.model, 'models')
    mlflow.log_artifact(cfg.model.artifacts.tokenizer, 'models')
    mlflow.log_artifact(cfg.model.artifacts.encoder, 'models')

    mlflow.log_params({
        'num_words': cfg.tokenizer.num_words,
        'embedding_dim': cfg.model.embedding_dim,
        'conv_filters': cfg.model.conv_filters,
        'conv_kernel_size': cfg.model.conv_kernel_size,
        'dropout_rate_1': cfg.model.dropout_rate_1,
        'dropout_rate_2': cfg.model.dropout_rate_2,
        'dense_units_1': cfg.model.dense_units_1,
        'batch_size': cfg.training.batch_size,
        'epochs': cfg.training.epochs,
        'max_sequence_length': cfg.training.max_sequence_length
    })

    print("‚úÖ CNN –º–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!")

‚úÖ –ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: ba68188717234d8ea8f01745d4816ea8


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  4.70it/s]


‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: datasets/second_experiment_dataset.csv


Unnamed: 0,span,label
0,–≤–∫—É—Å –±—ã–ª —Ä–µ–∞–ª—å–Ω–æ –æ—Ç–ª–∏—á–Ω—ã–π,–í–ö–£–°_POSITIVE
1,—è –µ–ª —ç—Ç–∏ —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ –µ—â–µ –≥–æ–¥–∞ 2 –Ω–∞–∑–∞–¥ —Å–æ...,O
2,—Ö–æ—Ç–µ–ª–æ—Å—å –∫—É–ø–∏—Ç—å –æ—á–µ–Ω—å –º–Ω–æ–≥–æ –Ω–æ –≤ –ø–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä–µ...,O
3,—Å–µ–π—á–∞—Å —Ç–∞–∫–æ–π –≤–∫—É—Å –∏–º–µ–µ—Ç –∫–∞–∂–¥–∞—è 2 –ø–∞—á–∫–∞,O
4,–≤–∫—É—Å –∫–æ—Ç–æ—Ä—ã—Ö —Å—Ç–∞–ª –Ω–∞–º–Ω–æ–≥–æ –æ—Å—Ç—Ä–µ–µ –æ–±—ã—á–Ω–æ–≥–æ,–í–ö–£–°_NEGATIVE
...,...,...
2503,—è—Ä–∫–æ –∫—Ä–∞—Å–∏–≤–æ —Ñ–∏–æ–ª–µ—Ç–æ–≤—ã–π —Ü–≤–µ—Ç —è –ª—é–±–ª—é –Ω–æ–≤–æ–≥–æ–¥–Ω–µ...,–ü–ê–ß–ö–ê_POSITIVE
2504,—á–∏–ø—Å—ã –¥–æ–≤–æ–ª—å–Ω–æ –∂–∏—Ä–Ω—ã–µ –Ω–µ –∫—Ä–∏—Ç–∏—á–Ω–æ –Ω–æ –∑–∞–º–µ—Ç–Ω–æ,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2505,—Å–∞–º–∏ —á–∏–ø—Å—ã –Ω–µ –∏–¥–µ–∞–ª—å–Ω–æ –∫—Ä—É–≥–ª—ã–µ,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2506,–∫—Ä–∞—Å–∏–≤—ã–µ –∑–æ–ª–æ—Ç–∏—Å—Ç—ã–µ –Ω–µ—Ç –ø—Ä–∏–≥–æ—Ä–µ–ª—ã—Ö –ø–æ–ª–æ–º–∞–Ω–Ω—ã—Ö ...,–¢–ï–ö–°–¢–£–†–ê_POSITIVE




Epoch 1/40
[1m126/126[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.1939 - loss: 2.2327 - val_accuracy: 0.1753 - val_loss: 2.2185
Epoch 2/40
[1m126/126[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2094 - loss: 2.1531 - val_accuracy: 0.1873 - val_loss: 2.1567
Epoch 3/40
[1m126/126[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.2662 - loss: 2.0085 - val_accuracy: 0.2829 - val_loss: 2.0674
Epoch 4/40
[1m126/126[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.3988 - loss: 1.7389 - val_accuracy: 0.4104 - val_loss: 1.9122
Epoch 5/40
[1m126/126[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5040 - loss: 1.4663 - val_accuracy: 0.4183 - val_lo

<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
'


[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4024 - loss: 3.4085 
Test Accuracy: 0.4024
Test Loss: 3.4085
[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 17ms/step
f1-score —É –º–æ–¥–µ–ª–∏ —Ä–∞–≤–µ–Ω 0.4025
accuracy —É –º–æ–¥–µ–ª–∏ —Ä–∞–≤–µ–Ω 0.4024
‚úÖ CNN –º–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!
üèÉ View run second_experiment_neural_network at: http://127.0.0.1:8080/#/experiments/0/runs/7d097ddcda574eb39cb841127b8e31f3
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


#### –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤

In [None]:
cfg_inference = load_config("inference_neural_network_second")

client = MlflowClient()

# ===========================================================================================
# –ü–û–õ–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

latest_run_model = client.search_runs(
    experiment_ids=[cfg_inference.mlflow.experiment_id],
    filter_string=f'tags.mlflow.runName = "{cfg_inference.model.run_name}"',
    order_by=['attributes.end_time desc']
)

if not latest_run_model:
    raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg_inference.model.run_name}")

latest_run_model_id = latest_run_model[0].info.run_id
print(f"Run ID –º–æ–¥–µ–ª–∏: {latest_run_model_id}")

model_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.model_file}"
tokenizer_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.tokenizer}"
encoder_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.encoder}"

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...")
try:
    model_file = client.download_artifacts(latest_run_model_id, model_path)
    tokenizer_file = client.download_artifacts(latest_run_model_id, tokenizer_path)
    encoder_file = client.download_artifacts(latest_run_model_id, encoder_path)
    print("–ú–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")
except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –º–æ–¥–µ–ª–µ–π: {e}")
    raise

# ===========================================================================================
# –ó–ê–ì–†–£–ó–ö–ê –ò –ò–°–ü–û–õ–¨–ó–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...")
model_keras = tf.keras.models.load_model(model_file)
encoder = joblib.load(encoder_file)
tokenizer = joblib.load(tokenizer_file)

print("–í—Å–µ –º–æ–¥–µ–ª–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –ø–∞–º—è—Ç—å")

# ===========================================================================================
# –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ï
# ===========================================================================================


text = cfg_inference.test_text

print("–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...")


tokenized_text = tokenizer.texts_to_sequences([text])


max_len = model_keras.input_shape[1]
print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏: {max_len}")


padded_text = pad_sequences(tokenized_text, maxlen=max_len, padding='post', truncating='post')


pred = model_keras.predict(padded_text)


predicted_class_ind = np.argmax(pred, axis=1)
predicted_class = encoder.inverse_transform(predicted_class_ind)

print('=' * 100)
print(f'–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "{text}"')
print(f'–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {predicted_class[0]}')
print(f'–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏ –ø–æ –∫–ª–∞—Å—Å–∞–º:')
for i, prob in enumerate(pred[0]):
    class_name = encoder.inverse_transform([i])[0]
    print(f'  {class_name}: {prob:.4f}')
print('=' * 100)

Run ID –º–æ–¥–µ–ª–∏: 7d097ddcda574eb39cb841127b8e31f3
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  3.72it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  9.04it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 40.20it/s]


‚úÖ –ú–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...
‚úÖ –í—Å–µ –º–æ–¥–µ–ª–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –ø–∞–º—è—Ç—å
–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...
–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏: 100
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 133ms/step
–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "–í —Ü–µ–ª–æ–º, —á–∏–ø—Å—ã –ê—à–∞–Ω –ö—Ä–∞—Å–Ω–∞—è –ø—Ç–∏—Ü–∞ –ë–∞—Ä–±–µ–∫—é –≤–ø–æ–ª–Ω–µ —Å—ä–µ–¥–æ–±–Ω—ã–µ"
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: –í–ö–£–°_NEUTRAL
–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏ –ø–æ –∫–ª–∞—Å—Å–∞–º:
  O: 0.0063
  –í–ö–£–°_NEGATIVE: 0.0010
  –í–ö–£–°_NEUTRAL: 0.7726
  –í–ö–£–°_POSITIVE: 0.0001
  –ü–ê–ß–ö–ê_NEGATIVE: 0.0600
  –ü–ê–ß–ö–ê_NEUTRAL: 0.1581
  –ü–ê–ß–ö–ê_POSITIVE: 0.0019
  –¢–ï–ö–°–¢–£–†–ê_NEGATIVE: 0.0000
  –¢–ï–ö–°–¢–£–†–ê_NEUTRAL: 0.0001
  –¢–ï–ö–°–¢–£–†–ê_POSITIVE: 0.0000


### –¢—Ä–µ—Ç–∏–π –¥–∞—Ç–∞—Å–µ—Ç

In [None]:
cfg = load_config("neural_network_third")

with mlflow.start_run(run_name='third_experiment_neural_network'):
    
    mlflow.set_tag('Bidirectional_GRU', cfg.model.version)
    client = MlflowClient()

    # =====================================================================================================================================
    #                                         –°–ß–ò–¢–´–í–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê
    # =====================================================================================================================================

    dataset_runs = client.search_runs(
        experiment_ids=[cfg.mlflow.experiment_id],
        filter_string=f"tags.mlflow.runName = '{cfg.data.source_run}'",
        order_by=['attributes.end_time desc']
    )

    if not dataset_runs:
        raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg.data.source_run}")

    dataset_run = dataset_runs[0]
    dataset_run_id = dataset_run.info.run_id
    print(f"‚úÖ –ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: {dataset_run_id}")

    try:
        dataset_path = f"{cfg.data.dataset_path}/{cfg.data.dataset_file}"
        art = client.download_artifacts(dataset_run_id, dataset_path)
        df = pd.read_csv(art)
        print(f"–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: {dataset_path}")
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {e}")
        alternative_paths = [
            cfg.data.dataset_file,
            f"artifacts/{cfg.data.dataset_path}/{cfg.data.dataset_file}",
            "third_experiment_dataset.csv"
        ]
        
        for path in alternative_paths:
            try:
                art = client.download_artifacts(dataset_run_id, path)
                df = pd.read_csv(art)
                print(f"‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: {path}")
                break
            except:
                continue
        else:
            raise ValueError("–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –¥–∞—Ç–∞—Å–µ—Ç")

    display(df.head())

    # =====================================================================================================================================
    #                                         –¢–û–ö–ï–ù–ò–ó–ê–¶–ò–Ø –¢–ï–ö–°–¢–ê
    # =====================================================================================================================================

    mlflow.tensorflow.autolog()
    mlflow.sklearn.autolog()

    # –°–æ–∑–¥–∞–µ–º —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä —Å –ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏ –∏–∑ –∫–æ–Ω—Ñ–∏–≥–∞
    tokenizer_config = {k: v for k, v in cfg.tokenizer.items() if v is not None}
    tokenizer = Tokenizer(**tokenizer_config)
    
    tokenizer.fit_on_texts(df['span'])

    sequences = tokenizer.texts_to_sequences(df['span'])

    # –ê–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–æ–µ –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–π –¥–ª–∏–Ω—ã
    max_len = max(len(seq) for seq in sequences)
    print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏: {max_len}")

    X = pad_sequences(sequences, maxlen=max_len)

    # =====================================================================================================================================
    #                                         –ü–†–ï–û–ë–†–ê–ó–û–í–ê–ù–ò–ï –¢–ê–†–ì–ï–¢–ê
    # =====================================================================================================================================

    encoder = LabelEncoder()
    y = encoder.fit_transform(df['label'])
    num_classes = len(encoder.classes_)
    
    # =====================================================================================================================================
    #                                         –†–ê–ó–î–ï–õ–ï–ù–ò–ï –ù–ê TRAIN/TEST/VAL
    # =====================================================================================================================================

    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, 
        train_size=1-cfg.training.test_size, 
        random_state=cfg.training.random_state, 
        stratify=y
    )

    X_test, X_val, y_test, y_val = train_test_split(
        X_temp, y_temp, 
        test_size=cfg.training.val_size, 
        random_state=cfg.training.random_state, 
        stratify=y_temp
    )

    print(f"üìä –†–∞–∑–º–µ—Ä—ã –¥–∞–Ω–Ω—ã—Ö:")
    print(f"   Train: {X_train.shape}, {y_train.shape}")
    print(f"   Test: {X_test.shape}, {y_test.shape}")
    print(f"   Val: {X_val.shape}, {y_val.shape}")

    # =====================================================================================================================================
    #                                         –°–û–ó–î–ê–ù–ò–ï –ò –û–ë–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ò
    # =====================================================================================================================================

    model = Sequential()

    model.add(Embedding(
        input_dim=cfg.tokenizer.num_words, 
        output_dim=cfg.model.embedding_dim
    ))

    model.add(Bidirectional(GRU(cfg.model.gru_units)))

    model.add(Dense(cfg.model.dense_units, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(
        loss="sparse_categorical_crossentropy", 
        optimizer='adam', 
        metrics=['accuracy']
    )

    model.fit(
        X_train, y_train, 
        epochs=cfg.training.epochs, 
        batch_size=cfg.training.batch_size, 
        validation_data=(X_val, y_val)
    )

    # =====================================================================================================================================
    #                                         –û–¶–ï–ù–ö–ê –ú–û–î–ï–õ–ò
    # =====================================================================================================================================

    model.summary()

    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Test Loss: {loss:.4f}')
    print(f'Test Accuracy: {accuracy:.4f}')

    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)

    f1 = f1_score(y_pred, y_test, average='weighted')
    acc_score = accuracy_score(y_pred, y_test)

    print(f'F1-score: {f1:.4f}')
    print(f'Accuracy: {acc_score:.4f}')

    # =====================================================================================================================================
    #                                         –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ú–ï–¢–†–ò–ö
    # =====================================================================================================================================

    mlflow.log_metric('f1_score', f1)
    mlflow.log_metric('accuracy', acc_score)
    mlflow.log_metric('test_loss', loss)
    mlflow.log_metric('test_accuracy', accuracy)

    # =====================================================================================================================================
    #                                         –°–û–•–†–ê–ù–ï–ù–ò–ï –ê–†–¢–ï–§–ê–ö–¢–û–í
    # =====================================================================================================================================

    model.save(cfg.model.artifacts.model)
    mlflow.log_artifact(cfg.model.artifacts.model, 'models')

    with open(cfg.model.artifacts.tokenizer, 'wb') as f:
        pickle.dump(tokenizer, f)
    mlflow.log_artifact(cfg.model.artifacts.tokenizer, 'preprocessing')

    with open(cfg.model.artifacts.encoder, 'wb') as f:
        pickle.dump(encoder, f)
    mlflow.log_artifact(cfg.model.artifacts.encoder, 'preprocessing')

    preprocessing_info = {
        'vocab_size': len(tokenizer.word_index),
        'max_sequence_length': max_len,
        'num_classes': num_classes,
        'classes': list(encoder.classes_)
    }
    
    with open(cfg.model.artifacts.preprocessing_info, 'wb') as f:
        pickle.dump(preprocessing_info, f)
    mlflow.log_artifact(cfg.model.artifacts.preprocessing_info, 'preprocessing')

    # =====================================================================================================================================
    #                                         –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ü–ê–†–ê–ú–ï–¢–†–û–í
    # =====================================================================================================================================

    mlflow.log_params({
        "model_type": "Bidirectional_GRU",
        "embedding_dim": cfg.model.embedding_dim,
        "gru_units": cfg.model.gru_units,
        "dense_units": cfg.model.dense_units,
        "num_classes": num_classes,
        "vocab_size": len(tokenizer.word_index),
        "max_sequence_length": max_len,
        "batch_size": cfg.training.batch_size,
        "epochs": cfg.training.epochs
    })

    print("Bidirectional GRU –º–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!")

‚úÖ –ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: 327f7f34ba914fce83ca6c0e787e8f15


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 10.13it/s]

‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: datasets/third_experiment_dataset.csv





Unnamed: 0,span,label
0,–≤–∫—É—Å –±—ã–ª —Ä–µ–∞–ª—å–Ω–æ –æ—Ç–ª–∏—á–Ω—ã–π,–í–ö–£–°_POSITIVE
1,"–Ø –µ–ª —ç—Ç–∏ —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ, –µ—â–µ –≥–æ–¥–∞ 2 –Ω–∞–∑–∞–¥ —Å...",O
2,"—Ö–æ—Ç–µ–ª–æ—Å—å –∫—É–ø–∏—Ç—å –æ—á–µ–Ω—å –º–Ω–æ–≥–æ, –Ω–æ –≤ –ø–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä...",O
3,—Å–µ–π—á–∞—Å —Ç–∞–∫–æ–π –≤–∫—É—Å –∏–º–µ–µ—Ç –∫–∞–∂–¥–∞—è 2 –ø–∞—á–∫–∞,O
4,–≤–∫—É—Å –∫–æ—Ç–æ—Ä—ã—Ö —Å—Ç–∞–ª –ù–ê–ú–ù–û–ì–û –æ—Å—Ç—Ä–µ–µ –æ–±—ã—á–Ω–æ–≥–æ,–í–ö–£–°_NEGATIVE




üìè –ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏: 85
üìä –†–∞–∑–º–µ—Ä—ã –¥–∞–Ω–Ω—ã—Ö:
   Train: (3216, 85), (3216,)
   Test: (402, 85), (402,)
   Val: (402, 85), (402,)
Epoch 1/10
[1m101/101[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m22s[0m 158ms/step - accuracy: 0.4667 - loss: 1.5992 - val_accuracy: 0.4577 - val_loss: 2.0455
Epoch 2/10
[1m101/101[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m15s[0m 151ms/step - accuracy: 0.7848 - loss: 0.6825 - val_accuracy: 0.5746 - val_loss: 1.6586
Epoch 3/10
[1m101/101[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m20s[0m 146ms/step - accuracy: 0.8859 - loss: 0.4023 - val_accuracy: 0.6791 - val_loss: 1.1612
Epoch 4/10
[1m101/101[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m16s[0m 159ms/step - accuracy: 0.9288 - loss: 0.2556 - val_accuracy: 0.6318 

<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
'


[1m13/13[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.7189 - loss: 1.4945
üìä Test Loss: 1.4945
üìä Test Accuracy: 0.7189
[1m13/13[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2s[0m 75ms/step
üéØ F1-score: 0.7280
üéØ Accuracy: 0.7189
‚úÖ Bidirectional GRU –º–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞ –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞!
üèÉ View run third_experiment_neural_network at: http://127.0.0.1:8080/#/experiments/0/runs/8d1d38be345646f5bccb64306e83cff6
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


#### –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤

In [None]:
cfg_inference = load_config("inference_neural_network_third")

client = MlflowClient()

# ===========================================================================================
# –ü–û–õ–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

latest_run_model = client.search_runs(
    experiment_ids=[cfg_inference.mlflow.experiment_id],
    filter_string=f'attributes.run_name = "{cfg_inference.model.run_name}"',
    order_by=['attributes.end_time desc']
)

if not latest_run_model:
    raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg_inference.model.run_name}")

latest_run_model_id = latest_run_model[0].info.run_id
print(f"Run ID –º–æ–¥–µ–ª–∏: {latest_run_model_id}")

model_path = f"{cfg_inference.model.artifacts_path}/{cfg_inference.model.model_file}"
tokenizer_path = f"preprocessing/{cfg_inference.model.tokenizer}"
encoder_path = f"preprocessing/{cfg_inference.model.encoder}"
info_path = f"preprocessing/{cfg_inference.model.preprocessing_info}"

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...")
try:
    model_file = client.download_artifacts(latest_run_model_id, model_path)
    tokenizer_file = client.download_artifacts(latest_run_model_id, tokenizer_path)
    encoder_file = client.download_artifacts(latest_run_model_id, encoder_path)
    info_file = client.download_artifacts(latest_run_model_id, info_path)
    print("–í—Å–µ –º–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")
except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –º–æ–¥–µ–ª–µ–π: {e}")
    raise

# ===========================================================================================
# –ó–ê–ì–†–£–ó–ö–ê –ò –ò–°–ü–û–õ–¨–ó–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...")
model_keras = tf.keras.models.load_model(model_file)

with open(tokenizer_file, 'rb') as f:
    tokenizer = pickle.load(f)

with open(encoder_file, 'rb') as f:
    encoder = pickle.load(f)

with open(info_file, 'rb') as f:
    preprocessing_info = pickle.load(f)

print("–í—Å–µ –º–æ–¥–µ–ª–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –ø–∞–º—è—Ç—å")
print(f"–ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–µ: {preprocessing_info}")

# ===========================================================================================
# –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ï
# ===========================================================================================

text = cfg_inference.test_text

print("–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...")

tokenized_text = tokenizer.texts_to_sequences([text])
print(f"–¢–æ–∫–µ–Ω–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç: {tokenized_text}")

max_len = preprocessing_info['max_sequence_length']
print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏: {max_len}")

padded_text = pad_sequences(tokenized_text, maxlen=max_len, padding='post', truncating='post')
print(f"–¢–µ–∫—Å—Ç –ø–æ—Å–ª–µ –ø–∞–¥–¥–∏–Ω–≥–∞: {padded_text}")

pred = model_keras.predict(padded_text)
print(f"–°—ã—Ä—ã–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è: {pred}")

predicted_class_ind = np.argmax(pred, axis=1)
predicted_class = encoder.inverse_transform(predicted_class_ind)
confidence = np.max(pred, axis=1)[0]

print('=' * 100)
print(f'–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "{text}"')
print(f'–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {predicted_class[0]}')
print(f'–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å: {confidence:.4f}')
print(f'–í—Å–µ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏:')
for i, prob in enumerate(pred[0]):
    class_name = encoder.inverse_transform([i])[0]
    print(f'  {class_name}: {prob:.4f}')
print('=' * 100)

Run ID –º–æ–¥–µ–ª–∏: 8d1d38be345646f5bccb64306e83cff6
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.29s/it]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  9.73it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 57.71it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 46.18it/s]


‚úÖ –í—Å–µ –º–æ–¥–µ–ª–∏ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...
‚úÖ –í—Å–µ –º–æ–¥–µ–ª–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –ø–∞–º—è—Ç—å
üìä –ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–µ: {'vocab_size': 5376, 'max_sequence_length': 85, 'num_classes': 10, 'classes': ['O', '–í–ö–£–°_NEGATIVE', '–í–ö–£–°_NEUTRAL', '–í–ö–£–°_POSITIVE', '–ü–ê–ß–ö–ê_NEGATIVE', '–ü–ê–ß–ö–ê_NEUTRAL', '–ü–ê–ß–ö–ê_POSITIVE', '–¢–ï–ö–°–¢–£–†–ê_NEGATIVE', '–¢–ï–ö–°–¢–£–†–ê_NEUTRAL', '–¢–ï–ö–°–¢–£–†–ê_POSITIVE']}
–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...
–¢–æ–∫–µ–Ω–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç: [[43, 46, 1532, 5, 2, 1]]
–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏: 85
–¢–µ–∫—Å—Ç –ø–æ—Å–ª–µ –ø–∞–¥–¥–∏–Ω–≥–∞: [[  43   46 1532    5    2    1    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0  

## –¢—Ä–µ—Ç–∏–π —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç (–¢—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä—ã)

### –ü–µ—Ä–≤—ã–π –¥–∞—Ç–∞—Å–µ—Ç

In [None]:
cfg = load_config("transformers_first")

with mlflow.start_run(run_name='transformers_experiment_1'):
    
    mlflow.set_tag('Transformers', cfg.model.version)
    client = MlflowClient()

    # =====================================================================================================================================
    #                                         –°–ß–ò–¢–´–í–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê
    # =====================================================================================================================================
    

    dataset_runs = client.search_runs(
        experiment_ids=[cfg.mlflow.experiment_id],
        filter_string=f"tags.mlflow.runName = '{cfg.data.source_run}'",
        order_by=['attributes.end_time desc']
    )

    if not dataset_runs:
        raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg.data.source_run}")

    dataset_run = dataset_runs[0]
    dataset_run_id = dataset_run.info.run_id
    print(f"–ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: {dataset_run_id}")


    try:
        dataset_path = f"{cfg.data.dataset_path}/{cfg.data.dataset_file}"
        art_loc = client.download_artifacts(dataset_run_id, dataset_path)
        df = pd.read_csv(art_loc)
        print(f"–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: {dataset_path}")
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {e}")

        alternative_paths = [
            cfg.data.dataset_file,
            f"artifacts/{cfg.data.dataset_path}/{cfg.data.dataset_file}",
            "First_version.csv"
        ]
        
        for path in alternative_paths:
            try:
                art_loc = client.download_artifacts(dataset_run_id, path)
                df = pd.read_csv(art_loc)
                print(f"–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: {path}")
                break
            except:
                continue
        else:
            raise ValueError("–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –¥–∞—Ç–∞—Å–µ—Ç")


    df = df[[cfg.data.text_column, cfg.data.label_column]]
    df = df.rename(columns={cfg.data.label_column: "labels"})
    display(df)

    # =====================================================================================================================================
    #                                         –ü–û–î–ì–û–¢–û–í–ö–ê –î–ê–¢–ê–°–ï–¢–ê/–¢–ê–†–ì–ï–¢–û–í
    # =====================================================================================================================================
    
    df_train, df_temp = train_test_split(
        df, 
        test_size=cfg.training.test_size, 
        random_state=cfg.training.random_state, 
        stratify=df['labels']
    )
    
    df_test, df_val = train_test_split(
        df_temp, 
        test_size=cfg.training.val_size, 
        random_state=cfg.training.random_state, 
        stratify=df_temp['labels']
    )

    encoder = LabelEncoder()
    y_train = encoder.fit_transform(df_train['labels'])
    y_test = encoder.transform(df_test['labels'])
    y_val = encoder.transform(df_val['labels'])

    # =====================================================================================================================================
    #                                         –ü–û–î–ì–û–¢–û–í–ö–ê –ú–û–î–ï–õ–ò
    # =====================================================================================================================================
    
    mlflow.transformers.autolog()


    tokenizer = AutoTokenizer.from_pretrained(cfg.model.model_name)

    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model.model_name, 
        num_labels=len(encoder.classes_), 
        id2label={i: label for i, label in enumerate(encoder.classes_)},
        label2id={label: i for i, label in enumerate(encoder.classes_)}
    )
    

    dataset_train = Dataset.from_pandas(df_train.assign(labels=y_train))    
    dataset_test = Dataset.from_pandas(df_test.assign(labels=y_test))    
    dataset_val = Dataset.from_pandas(df_val.assign(labels=y_val))


    def tokenize_dataset(row):
        tokenizer_config = {k: v for k, v in cfg.tokenizer.items() if v is not None}
        return tokenizer(row[cfg.data.text_column], **tokenizer_config)

    dataset_tokenized_train = dataset_train.map(tokenize_dataset, batched=False)
    dataset_tokenized_test = dataset_test.map(tokenize_dataset, batched=False)
    dataset_tokenized_val = dataset_val.map(tokenize_dataset, batched=False)


    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        padding=True,
        return_tensors='pt'
    )
    

    training_args = TrainingArguments(
        output_dir=cfg.training.output_dir,
        overwrite_output_dir=True,
        logging_dir='./logs/',
        num_train_epochs=cfg.training.num_train_epochs,
        learning_rate=cfg.training.learning_rate,
        per_device_train_batch_size=cfg.training.per_device_train_batch_size,
        per_device_eval_batch_size=cfg.training.per_device_eval_batch_size,
        eval_strategy=cfg.training.eval_strategy,
        save_strategy=cfg.training.save_strategy,
        warmup_ratio=cfg.training.warmup_ratio,
        lr_scheduler_type=cfg.training.lr_scheduler_type,
        metric_for_best_model=cfg.training.metric_for_best_model,
        weight_decay=cfg.training.weight_decay,
        load_best_model_at_end=cfg.training.load_best_model_at_end,
        save_total_limit=cfg.training.save_total_limit,
        max_grad_norm=cfg.training.max_grad_norm,
        logging_steps=cfg.training.logging_steps
    )


    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        
        return {
            'f1-score': f1_score(labels, predictions, average='weighted'),
            'accuracy': accuracy_score(labels, predictions)
        }


    trainer = Trainer(
        model=model, 
        args=training_args, 
        train_dataset=dataset_tokenized_train,
        eval_dataset=dataset_tokenized_val,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=cfg.training.early_stopping_patience)]
    )


    trainer.train()


    final_metrics = trainer.evaluate(dataset_tokenized_test)
    print(f"–§–∏–Ω–∞–ª—å–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏: {final_metrics}")


    # =====================================================================================================================================
    #                                         –°–û–•–†–ê–ù–ï–ù–ò–ï –ò –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ò
    # =====================================================================================================================================

    final_predictions = trainer.predict(dataset_tokenized_test)
    predictions = np.argmax(final_predictions.predictions, axis=1)
    labels = final_predictions.label_ids

    f1 = f1_score(labels, predictions, average='weighted')
    accuracy = accuracy_score(labels, predictions)
    
    mlflow.log_metric('f1_score', f1)
    mlflow.log_metric('accuracy', accuracy) 
    
    for key, value in final_predictions.metrics.items():
        if key.startswith('eval_'):
            clean_key = key.replace('eval_', '')
            mlflow.log_metric(clean_key, value)

    print(f"Final metrics - F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Loss: {test_loss:.4f}")

    model_dir = cfg.artifacts.model_dir
    tokenizer_dir = cfg.artifacts.tokenizer_dir

    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(tokenizer_dir, exist_ok=True)

    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(tokenizer_dir)

    mlflow.log_artifacts(model_dir, "model")
    mlflow.log_artifacts(tokenizer_dir, "tokenizer")

    with open('label_encoder.pkl', 'wb') as f:
        pickle.dump(encoder, f)
    mlflow.log_artifact('label_encoder.pkl')

    mlflow.log_params({
        'model_name': cfg.model.model_name,
        'num_labels': len(encoder.classes_),
        'num_train_epochs': cfg.training.num_train_epochs,
        'learning_rate': cfg.training.learning_rate,
        'batch_size': cfg.training.per_device_train_batch_size,
        'early_stopping_patience': cfg.training.early_stopping_patience
    })

    print("–¢—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω!")

‚úÖ –ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: 8690112afeac4c7faf0867c5d30d2dd9


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  5.51it/s]


‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: datasets/first_experiment_dataset.csv


Unnamed: 0,span,labels
0,–≤–∫—É—Å —Ä–µ–∞–ª—å–Ω–æ –æ—Ç–ª–∏—á–Ω—ã–π,–í–ö–£–°_POSITIVE
1,—è –µ—Å—Ç—å —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ –≥–æ–¥ 2 –Ω–∞–∑–∞–¥ —Å–æ–≤–µ—Ç–æ–≤–∞—Ç...,O
2,—Ö–æ—Ç–µ—Ç—å—Å—è –∫—É–ø–∏—Ç—å –æ—á–µ–Ω—å –ø–æ—Å–ª–µ–¥–Ω–∏–π –≤—Ä–µ–º—è —Å—Ç–∞—Ç—å –æ—á...,O
3,–≤–∫—É—Å –∏–º–µ—Ç—å –∫–∞–∂–¥—ã–π 2 –ø–∞—á–∫–∞,O
4,–≤–∫—É—Å –∫–æ—Ç–æ—Ä—ã–π —Å—Ç–∞—Ç—å –Ω–∞–º–Ω–æ–≥–æ –æ—Å—Ç—Ä—ã–π –æ–±—ã—á–Ω—ã–π,–í–ö–£–°_NEGATIVE
...,...,...
2503,—è—Ä–∫–æ –∫—Ä–∞—Å–∏–≤–æ —Ñ–∏–æ–ª–µ—Ç–æ–≤—ã–π —Ü–≤–µ—Ç –ª—é–±–∏—Ç—å –Ω–æ–≤–æ–≥–æ–¥–Ω–∏–π...,–ü–ê–ß–ö–ê_POSITIVE
2504,—á–∏–ø—Å—ã –¥–æ–≤–æ–ª—å–Ω–æ –∂–∏—Ä–Ω—ã–π –Ω–µ –∫—Ä–∏—Ç–∏—á–Ω–æ –∑–∞–º–µ—Ç–Ω–æ,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2505,—Å–∞–º —á–∏–ø—Å—ã –Ω–µ –∏–¥–µ–∞–ª—å–Ω–æ –∫—Ä—É–≥–ª—ã–π,–¢–ï–ö–°–¢–£–†–ê_NEUTRAL
2506,–∫—Ä–∞—Å–∏–≤—ã–π –∑–æ–ª–æ—Ç–∏—Å—Ç—ã–π –Ω–µ—Ç –ø—Ä–∏–≥–æ—Ä–µ–ª—ã–π –ø–æ–ª–æ–º–∞—Ç—å –º–∏...,–¢–ï–ö–°–¢–£–†–ê_POSITIVE


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2006/2006 [00:01<00:00, 1886.15 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 251/251 [00:00<00:00, 2243.91 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 251/251 [00:00<00:00, 2711.61 examples/s]


Epoch,Training Loss,Validation Loss,F1-score,Accuracy
1,2.2515,2.160471,0.069781,0.203187
2,2.0129,1.942594,0.267171,0.346614
3,1.85,1.767167,0.336707,0.398406
4,1.6944,1.668675,0.373185,0.438247
5,1.613,1.593321,0.389435,0.454183
6,1.5634,1.545394,0.426005,0.486056
7,1.4329,1.516056,0.445592,0.501992
8,1.3992,1.49122,0.44553,0.501992
9,1.4221,1.48173,0.450813,0.505976
10,1.4021,1.477343,0.453505,0.50996




‚úÖ –§–∏–Ω–∞–ª—å–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏: {'eval_loss': 1.4958806037902832, 'eval_f1-score': 0.44198462104907194, 'eval_accuracy': 0.5059760956175299, 'eval_runtime': 0.416, 'eval_samples_per_second': 603.431, 'eval_steps_per_second': 38.466, 'epoch': 10.0}




‚úÖ Final metrics - F1: 0.4420, Accuracy: 0.5060, Loss: 3.4085
‚úÖ –¢—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω!
üèÉ View run transformers_experiment_1 at: http://127.0.0.1:8080/#/experiments/0/runs/82086ed0872f47c58376be0d105fed8f
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


#### –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞

In [None]:
cfg_inference = load_config("inference_transformers_first")

client = MlflowClient()

# ===========================================================================================
# –ü–û–õ–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================


latest_run_model = client.search_runs(
    experiment_ids=[cfg_inference.mlflow.experiment_id],
    filter_string=f'tags.mlflow.runName = "{cfg_inference.model.run_name}"',
    order_by=['attributes.end_time desc']
)

if not latest_run_model:
    raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg_inference.model.run_name}")

latest_run_model_id = latest_run_model[0].info.run_id
print(f"Run ID –º–æ–¥–µ–ª–∏: {latest_run_model_id}")


print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä...")
try:

    model_dir = client.download_artifacts(latest_run_model_id, cfg_inference.model.artifacts_path)
    tokenizer_dir = client.download_artifacts(latest_run_model_id, cfg_inference.model.tokenizer_path)
    print("‚úÖ –ú–æ–¥–µ–ª—å –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")
except Exception as e:
    print(f"‚ùå –û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –º–æ–¥–µ–ª–µ–π: {e}")
    raise

# ===========================================================================================
# –ó–ê–ì–†–£–ó–ö–ê –ò –ò–°–ü–û–õ–¨–ó–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...")
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

print("–ú–æ–¥–µ–ª—å –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –ø–∞–º—è—Ç—å")

# ===========================================================================================
# –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ï
# ===========================================================================================


text = cfg_inference.test_text

print("–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...")


tokenizer_config = {k: v for k, v in cfg_inference.tokenizer.items() if v is not None}
inputs = tokenizer(text, **tokenizer_config)


with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

predicted_class_idx = predictions.argmax().item()
predicted_prob = predictions.max().item()

id2label = model.config.id2label
predicted_label = id2label[predicted_class_idx]

print('=' * 100)
print(f'–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "{text}"')
print(f'–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {predicted_label}')
print(f'–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å: {predicted_prob:.4f}')
print(f'–í—Å–µ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏:')
for i, prob in enumerate(predictions[0]):
    label = id2label[i]
    print(f'  {label}: {prob:.4f}')
print('=' * 100)

Run ID –º–æ–¥–µ–ª–∏: 82086ed0872f47c58376be0d105fed8f
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä...


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:10<00:00,  5.22s/it]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:07<00:00,  1.78s/it]


‚úÖ –ú–æ–¥–µ–ª—å –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...
‚úÖ –ú–æ–¥–µ–ª—å –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –ø–∞–º—è—Ç—å
–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...
–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "–í —Ü–µ–ª–æ–º, —á–∏–ø—Å—ã –ê—à–∞–Ω –ö—Ä–∞—Å–Ω–∞—è –ø—Ç–∏—Ü–∞ –ë–∞—Ä–±–µ–∫—é –≤–ø–æ–ª–Ω–µ —Å—ä–µ–¥–æ–±–Ω—ã–µ"
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: –í–ö–£–°_POSITIVE
–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å: 0.3161
–í—Å–µ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏:
  O: 0.1456
  –í–ö–£–°_NEGATIVE: 0.1105
  –í–ö–£–°_NEUTRAL: 0.1792
  –í–ö–£–°_POSITIVE: 0.3161
  –ü–ê–ß–ö–ê_NEGATIVE: 0.0258
  –ü–ê–ß–ö–ê_NEUTRAL: 0.0288
  –ü–ê–ß–ö–ê_POSITIVE: 0.0389
  –¢–ï–ö–°–¢–£–†–ê_NEGATIVE: 0.0373
  –¢–ï–ö–°–¢–£–†–ê_NEUTRAL: 0.0438
  –¢–ï–ö–°–¢–£–†–ê_POSITIVE: 0.0740


### –í—Ç–æ—Ä–æ–π –¥–∞—Ç–∞—Å–µ—Ç

In [None]:
cfg = load_config("transformers_second")

with mlflow.start_run(run_name='transformers_experiment_2'):
    
    mlflow.set_tag('Transformers', cfg.model.version)
    client = MlflowClient()

    # =====================================================================================================================================
    #                                         –°–ß–ò–¢–´–í–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê
    # =====================================================================================================================================

    dataset_runs = client.search_runs(
        experiment_ids=[cfg.mlflow.experiment_id],
        filter_string=f"tags.mlflow.runName = '{cfg.data.source_run}'",
        order_by=['attributes.end_time desc']
    )

    if not dataset_runs:
        raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg.data.source_run}")

    dataset_run = dataset_runs[0]
    dataset_run_id = dataset_run.info.run_id
    print(f"–ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: {dataset_run_id}")

    try:
        dataset_path = f"{cfg.data.dataset_path}/{cfg.data.dataset_file}"
        art_loc = client.download_artifacts(dataset_run_id, dataset_path)
        df = pd.read_csv(art_loc)
        print(f"–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: {dataset_path}")
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {e}")

        alternative_paths = [
            cfg.data.dataset_file,
            f"artifacts/{cfg.data.dataset_path}/{cfg.data.dataset_file}",
            "second_experiment_dataset.csv"
        ]
        
        for path in alternative_paths:
            try:
                art_loc = client.download_artifacts(dataset_run_id, path)
                df = pd.read_csv(art_loc)
                print(f"–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: {path}")
                break
            except:
                continue
        else:
            raise ValueError("–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –¥–∞—Ç–∞—Å–µ—Ç")

    display(df.head())

    # =====================================================================================================================================
    #                                         –ü–û–î–ì–û–¢–û–í–ö–ê –î–ê–ù–ù–´–•
    # =====================================================================================================================================

    encoder = LabelEncoder()
    df['label_encoded'] = encoder.fit_transform(df[cfg.data.label_column])
    df = df.rename(columns={'label_encoded': 'labels'})
    df = df[[cfg.data.text_column, 'labels']]

    # =====================================================================================================================================
    #                                         –ü–û–î–ì–û–¢–û–í–ö–ê –ú–û–î–ï–õ–ò
    # =====================================================================================================================================

    tokenizer = AutoTokenizer.from_pretrained(cfg.model.model_name)

    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model.model_name,
        num_labels=len(encoder.classes_),
        id2label={i: label for i, label in enumerate(encoder.classes_)},
        label2id={label: i for i, label in enumerate(encoder.classes_)}
    )
    
    # =====================================================================================================================================
    #                                         –†–ê–ó–î–ï–õ–ï–ù–ò–ï –î–ê–ù–ù–´–•
    # =====================================================================================================================================

    df_train, df_temp = train_test_split(
        df, 
        train_size=1-cfg.training.test_size, 
        random_state=cfg.training.random_state, 
        stratify=df['labels']
    )
    
    df_test, df_val = train_test_split(
        df_temp, 
        test_size=cfg.training.val_size, 
        random_state=cfg.training.random_state, 
        stratify=df_temp['labels']
    )

    dataset_train = Dataset.from_pandas(df_train)
    dataset_test = Dataset.from_pandas(df_test)
    dataset_val = Dataset.from_pandas(df_val)

    # =====================================================================================================================================
    #                                         –¢–û–ö–ï–ù–ò–ó–ê–¶–ò–Ø
    # =====================================================================================================================================

    def tokenize_dataset(batch):
        tokenizer_config = {k: v for k, v in cfg.tokenizer.items() if v is not None}
        return tokenizer(
            batch[cfg.data.text_column],
            **tokenizer_config
        )

    tokenized_train = dataset_train.map(tokenize_dataset, batched=True)
    tokenized_test = dataset_test.map(tokenize_dataset, batched=True)
    tokenized_val = dataset_val.map(tokenize_dataset, batched=True)

    # =====================================================================================================================================
    #                                         DATA COLLATOR
    # =====================================================================================================================================

    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        padding=True,
        max_length=cfg.tokenizer.max_length,
        return_tensors="pt"
    )

    # =====================================================================================================================================
    #                                         TRAINING ARGUMENTS
    # =====================================================================================================================================

    training_args = TrainingArguments(
        output_dir=cfg.training.output_dir,
        num_train_epochs=cfg.training.num_train_epochs,
        learning_rate=cfg.training.learning_rate,
        per_device_train_batch_size=cfg.training.per_device_train_batch_size,
        per_device_eval_batch_size=cfg.training.per_device_eval_batch_size,
        eval_strategy=cfg.training.eval_strategy,
        save_strategy=cfg.training.save_strategy,
        logging_dir=cfg.training.logging_dir,
        logging_steps=cfg.training.logging_steps,
        load_best_model_at_end=cfg.training.load_best_model_at_end,
        metric_for_best_model=cfg.training.metric_for_best_model
    )

    # =====================================================================================================================================
    #                                         –ú–ï–¢–†–ò–ö–ò
    # =====================================================================================================================================

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        y_pred = np.argmax(predictions, axis=1)

        return {
            'f1_score': f1_score(y_pred, labels, average='weighted'),
            'accuracy': accuracy_score(y_pred, labels)
        }
    
    # =====================================================================================================================================
    #                                         TRAINER –ò –û–ë–£–ß–ï–ù–ò–ï
    # =====================================================================================================================================

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()

    # =====================================================================================================================================
    #                                         –û–¶–ï–ù–ö–ê –ò –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ú–ï–¢–†–ò–ö
    # =====================================================================================================================================

    test_predictions = trainer.predict(tokenized_test)
    test_preds = np.argmax(test_predictions.predictions, axis=1)
    test_labels = test_predictions.label_ids

    accuracy = accuracy_score(test_labels, test_preds)
    f1_score_value = f1_score(test_labels, test_preds, average='weighted')

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1_score_value)

    print(f"üéØ –§–∏–Ω–∞–ª—å–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏:")
    print(f"   accuracy: {accuracy:.4f}")
    print(f"   f1_score: {f1_score_value:.4f}")

    # =====================================================================================================================================
    #                                         –°–û–•–†–ê–ù–ï–ù–ò–ï –ê–†–¢–ï–§–ê–ö–¢–û–í
    # =====================================================================================================================================

    model_dir = cfg.artifacts.model_dir
    tokenizer_dir = cfg.artifacts.tokenizer_dir
    
    import shutil
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    if os.path.exists(tokenizer_dir):
        shutil.rmtree(tokenizer_dir)

    trainer.save_model(model_dir)
    tokenizer.save_pretrained(tokenizer_dir)

    with open('label_encoder.pkl', 'wb') as f:
        pickle.dump(encoder, f)

    mlflow.log_artifacts(model_dir, "model")
    mlflow.log_artifacts(tokenizer_dir, "tokenizer")
    mlflow.log_artifact('label_encoder.pkl')

    # =====================================================================================================================================
    #                                         –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ü–ê–†–ê–ú–ï–¢–†–û–í
    # =====================================================================================================================================

    mlflow.log_params({
        'model_name': cfg.model.model_name,
        'num_labels': len(encoder.classes_),
        'num_train_epochs': cfg.training.num_train_epochs,
        'learning_rate': cfg.training.learning_rate,
        'batch_size': cfg.training.per_device_train_batch_size,
        'test_size': cfg.training.test_size,
        'val_size': cfg.training.val_size
    })

    # =====================================================================================================================================
    #                                         –û–ß–ò–°–¢–ö–ê –í–†–ï–ú–ï–ù–ù–´–• –§–ê–ô–õ–û–í
    # =====================================================================================================================================

    shutil.rmtree(model_dir)
    shutil.rmtree(tokenizer_dir)
    os.remove('label_encoder.pkl')

    print("–¢—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω!")

‚úÖ –ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: ba68188717234d8ea8f01745d4816ea8


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  7.30it/s]


‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω: datasets/second_experiment_dataset.csv


Unnamed: 0,span,label
0,–≤–∫—É—Å –±—ã–ª —Ä–µ–∞–ª—å–Ω–æ –æ—Ç–ª–∏—á–Ω—ã–π,–í–ö–£–°_POSITIVE
1,—è –µ–ª —ç—Ç–∏ —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ –µ—â–µ –≥–æ–¥–∞ 2 –Ω–∞–∑–∞–¥ —Å–æ...,O
2,—Ö–æ—Ç–µ–ª–æ—Å—å –∫—É–ø–∏—Ç—å –æ—á–µ–Ω—å –º–Ω–æ–≥–æ –Ω–æ –≤ –ø–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä–µ...,O
3,—Å–µ–π—á–∞—Å —Ç–∞–∫–æ–π –≤–∫—É—Å –∏–º–µ–µ—Ç –∫–∞–∂–¥–∞—è 2 –ø–∞—á–∫–∞,O
4,–≤–∫—É—Å –∫–æ—Ç–æ—Ä—ã—Ö —Å—Ç–∞–ª –Ω–∞–º–Ω–æ–≥–æ –æ—Å—Ç—Ä–µ–µ –æ–±—ã—á–Ω–æ–≥–æ,–í–ö–£–°_NEGATIVE


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2006/2006 [00:00<00:00, 3138.29 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 251/251 [00:00<00:00, 4944.63 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 251/251 [00:00<00:00, 6745.76 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Score,Accuracy
1,2.076,2.079241,0.366077,0.270916
2,1.9141,1.852924,0.46499,0.394422
3,1.721,1.722221,0.48809,0.422311
4,1.5658,1.661844,0.47857,0.418327
5,1.4658,1.59555,0.531171,0.462151
6,1.4693,1.566464,0.516893,0.454183
7,1.4308,1.547074,0.504289,0.446215
8,1.3267,1.541347,0.499597,0.442231
9,1.3114,1.532064,0.498807,0.442231
10,1.3471,1.527224,0.498972,0.446215




üéØ –§–∏–Ω–∞–ª—å–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏:
   accuracy: 0.4422
   f1_score: 0.3732
‚úÖ –¢—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω!
üèÉ View run transformers_experiment_2 at: http://127.0.0.1:8080/#/experiments/0/runs/734f31b8cd0c48ada580eb519ae9b457
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


#### –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤

In [None]:
cfg_inference = load_config("inference_transformers_second")

client = MlflowClient()

# ===========================================================================================
# –ü–û–õ–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

latest_run_model = client.search_runs(
    experiment_ids=[cfg_inference.mlflow.experiment_id],
    filter_string=f'attributes.run_name = "{cfg_inference.model.run_name}"',
    order_by=['attributes.end_time desc']
)

if not latest_run_model:
    raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω run: {cfg_inference.model.run_name}")

latest_run_model_id = latest_run_model[0].info.run_id
print(f"Run ID –º–æ–¥–µ–ª–∏: {latest_run_model_id}")

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å, —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∏ —ç–Ω–∫–æ–¥–µ—Ä...")
try:
    model_dir = client.download_artifacts(latest_run_model_id, 'model')
    tokenizer_dir = client.download_artifacts(latest_run_model_id, 'tokenizer')
    encoder_path = client.download_artifacts(latest_run_model_id, 'label_encoder.pkl')
    print("–ú–æ–¥–µ–ª—å, —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∏ —ç–Ω–∫–æ–¥–µ—Ä —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")
except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {e}")
    raise

# ===========================================================================================
# –ó–ê–ì–†–£–ó–ö–ê –ò –ò–°–ü–û–õ–¨–ó–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ï–ô
# ===========================================================================================

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...")
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

with open(encoder_path, 'rb') as f:
    encoder = pickle.load(f)

print("–ú–æ–¥–µ–ª—å, —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∏ —ç–Ω–∫–æ–¥–µ—Ä –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –ø–∞–º—è—Ç—å")

# ===========================================================================================
# –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ï
# ===========================================================================================

text = cfg_inference.test_text

print("–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...")

tokenizer_config = {k: v for k, v in cfg_inference.tokenizer.items() if v is not None}
inputs = tokenizer(text, **tokenizer_config)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

predicted_class_idx = predictions.argmax().item()
predicted_prob = predictions.max().item()

predicted_label = encoder.inverse_transform([predicted_class_idx])[0]

print('=' * 100)
print(f'–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "{text}"')
print(f'–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {predicted_label}')
print(f'–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å: {predicted_prob:.4f}')
print(f'–í—Å–µ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏:')
for i, prob in enumerate(predictions[0]):
    label = encoder.inverse_transform([i])[0]
    print(f'  {label}: {prob:.4f}')
print('=' * 100)

Run ID –º–æ–¥–µ–ª–∏: 734f31b8cd0c48ada580eb519ae9b457
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å, —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∏ —ç–Ω–∫–æ–¥–µ—Ä...


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:06<00:00,  1.00it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:02<00:00,  1.81it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 43.23it/s]


‚úÖ –ú–æ–¥–µ–ª—å, —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∏ —ç–Ω–∫–æ–¥–µ—Ä —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...
‚úÖ –ú–æ–¥–µ–ª—å, —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∏ —ç–Ω–∫–æ–¥–µ—Ä –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –ø–∞–º—è—Ç—å
–í—ã–ø–æ–ª–Ω—è–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ...
–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: "–ü—Ä–∏–º–µ—Ä —Ç–µ–∫—Å—Ç–∞ –¥–ª—è –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏ –Ω–∞ —Ä—É—Å—Å–∫–æ–º —è–∑—ã–∫–µ"
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: –ü–ê–ß–ö–ê_NEUTRAL
–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å: 0.2678
–í—Å–µ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏:
  O: 0.0901
  –í–ö–£–°_NEGATIVE: 0.0184
  –í–ö–£–°_NEUTRAL: 0.0321
  –í–ö–£–°_POSITIVE: 0.0212
  –ü–ê–ß–ö–ê_NEGATIVE: 0.0978
  –ü–ê–ß–ö–ê_NEUTRAL: 0.2678
  –ü–ê–ß–ö–ê_POSITIVE: 0.1886
  –¢–ï–ö–°–¢–£–†–ê_NEGATIVE: 0.0811
  –¢–ï–ö–°–¢–£–†–ê_NEUTRAL: 0.1246
  –¢–ï–ö–°–¢–£–†–ê_POSITIVE: 0.0784


### –¢—Ä–µ—Ç–∏–π –¥–∞—Ç–∞—Å–µ—Ç

In [None]:
cfg = load_config("transformers_third")

with mlflow.start_run(run_name=cfg.mlflow.run_name):

    client = MlflowClient()

    mlflow.set_tag("Dataset_version", cfg.mlflow.dataset_version)
    mlflow.log_param("model_name", cfg.model.name)

    # ---------------------------
    #   –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç
    # ---------------------------

    dataset_runs = client.search_runs(
        experiment_ids=[cfg.mlflow.experiment_name], 
        filter_string=f"tags.mlflow.runName = 'Third dataset'",
        order_by=['attributes.end_time desc']
    )

    if not dataset_runs:
        raise ValueError("–ù–µ –Ω–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º 'Third dataset'")

    dataset_run_id = dataset_runs[0].info.run_id
    print(f"–ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: {dataset_run_id}")

    dataset_path_in_run = "datasets/third_experiment_dataset.csv" 
    artifact_local_path = client.download_artifacts(dataset_run_id, dataset_path_in_run)

    df = pd.read_csv(artifact_local_path)
    print(f"–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –∏–∑ run: {artifact_local_path}")

    # ---------------------------
    #   –ü–û–î–ì–û–¢–û–í–ö–ê –î–ê–ù–ù–´–•
    # ---------------------------

    encoder = LabelEncoder()
    df["labels"] = encoder.fit_transform(df["label"])
    df["labels"] = df["labels"].astype(int)

    mlflow.log_param("num_labels", len(encoder.classes_))

    df_train, df_temp = train_test_split(
        df,
        test_size=1 - cfg.data.train_size,
        random_state=cfg.data.random_state,
        shuffle=True,
        stratify=df["labels"]
    )

    df_val, df_test = train_test_split(
        df_temp,
        test_size=cfg.data.val_size,
        random_state=cfg.data.random_state,
        shuffle=True,
        stratify=df_temp["labels"]
    )

    print(f"–†–∞–∑–º–µ—Ä—ã –¥–∞–Ω–Ω—ã—Ö: Train={len(df_train)}, Val={len(df_val)}, Test={len(df_test)}")

    # ---------------------------
    #   –ü–û–î–ì–û–¢–û–í–ö–ê –î–ê–¢–ê–°–ï–¢–û–í (–ò–°–ü–†–ê–í–õ–ï–ù–ù–ê–Ø)
    # ---------------------------

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    dataset_train = Dataset.from_dict({
        "text": df_train["span"].tolist(),
        "labels": df_train["labels"].tolist()
    })
    
    dataset_val = Dataset.from_dict({
        "text": df_val["span"].tolist(), 
        "labels": df_val["labels"].tolist()
    })
    
    dataset_test = Dataset.from_dict({
        "text": df_test["span"].tolist(),
        "labels": df_test["labels"].tolist()
    })

    # ---------------------------
    #   –¢–û–ö–ï–ù–ò–ó–ê–¶–ò–Ø
    # ---------------------------

    tokenizer = AutoTokenizer.from_pretrained(cfg.model.name)

    def tokenize_function(examples):
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding=False,
            max_length=cfg.model.max_length
        )
        return tokenized

    tokenized_train = dataset_train.map(tokenize_function, batched=True)
    tokenized_val = dataset_val.map(tokenize_function, batched=True)
    tokenized_test = dataset_test.map(tokenize_function, batched=True)

    tokenized_train = tokenized_train.remove_columns(['text'])
    tokenized_val = tokenized_val.remove_columns(['text'])
    tokenized_test = tokenized_test.remove_columns(['text'])

    print("–ü—Ä–æ–≤–µ—Ä–∫–∞ —Å—Ç—Ä—É–∫—Ç—É—Ä—ã –¥–∞–Ω–Ω—ã—Ö –ø–æ—Å–ª–µ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏:")
    print(f"Train columns: {tokenized_train.column_names}")
    print(f"Sample train: {tokenized_train[0]}")

    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        padding=True,
        max_length=cfg.model.max_length,
        return_tensors="pt"
    )

    # ---------------------------
    #   –ò–ù–ò–¶–ò–ê–õ–ò–ó–ê–¶–ò–Ø –ú–û–î–ï–õ–ò
    # ---------------------------

    num_labels = len(encoder.classes_)

    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model.name,
        num_labels=num_labels,
        id2label={i: label for i, label in enumerate(encoder.classes_)},
        label2id={label: i for i, label in enumerate(encoder.classes_)},
        hidden_dropout_prob=cfg.model.dropout.hidden,
        attention_probs_dropout_prob=cfg.model.dropout.attention,
        classifier_dropout=cfg.model.dropout.classifier
    )

    # ---------------------------
    #   TRAINING ARGUMENTS
    # ---------------------------

    training_args = TrainingArguments(
        output_dir=cfg.training.output_dir,
        num_train_epochs=cfg.training.num_train_epochs,
        learning_rate=cfg.training.learning_rate,
        per_device_train_batch_size=cfg.training.per_device_train_batch_size,
        per_device_eval_batch_size=cfg.training.per_device_eval_batch_size,
        warmup_ratio=cfg.training.warmup_ratio,
        lr_scheduler_type=cfg.training.lr_scheduler_type,
        eval_strategy=cfg.training.eval_strategy,
        save_strategy=cfg.training.save_strategy,
        weight_decay=cfg.training.weight_decay,
        logging_steps=cfg.training.logging_steps,
        save_total_limit=cfg.training.save_total_limit,
        max_grad_norm=cfg.training.max_grad_norm,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        dataloader_pin_memory=False,
        dataloader_num_workers=0,
        remove_unused_columns=True
    )

    # ---------------------------
    #   –ú–ï–¢–†–ò–ö–ò
    # ---------------------------

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)

        return {
            "f1": f1_score(labels, preds, average="weighted"),
            "accuracy": accuracy_score(labels, preds)
        }

    # ---------------------------
    #   –û–ë–£–ß–ï–ù–ò–ï
    # ---------------------------

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=cfg.training.early_stopping_patience)]
    )

    print("üöÄ –ù–∞—á–∏–Ω–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ...")
    trainer.train()

    # ---------------------------
    #   –û–¶–ï–ù–ö–ê –ò –õ–û–ì–ò–†–û–í–ê–ù–ò–ï
    # ---------------------------
    
    print("–û—Ü–µ–Ω–∏–≤–∞–µ–º –º–æ–¥–µ–ª—å –Ω–∞ —Ç–µ—Å—Ç–æ–≤–æ–º –Ω–∞–±–æ—Ä–µ...")
    test_predictions = trainer.predict(tokenized_test)
    test_preds = np.argmax(test_predictions.predictions, axis=1)
    test_labels = test_predictions.label_ids
    
    test_f1 = f1_score(test_labels, test_preds, average="weighted")
    test_accuracy = accuracy_score(test_labels, test_preds)
    
    mlflow.log_metric("f1_score", test_f1)
    mlflow.log_metric("accuracy", test_accuracy)
    
    print(f"–§–∏–Ω–∞–ª—å–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏ –Ω–∞ —Ç–µ—Å—Ç–µ:")
    print(f"F1-score: {test_f1:.4f}")
    print(f"Accuracy: {test_accuracy:.4f}")

    # ---------------------------
    #   –°–û–•–†–ê–ù–ï–ù–ò–ï
    # ---------------------------

    model_dir = "best_transformer_model"
    tokenizer_dir = "best_transformer_tokenizer"

    trainer.save_model(model_dir)
    tokenizer.save_pretrained(tokenizer_dir)

    mlflow.log_artifacts(model_dir, artifact_path="model")
    mlflow.log_artifacts(tokenizer_dir, artifact_path="tokenizer")

    with open('label_encoder.pkl', 'wb') as f:
        pickle.dump(encoder, f)
    mlflow.log_artifact('label_encoder.pkl', artifact_path="preprocessing")

    import shutil
    shutil.rmtree(model_dir)
    shutil.rmtree(tokenizer_dir)
    os.remove('label_encoder.pkl')

    print("–¢—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω!")

‚úÖ –ù–∞–π–¥–µ–Ω run —Å –¥–∞—Ç–∞—Å–µ—Ç–æ–º: 327f7f34ba914fce83ca6c0e787e8f15


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  4.53it/s]


‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –∏–∑ run: C:\Users\Smart\AppData\Local\Temp\tmpkmxff0ir\third_experiment_dataset.csv
üìä –†–∞–∑–º–µ—Ä—ã –¥–∞–Ω–Ω—ã—Ö: Train=3216, Val=402, Test=402


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3216/3216 [00:00<00:00, 8500.23 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 402/402 [00:00<00:00, 7787.79 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 402/402 [00:00<00:00, 8308.13 examples/s]


üîç –ü—Ä–æ–≤–µ—Ä–∫–∞ —Å—Ç—Ä—É–∫—Ç—É—Ä—ã –¥–∞–Ω–Ω—ã—Ö –ø–æ—Å–ª–µ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏:
Train columns: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
Sample train: {'labels': 3, 'input_ids': [101, 110, 55795, 37312, 107, 54557, 71854, 14236, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üöÄ –ù–∞—á–∏–Ω–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ...




Epoch,Training Loss,Validation Loss,F1,Accuracy
1,1.2991,1.166594,0.591704,0.629353
2,0.6754,0.634449,0.794025,0.79602
3,0.3396,0.540969,0.832404,0.833333
4,0.1027,0.585891,0.849085,0.853234
5,0.0814,0.643048,0.857232,0.858209
6,0.0689,0.622935,0.869776,0.870647
7,0.0308,0.679291,0.867847,0.868159
8,0.0239,0.656416,0.868577,0.870647




üìä –û—Ü–µ–Ω–∏–≤–∞–µ–º –º–æ–¥–µ–ª—å –Ω–∞ —Ç–µ—Å—Ç–æ–≤–æ–º –Ω–∞–±–æ—Ä–µ...




üéØ –§–∏–Ω–∞–ª—å–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏ –Ω–∞ —Ç–µ—Å—Ç–µ:
   F1-score: 0.8191
   Accuracy: 0.8259
‚úÖ –¢—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω –∏ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω!
üèÉ View run transformers_experiment_3 at: http://127.0.0.1:8080/#/experiments/0/runs/948c02959d5a46b798a981a964d40f83
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


#### –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤ (–∏–Ω—Ñ–µ—Ä–µ–Ω—Å)

In [None]:
cfg = load_config("inference_transformers_third")

client = MlflowClient(tracking_uri=cfg.mlflow.tracking_uri)

# ================================
# 2. –ò—â–µ–º –Ω—É–∂–Ω—ã–π run –≤ MLflow
# ================================
print("–ò—â–µ–º run —Å –º–æ–¥–µ–ª—å—é...")

runs = client.search_runs(
    experiment_ids=[cfg.mlflow.experiment_id],
    filter_string=f'attributes.run_name = "{cfg.model.run_name}"',
    order_by=["attributes.end_time desc"]
)

if not runs:
    raise ValueError(f"Run '{cfg.model.run_name}' –Ω–µ –Ω–∞–π–¥–µ–Ω!")

run_id = runs[0].info.run_id
print(f"–ù–∞–π–¥–µ–Ω run: {run_id}")

# ================================
# 3. –°–∫–∞—á–∏–≤–∞–µ–º –∞—Ä—Ç–µ—Ñ–∞–∫—Ç—ã
# ================================
print("–°–∫–∞—á–∏–≤–∞–µ–º –º–æ–¥–µ–ª—å, —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∏ —ç–Ω–∫–æ–¥–µ—Ä...")

try:
    model_dir = client.download_artifacts(run_id, cfg.model.artifacts_path)
    tokenizer_dir = client.download_artifacts(run_id, cfg.model.tokenizer_path)
    encoder_path = client.download_artifacts(run_id, "preprocessing/label_encoder.pkl")
    
    print(f"–ú–æ–¥–µ–ª—å —Å–∫–∞—á–∞–Ω–∞: {model_dir}")
    print(f"–¢–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä —Å–∫–∞—á–∞–Ω: {tokenizer_dir}")
    print(f"–≠–Ω–∫–æ–¥–µ—Ä —Å–∫–∞—á–∞–Ω: {encoder_path}")
    
except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –∞—Ä—Ç–µ—Ñ–∞–∫—Ç–æ–≤: {e}")
    raise

# ================================
# 4. –ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å –≤ –ø–∞–º—è—Ç—å
# ================================
print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å –∏ –∫–æ–º–ø–æ–Ω–µ–Ω—Ç—ã...")

model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

with open(encoder_path, 'rb') as f:
    encoder = pickle.load(f)

print("–í—Å–µ –∫–æ–º–ø–æ–Ω–µ–Ω—Ç—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã")

# ================================
# 5. –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ
# ================================
text = cfg.test_text
print(f"–¢–µ—Å—Ç–æ–≤—ã–π —Ç–µ–∫—Å—Ç: ¬´{text}¬ª")

tokenizer_cfg = {k: v for k, v in cfg.tokenizer.items() if v is not None}

inputs = tokenizer(text, **tokenizer_cfg)

with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

pred_idx = probs.argmax().item()
pred_prob = probs[0][pred_idx].item()

pred_label = encoder.inverse_transform([pred_idx])[0]

print("\n" + "="*100)
print(f"–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: ¬´{text}¬ª")
print(f"–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {pred_label}")
print(f"–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å: {pred_prob:.4f}")
print("\n–í—Å–µ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏:")
for i, p in enumerate(probs[0]):
    label = encoder.inverse_transform([i])[0]
    print(f"  {label}: {p:.4f}")
print("="*100)

–ò—â–µ–º run —Å –º–æ–¥–µ–ª—å—é...
‚úÖ –ù–∞–π–¥–µ–Ω run: 948c02959d5a46b798a981a964d40f83
–°–∫–∞—á–∏–≤–∞–µ–º –º–æ–¥–µ–ª—å, —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∏ —ç–Ω–∫–æ–¥–µ—Ä...


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [02:09<00:00, 18.56s/it]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:14<00:00,  3.70s/it]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 34.49it/s]


üìÅ –ú–æ–¥–µ–ª—å —Å–∫–∞—á–∞–Ω–∞: C:\Users\Smart\AppData\Local\Temp\tmp1ut_wjcm\model
üìÅ –¢–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä —Å–∫–∞—á–∞–Ω: C:\Users\Smart\AppData\Local\Temp\tmpv5m85rm0\tokenizer
üìÅ –≠–Ω–∫–æ–¥–µ—Ä —Å–∫–∞—á–∞–Ω: C:\Users\Smart\AppData\Local\Temp\tmpstx4jzoy\label_encoder.pkl
–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å –∏ –∫–æ–º–ø–æ–Ω–µ–Ω—Ç—ã...


The tokenizer you are loading from 'C:\Users\Smart\AppData\Local\Temp\tmpv5m85rm0\tokenizer' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e.  This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


‚úÖ –í—Å–µ –∫–æ–º–ø–æ–Ω–µ–Ω—Ç—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–¢–µ—Å—Ç–æ–≤—ã–π —Ç–µ–∫—Å—Ç: ¬´–ú–æ–∂–Ω–æ –±—ã–ª–æ —Å–¥–µ–ª–∞—Ç—å —á–∏–ø—Å—ã –∏ –ø–æ–≤–∫—É—Å–Ω–µ–µ¬ª

–ò—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç: ¬´–ú–æ–∂–Ω–æ –±—ã–ª–æ —Å–¥–µ–ª–∞—Ç—å —á–∏–ø—Å—ã –∏ –ø–æ–≤–∫—É—Å–Ω–µ–µ¬ª
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è –º–µ—Ç–∫–∞: –í–ö–£–°_NEUTRAL
–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å: 0.7908

–í—Å–µ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏:
  O: 0.0010
  –í–ö–£–°_NEGATIVE: 0.0021
  –í–ö–£–°_NEUTRAL: 0.7908
  –í–ö–£–°_POSITIVE: 0.1538
  –ü–ê–ß–ö–ê_NEGATIVE: 0.0008
  –ü–ê–ß–ö–ê_NEUTRAL: 0.0012
  –ü–ê–ß–ö–ê_POSITIVE: 0.0004
  –¢–ï–ö–°–¢–£–†–ê_NEGATIVE: 0.0018
  –¢–ï–ö–°–¢–£–†–ê_NEUTRAL: 0.0021
  –¢–ï–ö–°–¢–£–†–ê_POSITIVE: 0.0459


# –í—ã–±–æ—Ä –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏ (–¥–ª—è –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞)

In [None]:
client = MlflowClient()

runs = client.search_runs(experiment_ids = ['0'],
                          order_by = ['metrics.f1_score desc', 'metrics.accuracy desc'])

best_run = runs[0]


best_run_id = best_run.info.run_id

model_loc = client.download_artifacts(best_run_id, path='model')
tokenizer_loc = client.download_artifacts(best_run_id, path='tokenizer')

tokenizer = AutoTokenizer.from_pretrained(tokenizer_loc)
model = AutoModelForSequenceClassification.from_pretrained(model_loc)


model_final = pipeline('text-classification', model = model, tokenizer = tokenizer)

print(model_final(['–í–∫—É—Å –ø—Ä–æ—Å—Ç–æ –±–æ–º–±–∏—á–µ—Å–∫–∏–π']))



Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [01:59<00:00, 17.08s/it]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 37.83it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:13<00:00,  3.33s/it] 
The tokenizer you are loading from 'C:\Users\Smart\AppData\Local\Temp\tmpgi_nhox8\tokenizer' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e.  This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Device set to use cpu


[{'label': '–í–ö–£–°_NEGATIVE', 'score': 0.9860121607780457}]


In [11]:
output_dir_model = './best_model/model'
output_dir_tokenizer = './best_model/tokenizer'

model.save_pretrained(output_dir_model)
tokenizer.save_pretrained(output_dir_tokenizer)

('./best_model/tokenizer\\tokenizer_config.json',
 './best_model/tokenizer\\special_tokens_map.json',
 './best_model/tokenizer\\vocab.txt',
 './best_model/tokenizer\\added_tokens.json',
 './best_model/tokenizer\\tokenizer.json')

In [16]:
model_test = AutoModelForSequenceClassification.from_pretrained('best_model\\model')
tokenizer_test = AutoTokenizer.from_pretrained('best_model\\tokenizer')


pipe = pipeline('text-classification', model = model_test, tokenizer = tokenizer_test)


print(pipe(['–ß–∏–ø—Å—ã –Ω–µ –æ—á–µ–Ω—å']))

The tokenizer you are loading from 'best_model\tokenizer' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e.  This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Device set to use cpu


[{'label': '–í–ö–£–°_NEGATIVE', 'score': 0.9977165460586548}]
