In this notebook we will fit a model to predict the `inquire_product` label using the embeddings of the text from the LABSE model.

In [1]:
import pandas as pd
import numpy as np
import yaml
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sentence_transformers import SentenceTransformer

from catboost import CatBoostClassifier

import optuna


In [2]:
# Let's define some functions to load and process the data
def load_data_from_yaml(file_path: str) -> list[dict]:
    """
    Load data from yaml file
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    return data

def history_to_string(history_list: list[dict]) -> str:
    """
    Convert a list of conversation dictionaries to a single string containing only the content and role.
    """
    return history_list[-1]['content']

def yaml_to_df(file_path: str) -> pd.DataFrame:
    """
    Convert yaml data to pandas dataframe.
    """
    data = load_data_from_yaml(file_path)
    df = pd.DataFrame(data)
    return df

In [3]:
# Loading data and preparing it for the model
df = yaml_to_df('../data/conversations.yaml')
df['history'] = df['history'].apply(history_to_string)
df = df.drop(columns=['messages_uuid'], axis=1)
df = df.drop_duplicates(subset=['history'])
df.sample(n=10)

Unnamed: 0,history,inquire_product
27,"да, добавь сыр маасдам любой",False
3,добавь укроп,False
18,"свечи да, давай поромантичнее",False
0,"привет, добавь мне сок яблочный 1 л",False
23,да добавь,False
80,первый,False
42,"да, добавь петелинку",False
103,вот да это всё в наборе,False
24,добавь в корзину овсяное печенье 3 пачки,False
92,можно батончик или что-то ещё,False


In [4]:
positive_df = df[df['inquire_product'] == True]
negative_df = df[df['inquire_product'] == False]

# Add all positive samples from the real data to the test dataset and 10 negative samples
test_df = pd.concat([positive_df, negative_df.sample(n=13)])
train_df = df.drop(test_df.index)

In [5]:
# Load synthetic data and add it to the train dataset
with open('../data/synthetic_data.json', 'r', encoding='utf-8') as f:
    synthetic_data = json.load(f)
    
synthetic_df = pd.DataFrame(synthetic_data['questions'], columns=['history'])
synthetic_df['inquire_product'] = True

train_df = pd.concat([train_df, synthetic_df])

In [6]:
X_train, X_val, y_train, y_val = train_test_split(train_df['history'], train_df['inquire_product'], test_size=0.4, random_state=1210)
X_test, y_test = test_df['history'], test_df['inquire_product']

# Generate embeddings for the text

In [9]:
embedder = SentenceTransformer('sentence-transformers/LaBSE').to('cuda')

In [10]:
X_train_embeddings = embedder.encode(X_train.tolist())
X_val_embeddings = embedder.encode(X_val.tolist())
X_test_embeddings = embedder.encode(X_test.tolist())

# Select the best parameters for the model

In [18]:
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.01),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 10, 1000),
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'eval_metric': trial.suggest_categorical('eval_metric', ['TotalF1', 'Logloss', 'CrossEntropy']),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 0, 10),
        'depth': trial.suggest_int('depth', 1, 8),
        'random_strength': trial.suggest_float('random_strength', 0, 1),
    }
    model = CatBoostClassifier(random_seed=1210, verbose=0, class_weights=[1, 2], **params)
    model.fit(X_train_embeddings, y_train, eval_set=(X_val_embeddings, y_val))
    y_pred = model.predict(X_test_embeddings)
    return f1_score(y_test, y_pred, average='macro')

optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20, show_progress_bar=True)

print(f"Best parameters: {study.best_params}")
print(f"Best value: {study.best_value}")

  0%|          | 0/20 [00:00<?, ?it/s]

Best parameters: {'learning_rate': 0.005489278166788565, 'early_stopping_rounds': 805, 'iterations': 137, 'eval_metric': 'CrossEntropy', 'l2_leaf_reg': 10, 'depth': 7, 'random_strength': 0.6270957650683481}
Best value: 0.8400000000000001


# Fit the model with the best parameters

In [19]:
model = CatBoostClassifier(random_seed=1210,
                           learning_rate=study.best_params['learning_rate'],
                           early_stopping_rounds=study.best_params['early_stopping_rounds'],
                           iterations=study.best_params['iterations'],
                           verbose=500,
                           use_best_model=True,
                           eval_metric=study.best_params['eval_metric'],
                           class_weights=[1, 2],
                           l2_leaf_reg=study.best_params['l2_leaf_reg'],
                           depth=study.best_params['depth'],
                           random_strength=study.best_params['random_strength']
                           )
model.fit(X_train_embeddings, y_train, eval_set=(X_val_embeddings, y_val))

0:	learn: 0.6896646	test: 0.6903930	best: 0.6903930 (0)	total: 41.9ms	remaining: 5.7s
136:	learn: 0.3944515	test: 0.4588701	best: 0.4588701 (136)	total: 5.27s	remaining: 0us

bestTest = 0.4588701165
bestIteration = 136



<catboost.core.CatBoostClassifier at 0x706af4d78ef0>

In [20]:
y_pred = model.predict(X_test_embeddings)
print(classification_report(y_test, y_pred))
print(f"F1 score: {f1_score(y_test, y_pred)}")


              precision    recall  f1-score   support

       False       0.92      0.85      0.88        13
        True       0.75      0.86      0.80         7

    accuracy                           0.85        20
   macro avg       0.83      0.85      0.84        20
weighted avg       0.86      0.85      0.85        20

F1 score: 0.8
