## Обучение модели для классификации намерения пользователя

### Импорт модулей


In [23]:
!pip install deeppavlov
!pip install pytorch_lightning
!pip install torch torchvision torchaudio torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [27]:
import pandas as pd
import numpy as np
import transformers
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

### Данные

In [3]:
data = pd.read_csv('/content/intentData.csv', delimiter=';',encoding="utf-8", names=['text', 'intent'])
NUM_LABELS = len(set(data['intent']))
labels = set(data['intent'])
id2label = {i:label for i,label in enumerate(labels) if label != None}
label2id = {label:i for i,label in enumerate(labels) if label != None}

print(list(set(data['intent'].values)))
print(list(data['intent'].value_counts()))
df2 = pd.DataFrame(data = {"intent name" : list(set(data['intent'].values)),
                           "number of utterance" : list(data['intent'].value_counts())})

['book_room', 'common_qstn', 'get_room', 'smalltalk_greetings', 'smalltalk_bye']
[84, 74, 29, 22, 22]


In [4]:
# посмотрим, что лежит в датасете
data[data["intent"] == "get_room"].iloc[:3]

Unnamed: 0,text,intent
148,"Да, я хочу у вас остановиться.",get_room
149,Мне нужен номер.,get_room
150,Я думаю у вас для меня должен быть номер.,get_room


In [5]:
#  вернем количестов меток и соответствующих объектов в датасете
def get_labels_and_counts(sample):
    labels = list(set(sample['intent'].values))
    counts = []
    for intent in labels:
        counts.append(sample[sample["intent"] == intent].count()[1])

    return labels, counts

In [6]:
labels, counts = get_labels_and_counts(data)
train_df = pd.DataFrame()
test_df = pd.DataFrame()
for i in range(5):
    df2 = data[data["intent"] == labels[i]].sample(frac = 1)
    tmp_df = df2.iloc[:int(np.round(counts[i] * 0.7))]
    tmp_df2 = df2.iloc[int(np.round(counts[i] * 0.7)):]
    train_df = pd.concat([train_df, tmp_df])
    test_df = pd.concat([test_df, tmp_df2])
    
test_df['intent'].value_counts()

smalltalk_bye          25
smalltalk_greetings    22
common_qstn             9
book_room               7
get_room                7
Name: intent, dtype: int64

In [7]:
# чуть-чуть визуализации (для всего датасета)
import plotly.graph_objects as go
import plotly.express as px

labels = list(set(data['intent'].values))
values = list(data['intent'].value_counts())

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3, marker_colors = px.colors.sequential.Agsunset)])
fig.update_layout(title_x = 0.5, width = 800, height = 800, uniformtext_minsize = 20, uniformtext_mode='hide', legend_font_size = 20)
fig.show()

In [8]:
# визуализация для тренировочной выборки
import plotly.graph_objects as go

labels, values = get_labels_and_counts(train_df)

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3, marker_colors = px.colors.sequential.Agsunset)])
fig.update_layout(title_x = 0.5, width = 800, height = 800, uniformtext_minsize = 20, uniformtext_mode='hide', legend_font_size = 20)
fig.show()

In [9]:
# сбалансируем классы
rat = []
for i in ["get_room", "book_room", "common_qstn"]:
    rat.append(len(train_df.loc[train_df['intent'] == "smalltalk_greetings"])//len(train_df.loc[train_df['intent'] == i]))
rat


[3, 3, 2]

In [10]:
train_balanced = train_df.loc[train_df['intent'] == "smalltalk_greetings"]
train_balanced = pd.concat([train_balanced, train_df.loc[train_df['intent'] == "smalltalk_bye"]])
imbalanced_intent = ["get_room", "book_room", "common_qstn"]
for i in range(3):
    df_1 = train_df.loc[train_df['intent']==imbalanced_intent[i]]
    df_1 = df_1.loc[df_1.index.repeat(rat[i])]
    train_balanced = pd.concat([train_balanced, df_1]).sample(frac=1)

train_balanced = train_balanced.reset_index()
train_balanced

Unnamed: 0,index,text,intent
0,111,Мы замечательно поболтали,smalltalk_bye
1,120,"Спасибо за замечательное общение, до свидания",smalltalk_bye
2,153,я хотел бы заселиться,get_room
3,53,"Привет, друг",smalltalk_greetings
4,156,мне нужна комната,get_room
...,...,...,...
236,216,Какая вам разница?,common_qstn
237,189,а может бронь не на мое имя?,book_room
238,114,Мы отлично поболтали,smalltalk_bye
239,80,"Ну ладно, приятно было поговорить, пока)",smalltalk_bye


In [11]:
# аналогично для тестовой выборки
rat = []
for i in ["get_room", "book_room", "common_qstn"]:
    rat.append(len(test_df.loc[test_df['intent'] == "smalltalk_greetings"])//len(test_df.loc[test_df['intent'] == i]))
rat

[3, 3, 2]

In [12]:
test_balanced = test_df.loc[test_df['intent'] == "smalltalk_greetings"]
test_balanced = pd.concat([test_balanced, test_df.loc[test_df['intent'] == "smalltalk_bye"]])
imbalanced_intent = ["get_room", "book_room", "common_qstn"]
for i in range(3):
    df_1 = test_df.loc[test_df['intent']==imbalanced_intent[i]]
    df_1 = df_1.loc[df_1.index.repeat(rat[i])]
    test_balanced = pd.concat([test_balanced, df_1]).sample(frac=1)

test_balanced = test_balanced.reset_index()
test_balanced

Unnamed: 0,index,text,intent
0,149,Мне нужен номер.,get_room
1,123,"Всего доброго, до свидания!",smalltalk_bye
2,204,я не тороплюсь,common_qstn
3,185,"телефон разрядился, не могу посмотреть номер б...",book_room
4,76,"Ладно, до встречи",smalltalk_bye
...,...,...,...
102,78,"Пока, свяжемся позже!",smalltalk_bye
103,146,всего хорошего,smalltalk_bye
104,22,приветствую,smalltalk_greetings
105,28,"Привет, дружище",smalltalk_greetings


In [13]:
# визуализация после балансировки
import plotly.graph_objects as go

labels, values = get_labels_and_counts(train_balanced)

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3, marker_colors = px.colors.sequential.Agsunset)])
fig.update_layout(title_x = 0.5, width = 800, height = 800, uniformtext_minsize = 20, uniformtext_mode='hide', legend_font_size = 20)
fig.show()

In [14]:
train_balanced["intent"].value_counts()

smalltalk_bye          59
smalltalk_greetings    52
get_room               45
book_room              45
common_qstn            40
Name: intent, dtype: int64

In [15]:
new_data = pd.concat([train_balanced, test_balanced]).reset_index()
labels=set(new_data['intent'])
id2label = {i:label for i,label in enumerate(labels) if label != None}
label2id = {label:i for i,label in enumerate(labels) if label != None}
NUM_LABELS=len(set(new_data['intent']))

### Модель

In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
MODEL_NAME = 'DeepPavlov/rubert-base-cased-sentence'
N_MODELS = 3
tokenizers = []
models = []
for i in range(N_MODELS):
    tokenizers.append(AutoTokenizer.from_pretrained(MODEL_NAME))
    models.append(AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,label2id=label2id,id2label=id2label,num_labels = NUM_LABELS))
None

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-sentence')

def make_dataset(df):
    labels = [label2id[label] for label in df['intent']]
    texts = [tokenizer(text,padding='max_length', max_length = 32, truncation=True) for text in df['text']]
    
    result = []
    for idx,(text,label) in enumerate(zip(texts,labels)):
        result.append({"input_ids" : text['input_ids'], 'attention_mask' : text['attention_mask'], "label" : label})
    return result

train_dataset = make_dataset(train_balanced)
test_dataset = make_dataset(test_balanced)

In [31]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy' : accuracy_score(predictions, labels), 
            'f1-macro' : f1_score(labels, predictions, average = 'macro')}

In [36]:
from pytorch_lightning.callbacks import Callback

class MetricTracker(Callback):

  def __init__(self):
    self.collection = []

  def on_validation_batch_end(self, trainer, module, outputs):
    vacc = outputs['Accuracy'] 
    self.collection.append(vacc) 

  def on_validation_epoch_end(self, trainer, module):
    elogs = trainer.logged_metrics 
    self.collection.append(elogs)
    

cb = MetricTracker()


In [37]:
import gc
import torch
gc.collect()

torch.cuda.empty_cache()

In [39]:
import os
from transformers import Trainer, TrainingArguments
os.environ["WANDB_DISABLED"] = "true"

my_models = models.copy()

for model in my_models:
    args = TrainingArguments(
    output_dir="hotel",
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.05,
    load_best_model_at_end=False,
    report_to=None,
    metric_for_best_model = 'accuracy',
    logging_dir = "hotel/logs",
    logging_steps = 5,
    eval_steps = 5
    )
    trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )
    trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1-macro
5,1.5702,1.493772,0.448598,0.353581
10,1.4201,1.32782,0.588785,0.566505
15,1.2527,1.200302,0.635514,0.612118
20,1.018,1.094512,0.64486,0.620334
25,0.9183,0.985573,0.747664,0.735943
30,0.7065,0.872123,0.747664,0.735337
35,0.6,0.794417,0.747664,0.737031
40,0.4932,0.729163,0.766355,0.754085
45,0.4605,0.68095,0.785047,0.778282
50,0.3348,0.654174,0.794393,0.786064


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).




Step,Training Loss,Validation Loss,Accuracy,F1-macro
5,1.573,1.487965,0.457944,0.401863
10,1.4263,1.357978,0.635514,0.621983
15,1.2512,1.259676,0.570093,0.525817
20,1.0412,1.171721,0.579439,0.558384
25,0.9415,1.059602,0.654206,0.637652
30,0.7397,0.930211,0.757009,0.73114
35,0.5848,0.869016,0.719626,0.686608
40,0.5439,0.822779,0.682243,0.658352
45,0.4734,0.779571,0.719626,0.697567
50,0.3325,0.740608,0.728972,0.714931


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).




Step,Training Loss,Validation Loss,Accuracy,F1-macro
5,1.5652,1.474841,0.373832,0.305375
10,1.4039,1.315665,0.654206,0.635556
15,1.2136,1.169149,0.682243,0.650808
20,1.0018,1.077135,0.663551,0.628963
25,0.8833,0.983903,0.728972,0.71212
30,0.6759,0.874395,0.757009,0.739512
35,0.5243,0.818094,0.719626,0.678155
40,0.4746,0.767926,0.747664,0.702349
45,0.3996,0.705451,0.757009,0.736894
50,0.2761,0.664748,0.803738,0.792542


In [35]:
print(transformers.__version__)

4.27.4


In [40]:
import torch

In [46]:
metrics=pd.read_csv('/content/trainMetrics.csv', delimiter=';',encoding="utf-8", names=['Step', 'Training Loss', 'Validation Loss', 'Accuracy', 'F1-Macro'])
metrics = metrics.iloc[1:]
metrics = metrics.astype(np.float64)

In [47]:
fig = px.line(metrics, x = "Step", y = ["Validation Loss", "Training Loss"])
fig.update_layout(width = 900, height = 600, legend_font_size = 20, legend_title = "Loss")
fig.update_xaxes(tickfont_size = 18, title_font_size = 20)
fig.update_yaxes(title = 'Loss', tickfont_size = 18, title_font_size = 20)
fig.show()

In [48]:
fig = px.line(metrics, x = "Step", y = ["Accuracy", "F1-Macro"])
fig.update_layout(width = 900, height = 600, legend_font_size = 20, legend_title = "Metrics")
fig.update_xaxes(tickfont_size = 18, title_font_size = 20)
fig.update_yaxes(title = "Metric's Value", tickfont_size = 18, title_font_size = 20)
fig.show()

In [49]:
model = my_models[1]
torch.save(model, "intent_catcher.pt")

In [50]:
model.to("cuda")
model_input = tokenizer.encode("Да, я хочу у вас остановиться.	", return_tensors='pt').to("cuda")
model_output = model.bert.config.id2label[model(model_input)['logits'].argmax().item()]
model_output

'get_room'

In [51]:
reference = []
predictions = []
for text,label in zip(test_balanced['text'],test_balanced['intent']):
    model_input = tokenizer.encode(text, return_tensors='pt').to("cuda")
    model_output = model.bert.config.id2label[model(model_input)['logits'].argmax().item()]
    predictions.append(model_output)
    reference.append(label)


In [52]:
from sklearn.metrics import f1_score

for average_type in ['macro','micro','weighted']:
    print(f'{average_type} F1 = {f1_score(reference, predictions, average=average_type)}')

macro F1 = 0.8237536286479556
micro F1 = 0.8317757009345794
weighted F1 = 0.8296612511160347
