Работа сделана на основе статьи Yoon Kim [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf)

In [26]:
%matplotlib inline
import collections
import math
import numpy as np
import pandas as pd
import os
import io
import json
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from sklearn.model_selection import train_test_split
import tensorflow as tf

seed = 8765

%env TF_FORCE_GPU_ALLOW_GROWTH=true

env: TF_FORCE_GPU_ALLOW_GROWTH=true


### Загрузка и предобработка данных
Загружаем данные в модель и проводим предобработку, для каждого экземпляра получаем:
* Вопрос
* Категория

In [3]:
data = pd.read_csv('latest_ticket_data.csv')

display(data.info)

<bound method DataFrame.info of                                             Description     Category
0     hi since recruiter lead permission approve req...  Application
1     re expire days hi ask help update passwords co...  Application
3     please dear looks blacklisted receiving mails ...  Application
4     dear modules report report cost thank much reg...  Application
...                                                 ...          ...
2995  sent tuesday feedback follow up conf call hell...     Database
2996  sent monday issues hi keep getting errors whil...     Database
2997  sent monday en working properly hi guys we hav...     Database
2998  sent wednesday july hi please log incident for...     Database
2999  sent tuesday july connection issues hello have...     Database

[3000 rows x 2 columns]>

### Конвертирование данных обучение/тестовые в pd.DataFrame

In [4]:
train_df, test_df = train_test_split(data, test_size=0.2)

train_df.head(n=10)

Unnamed: 0,Description,Category
1266,re maternity leaver form hi please find attach...,User Maintenance
429,column added hi please rise assign thanks best...,Application
2909,sent thursday october vs problems hi have inst...,Database
2307,sent saturday event notification data event th...,Security
1599,wednesday hey va milk fruits fresh la care si ...,User Maintenance
2722,sent friday november issue with hi guys assist...,Database
2851,sent tuesday october printer error th floor to...,Database
1008,ports allow connecting hello please allow traf...,Network
2182,sent friday lost badge hello lost badge notice...,Security
1226,re maternity form hi please attached filled fo...,User Maintenance


In [5]:
train_df = train_df.sample(frac=1.0, random_state=seed)

### Конвертация строковых меток в целочисленные ID

In [6]:
unique_cats = train_df["Category"].unique()
labels_map = dict(zip(unique_cats, np.arange(unique_cats.shape[0])))
print(f"Label->ID mapping: {labels_map}")

n_classes = len(labels_map)

train_df["Category"] = train_df["Category"].map(labels_map)
test_df["Category"] = test_df["Category"].map(labels_map)

train_df.head(n=10)

Label->ID mapping: {'Security': 0, 'User Maintenance': 1, 'Database': 2, 'Application': 3, 'Network': 4}


Unnamed: 0,Description,Category
1986,access doors dear please investigate with high...,0
1235,badge needed floor hi please floor thank,1
2984,sent friday working issue hi having similar is...,2
2390,sent october lost access card hi our colleague...,0
526,database backup needed hello backup database k...,3
2975,sent monday trouble connecting monitor work st...,2
721,purchase po dear purchased cable type apple li...,2
2551,sent friday availability importance high hi so...,4
1723,friday fwd personal weekend bun conform solici...,1
2959,sent friday recovery key dear trouble with his...,2


### Разделение данных на группы для обучения и валидации

In [7]:
train_df, valid_df = train_test_split(train_df, test_size=0.2)
print(f"Train size: {train_df.shape}")
print(f"Valid size: {valid_df.shape}")

train_df.head()

Train size: (1920, 2)
Valid size: (480, 2)


Unnamed: 0,Description,Category
2151,sent friday lost badge hello one our colleague...,0
2434,sent thursday march re ny speed attached north...,4
1978,lost access card hi lost access card could you...,0
2427,sent tuesday march issue hi could you please h...,4
2173,sent friday lost badge hello today around lost...,0


### Токенизация
Определение токенизатора с обучаемой выборкой и размера словаря, который соответствует размеру словаря `index_word`

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Define a tokenizer and fit on train data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df["Description"].tolist())

# Derive the vocabulary size
n_vocab = len(tokenizer.index_word) + 1
print(f"Vocabulary size: {n_vocab}")

Vocabulary size: 4365


### Длина последовательности
Здесь проводим анализ `1%` и `99%` персентиля последовательностей. Используем `99%` персентиля как максимальная длина последовательности

In [9]:
train_df["Description"].str.split(" ").str.len().describe(percentiles=[0.01, 0.5, 0.99])

count    1920.000000
mean       35.926042
std        57.074974
min         1.000000
1%          3.000000
50%        21.000000
99%       260.150000
max       927.000000
Name: Description, dtype: float64

### Выравнивание коротких предложений
Выравниваем, чтобы все предложения были одинаковой длины

In [10]:
train_sequences = tokenizer.texts_to_sequences(train_df["Description"].tolist())
train_labels = train_df["Category"].values
valid_sequences = tokenizer.texts_to_sequences(valid_df["Description"].tolist())
valid_labels = valid_df["Category"].values
test_sequences = tokenizer.texts_to_sequences(test_df["Description"].tolist())
test_labels = test_df["Category"].values

max_seq_length = 279

preprocessed_train_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_seq_length, padding='post', truncating='post')
preprocessed_valid_sequences = tf.keras.preprocessing.sequence.pad_sequences(valid_sequences, maxlen=max_seq_length, padding='post', truncating='post')
preprocessed_test_sequences = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_seq_length, padding='post', truncating='post')

### Классификация предложений при помощи CNN
Реализация простой CNN для классификации предложений (документов). Нейронная сеть имеет один слой, за ним следует слой `pooling-over-time` (термин из статьи и перевод неизвестен) и полносвязный слой с функцией активации `softmax`.

In [11]:
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.models import Model

K.clear_session()

# Input layer takes word IDs as inputs
word_id_inputs = layers.Input(shape=(max_seq_length,), dtype='int32')

# Embeddings of the inputs / out [batch_size, sent_length, output_dim]
embedding_out = layers.Embedding(input_dim=n_vocab, output_dim=80)(word_id_inputs)


# All layers: in [batch_size, sent_length, emb_size] / out [batch_size, sent_length, 150]
conv1_1 = layers.Conv1D(
    150, kernel_size=5, strides=1, padding='same', activation='relu'
)(embedding_out)
conv1_2 = layers.Conv1D(
    150, kernel_size=7, strides=1, padding='same', activation='relu'
)(embedding_out)
conv1_3 = layers.Conv1D(
    150, kernel_size=11, strides=1, padding='same', activation='relu'
)(embedding_out)

# in previous conve outputs / out [batch_size, sent_length, 450]
conv_out = layers.Concatenate(axis=-1)([conv1_1, conv1_2, conv1_3])

# Pooling over time operation. This is doing the max pooling over sequence lenth
# in other words, each feature map results in a single output
# in [batch_size, sent_length, 450] / out [batch_size, 1, 450]
pool_over_time_out = layers.MaxPool1D(pool_size=max_seq_length, padding='valid')(conv_out)

# Flatten the unit length dimension
flatten_out = layers.Flatten()(pool_over_time_out)

# Compute the final output
out = layers.Dense(
    n_classes, activation='softmax',
    kernel_regularizer=regularizers.l2(0.001)
)(flatten_out)

# Define the model
cnn_model = Model(inputs=word_id_inputs, outputs=out)

# Compile the model with loss/optimzier/metrics
cnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

cnn_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 279)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 279, 80)      349200      input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 279, 150)     60150       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 279, 150)     84150       embedding[0][0]                  
______________________________________________________________________________________________

2023-08-30 23:06:43.661260: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-08-30 23:06:43.662359: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-08-30 23:06:43.882223: E tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:927] could not open file to read NUMA node: /sys/bus/pci/devices/0000:07:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-30 23:06:43.882323: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:07:00.0 name: NVIDIA GeForce RTX 2070 SUPER computeCapability: 7.5
coreClock: 1.785GHz coreCount: 40 deviceMemorySize: 8.00GiB deviceMemoryBandwidth: 417.29GiB/s
2023-08-30 23:06:43.882347: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-08-30 23:06:43.883471: I tensorflow/stream_executor/

### Обучение модели
Обучение проводится с определенным значением батча по каждой эпохе. Используется TF колбэк `ReduceLROnPlateau` - уменьшает коэфициент обучения (learning rate) если нет улучшений в оценке качества модели.

In [12]:
# Callback
lr_reduce_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=3, verbose=1,
    mode='auto', min_delta=0.0001, min_lr=0.000001
)

cnn_model.fit(
    preprocessed_train_sequences, train_labels,
    validation_data=(preprocessed_valid_sequences, valid_labels),
    batch_size=128,
    epochs=25,
    callbacks=[lr_reduce_callback]
)

2023-08-30 23:06:44.810291: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2023-08-30 23:06:44.811171: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3792910000 Hz


Epoch 1/25


2023-08-30 23:06:45.271109: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2023-08-30 23:06:45.487498: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.7


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 11/25
Epoch 12/25
Epoch 13/25

Epoch 00013: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 14/25
Epoch 15/25
Epoch 16/25

Epoch 00016: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 17/25
Epoch 18/25
Epoch 19/25

Epoch 00019: ReduceLROnPlateau reducing learning rate to 1e-06.
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f61c44a9d90>

### Проверка модели на тестовой выборке

In [13]:
cnn_model.evaluate(preprocessed_test_sequences, test_labels, return_dict=True)



{'loss': 0.7616634964942932, 'accuracy': 0.7616666555404663}

In [14]:
cnn_model.save('cnn_model', save_format='tf')

2023-08-30 23:07:01.337214: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: cnn_model/assets


In [17]:
preprocessed_test_sequences.shape

(600, 279)

In [31]:
preprocessed_test_sequences[2]

array([1251,    5,    1,   78, 1251,  110,    4,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [30]:
test_df["Description"].tolist()[2]

'charger hello please provide charger today thank'

In [32]:
test_labels[2]

2

In [28]:
with open('word_dict.json' , 'w') as file:
    json.dump(tokenizer.word_index, file)

In [34]:
test_df.to_csv('test_df.csv', index=False)