In [1]:
import subprocess
import re
import numpy as np
import pandas as pd
import tensorflow as tf
pd.set_option('display.max_colwidth', 100)
from datetime import datetime, timedelta
from tqdm import tqdm

import os
import string
os.environ["JAVA_HOME"] = "/usr/local/jdk-11"

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split


from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

import spyt
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import col, lit, broadcast
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import udf, pandas_udf, PandasUDFType
from clan_tools.data_adapters.YTAdapter import YTAdapter 
import pyspark.sql.dataframe as spd
import warnings
warnings.filterwarnings('ignore')

2021-09-28 17:00:41.839123: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-28 17:00:41.839173: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Settings handed to the SPYT connection; kept in one dict so they are easy to tune.
spark_conf = {
    "spark.executor.memory": "6G",
    "spark.executor.cores": 2,
    "spark.sql.session.timeZone": "UTC",
    "spark.dynamicAllocation.maxExecutors": 6,
    "spark.dynamicAllocation.enabled": True,
    "spark.sql.autoBroadcastJoinThreshold": -1,
    "spark.cores.min": 16,
    "spark.driver.memory": "4G",
    "spark.executor.instances": 6,
    "spark.jars": 'yt:///home/sashbel/graphframes-assembly-0.8.2-SNAPSHOT-spark3.0.jar',
}
spark = spyt.connect(spark_conf_args=spark_conf)

# Enable Arrow-based columnar data transfer for PySpark.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

21/09/28 17:00:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/09/28 17:01:07 WARN Utils: Service 'sparkDriver' could not bind on port 27001. Attempting port 27002.
21/09/28 17:01:07 WARN Utils: Service 'sparkDriver' could not bind on port 27002. Attempting port 27003.
21/09/28 17:01:07 WARN Utils: Service 'sparkDriver' could not bind on port 27003. Attempting port 27004.
21/09/28 17:01:07 WARN Utils: Service 'SparkUI' could not bind on port 27001. Attempting port 27002.
21/09/28 17:01:07 WARN Utils: Service 'SparkUI' could not bind on port 27002. Attempting port 27003.
21/09/28 17:01:07 WARN Utils: Service 'SparkUI' could not bind on port 27003. Attempting port 27004.
21/09/28 17:01:07 WARN Utils: Service 'SparkUI' could not bind on port 27004. Attempting port 27005.
21/09/28

In [3]:
import pymorphy2
import re

# Shared morphological analyzer; clean_text uses it to lemmatize every token.
ma = pymorphy2.MorphAnalyzer()

# Maximum number of tokens kept per text; also used as the padded sequence length
# and as the Embedding input length further down.
maxlen = 100


def num_digits(s):
    """Count the characters of ``s`` for which ``isdigit()`` is true."""
    digit_count = 0
    for ch in s:
        if ch.isdigit():
            digit_count += 1
    return digit_count


def clean_text(text, words_count=maxlen):
    """Normalize raw ticket text into a space-separated string of lemmas.

    Processing steps:
      1. backslashes become spaces and the text is lower-cased;
      2. CRLF line breaks (optionally preceded by a hyphen + whitespace) become spaces;
      3. punctuation, brackets, quotes, hyphens and whitespace runs become spaces;
      4. tokens are dropped when they are shorter than 3 characters, fully numeric,
         or contain more than 2 digits;
      5. the first ``words_count`` surviving tokens are lemmatized with the
         module-level pymorphy2 analyzer ``ma``.

    Args:
        text: raw text. ``None`` (e.g. a NULL Spark column value) is treated as
            empty text instead of raising ``AttributeError``.
        words_count: maximum number of tokens kept; defaults to ``maxlen``.

    Returns:
        str: the normalized tokens joined by single spaces ('' when nothing is left).
    """
    if text is None:
        return ''

    text = text.replace("\\", " ").lower()

    # Raw string literals: the original non-raw literals relied on invalid escape
    # sequences (``\-``, ``\s``), which newer Python versions warn about.
    text = re.sub(r'\-\s\r\n\s{1,}|\-\s\r\n|\r\n', ' ', text)

    # One character class replaces the former chain of single-character
    # alternatives; it matches exactly the same characters but avoids the
    # ambiguous ``[[]`` nested-set syntax.
    text = re.sub(r"""[.,:;_%©?*!@#$^&(){}+=«»<>'\[\]/"-]|\s{2,}""", ' ', text)

    # Single filtering pass instead of three split/join round trips.
    tokens = [
        word for word in text.split()
        if len(word) > 2 and not word.isnumeric() and num_digits(word) <= 2
    ]

    # Truncate before lemmatizing so long texts do not pay for parsing tokens that
    # would be discarded anyway.
    # NOTE(review): assumes normal_form is always one non-empty, whitespace-free
    # token, so truncating before or after lemmatization gives the same result.
    return ' '.join(ma.parse(word)[0].normal_form for word in tokens[:words_count])
    

# Spark UDF wrapper around clean_text; produces a string column.
clean_text_udf = F.udf(clean_text, T.StringType())

2021-09-28 17:01:45,406 - INFO - pymorphy2.opencorpora_dict.wrapper - Loading dictionaries from /home/albina-volk/miniconda3/envs/py37/lib/python3.7/site-packages/pymorphy2_dicts_ru/data
2021-09-28 17:01:45,469 - INFO - pymorphy2.opencorpora_dict.wrapper - format: 2.4, revision: 417127, updated: 2020-10-11T15:05:51.070345


In [12]:
# Ticket creation-date window, written day-month-year (parsed with "%d-%m-%Y").
# Both bounds are applied as strict comparisons when the tickets are filtered.
START_DATE, END_DATE = "01-09-2019", "01-09-2021"

# Directory that receives the fitted tokenizer, the component white list
# and the trained model.
PATH_TO_RESULTING_MODEL_DATA = '../src/support_tickets_classification/data/model'

## Data collection

In [5]:
# YT table locations used as inputs.
startrek_queue_root = "//home/startrek/tables/prod/yandex-team/queue/CLOUDSUPPORT"

support_issues_path = startrek_queue_root + "/issues"
tickets_prod_path = "//home/cloud/billing/exported-support-tables/tickets_prod"
components_path = startrek_queue_root + "/components"
components_white_list_path = (
    "//home/cloud_analytics/ml/support_tickets_classification/components_white_list"
)

In [6]:
# Issues exploded to one row per (issue key, component id) pair.
issues = (
    spark.read
    .schema_hint({'components': T.ArrayType(T.StringType())})
    .yt(support_issues_path)
    .select(
        F.col('key'),
        F.explode(F.col('components')).alias('components'),
    )
)

# Ticket texts together with their author and creation time.
tickets_prod = (
    spark.read
    .yt(tickets_prod_path)
    .select(
        F.col('description'),
        F.col('summary'),
        F.col('st_key'),
        F.col('iam_user_id'),
        F.col('created_at'),
    )
)

# Component dictionary: id plus readable name and short id.
components = (
    spark.read
    .yt(components_path)
    .select(
        F.col('id'),
        F.col('name').alias('component_name'),
        F.col('shortId').alias('component_short_id'),
    )
)

# Inner joins: ticket -> issue (by key) -> component (by id), one row per pair.
ticket_matches_issue = tickets_prod['st_key'] == issues['key']
issue_matches_component = issues['components'] == components['id']
tickets_flat = (
    tickets_prod
    .join(issues, on=ticket_matches_issue)
    .join(components, on=issue_matches_component)
)

# Collapse the flat (ticket, component) rows back to one row per ticket, gathering
# the distinct component names into an array column.
tickets_with_components = (
    tickets_flat
    .groupBy('key', 'created_at')
    .agg(
        F.first('iam_user_id').alias('iam_user_id'),
        F.first('summary').alias('summary'),
        F.first('description').alias('description'),
        F.collect_set('component_name').alias('component_names')
    )
    # concat_ws skips NULL inputs, whereas concat returns NULL as soon as either
    # summary or description is NULL, which threw away the remaining text and
    # passed NULL to the cleaning UDF. For rows where both fields are present the
    # result is the same "<summary>. <description>" string as before.
    .withColumn('sum_description', F.concat_ws('. ', col('summary'), col('description')))
)

In [7]:
# Boundaries of the creation-date window (both exclusive).
window_start = datetime.strptime(START_DATE, "%d-%m-%Y")
window_end = datetime.strptime(END_DATE, "%d-%m-%Y")

output_columns = [
    'key', 'iam_user_id', 'creation_date', 'summary', 'description',
    'clean_text', 'clean_summary', 'component_names',
]

# Drop tickets without a timestamp, build the cleaned text columns, keep only the
# tickets created inside the window, newest first.
cleaned_tickets = (
    tickets_with_components
    .filter(col('created_at').isNotNull())
    .withColumn('clean_text', clean_text_udf(col('sum_description').cast('string')))
    .withColumn('clean_summary', clean_text_udf(col('summary').cast('string')))
    .withColumn('creation_date', F.from_unixtime(col("created_at").cast(T.LongType())))
    .filter((col('creation_date') < window_end) & (col('creation_date') > window_start))
    .select(*output_columns)
    .orderBy('creation_date', ascending=False)
    .cache()
)

## Preprocessing

In [8]:
# Materialize the cleaned tickets on the driver as a pandas DataFrame
# and report how many rows were collected.
data = cleaned_tickets.toPandas()
print(data.shape[0])

21/09/28 17:03:05 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


63380


In [9]:
# Make sure every text is a real string. Missing values are replaced with '' first:
# a plain str() call would turn them into the literal word "None"/"nan", which the
# tokenizer would then learn as a vocabulary entry.
data['clean_text'] = data['clean_text'].fillna('').astype(str)

In [10]:
# Fit the word-index tokenizer (num_words capped at 5000) on the cleaned texts,
# then convert each text to an integer sequence padded to `maxlen`.
texts = data['clean_text'].values

tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=maxlen)

In [13]:
import pickle

# Persist the fitted tokenizer next to the model.
# The output directory is created first: on a fresh checkout it may not exist yet,
# and open(..., 'wb') would fail with FileNotFoundError.
os.makedirs(PATH_TO_RESULTING_MODEL_DATA, exist_ok=True)
with open(PATH_TO_RESULTING_MODEL_DATA + '/tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Load the allowed component names from YT, keep a CSV copy next to the model,
# and expose the names as a plain Python list.
components_white_list_df = (
    spark.read
    .yt(components_white_list_path)
    .toPandas()[['component_names']]
)
components_white_list_df.to_csv(PATH_TO_RESULTING_MODEL_DATA + '/components_white_list.csv')
components_white_list = list(components_white_list_df['component_names'])

In [15]:
# Component name -> class index, following the order of the white list.
# NOTE: this rebinds `components`, which earlier in the notebook referred to the
# Spark DataFrame of the same name.
components = {name: index for index, name in enumerate(components_white_list)}

# Build the white-list set once instead of once per ticket, and iterate the column
# directly instead of positional .iloc lookups.
white_list_set = set(components_white_list)
cleaned_components = [
    list(set(names) & white_list_set)
    for names in data['component_names']
]

In [16]:
def encode_components(component_list, mapping=None):
    """Translate component names into their integer class indices.

    Names that are missing from the mapping are skipped.

    Args:
        component_list: iterable of component names.
        mapping: optional name -> integer index dict. When omitted, the
            notebook-level ``components`` dict is used, as before.

    Returns:
        numpy.ndarray: integer array with the indices of the known components,
        in input order. The dtype is always integer; the previous NaN-sentinel
        implementation produced a float array whenever an unknown name was
        present or the input was empty.
    """
    if mapping is None:
        mapping = components
    return np.array(
        [mapping[name] for name in component_list if name in mapping],
        dtype=int,
    )

In [17]:
# One array of class indices per ticket.
data['labels'] = [encode_components(names) for names in cleaned_components]

In [18]:
mlb = MultiLabelBinarizer()

# A sentinel row holding every class index is appended so that the binarizer sees
# all classes during fitting; the matching output row is cut off again afterwards.
temp = [*data['labels'], list(components.values())]
y = mlb.fit_transform(temp)[:-1]

## Training

In [19]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, GlobalMaxPool1D, Dense, LSTM
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam

In [20]:
# Multi-label classifier: embedding -> global max pooling -> one sigmoid per class.
model = Sequential([
    # The vocabulary size comes from the tokenizer instead of a second hard-coded
    # 5000, so the two can no longer drift apart when num_words is changed.
    Embedding(tokenizer.num_words, 64, input_length=maxlen),
    GlobalMaxPool1D(),
    # One output unit per column of the binarized label matrix.
    Dense(y.shape[1], activation='sigmoid'),
])
model.compile(
    optimizer=Adam(0.015),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC()],
)

2021-09-28 17:08:53.685186: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-09-28 17:08:53.687528: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-09-28 17:08:53.687551: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-09-28 17:08:53.687588: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jupyter-cloud-albina-volk.sas.yp-c.yandex.net): /proc/driver/nvidia/version does not exist
2021-09-28 17:08:53.690935: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set


In [21]:
# Train on the full data set; the returned History object is kept for inspection.
history = model.fit(x=X, y=y, batch_size=128, epochs=10)

2021-09-28 17:08:55.809820: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 25352000 exceeds 10% of free system memory.
2021-09-28 17:08:55.835000: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 43605440 exceeds 10% of free system memory.
2021-09-28 17:08:55.947639: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-09-28 17:08:55.955965: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2599995000 Hz


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# Store the trained model under the shared output directory.
model.save(f"{PATH_TO_RESULTING_MODEL_DATA}/model")

2021-09-28 17:09:48.990197: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../src/support_tickets_classification/data/model/model/assets


2021-09-28 17:09:49,596 - INFO - tensorflow - Assets written to: ../src/support_tickets_classification/data/model/model/assets
