In [82]:
import subprocess
import re
import numpy as np
import pandas as pd
import tensorflow as tf
pd.set_option('display.max_colwidth', 100)
from datetime import datetime, timedelta
from tqdm import tqdm

import os
import string
os.environ["JAVA_HOME"] = "/usr/local/jdk-11"
# os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "0"

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D
from keras.optimizers import adam_v2


import spyt
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import col, lit, broadcast
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import udf, pandas_udf, PandasUDFType
from clan_tools.data_adapters.YTAdapter import YTAdapter 
import pyspark.sql.dataframe as spd
import warnings
warnings.filterwarnings('ignore')

In [83]:
# Open the SPYT Spark session that every Spark cell below uses.
# NOTE(review): dynamic allocation is enabled while `spark.executor.instances`
# is also pinned to 6 -- confirm which of the two settings is meant to win.
spark = spyt.connect(spark_conf_args ={
      "spark.executor.memory": "6G",
      "spark.executor.cores": 2,
      "spark.sql.session.timeZone": "UTC",
      "spark.dynamicAllocation.maxExecutors": 6,
      "spark.dynamicAllocation.enabled":True,
      # -1 switches off automatic broadcast joins.
      "spark.sql.autoBroadcastJoinThreshold":-1,
      "spark.cores.min":16,
      "spark.driver.memory": "4G",
      "spark.executor.instances":6,
      # NOTE(review): nothing in the visible notebook uses GraphFrames, and the
      # jar lives in a personal home directory -- confirm it is still needed.
      "spark.jars":'yt:///home/sashbel/graphframes-assembly-0.8.2-SNAPSHOT-spark3.0.jar',
})
# spyt.info(spark)
# Enable Arrow-based columnar data transfer between Spark and pandas.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

2021-09-08 15:37:58,427 - INFO - spyt.client - SPYT Cluster version: 3.0.1-1.13.2+yandex
2021-09-08 15:37:58,429 - INFO - spyt.client - SPYT library version: 1.16.0
2021-09-08 15:37:58,484 - INFO - spyt.client - SHS link: http://sas6-1099-node-hahn.sas.yp-c.yandex.net:27001/history/app-20210908150642-0008/jobs/


In [84]:
# f = open("stop_words.txt", "r")
# stop_words = f.read().split("\n")

In [150]:
import pymorphy2
import re
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Shared morphological analyzer; clean_text() uses it to replace every word
# with its normal form (ma.parse(word)[0].normal_form).
ma = pymorphy2.MorphAnalyzer()


def num_digits(s):
    """Count the characters of ``s`` for which ``str.isdigit`` is true."""
    digit_count = 0
    for ch in s:
        if ch.isdigit():
            digit_count += 1
    return digit_count

def light_clean_text(text, words_count=100):
    """Lower-case ``text`` and blank out punctuation, keeping every word.

    Steps, in order:
      1. backslashes become spaces and the text is lower-cased;
      2. CRLF line breaks (optionally preceded by "-<space>", i.e. a word
         hyphenated across lines) become a single space;
      3. each punctuation character becomes a space and every run of two or
         more whitespace characters collapses to one space.

    Args:
        text: input string; ``None`` is treated as empty text.
        words_count: unused; accepted so the signature matches ``clean_text``.

    Returns:
        The cleaned string ('' for ``None``). No trimming or word filtering
        is done, so doubled spaces can remain where punctuation sat next to
        a space.
    """
    if text is None:
        return ''
    text = text.replace("\\", " ").lower()
    # Raw strings keep the regex escapes (\s, \r, \n) out of Python's own
    # string-escape processing, which otherwise warns on "\-" and "\s".
    text = re.sub(r'-\s\r\n\s+|-\s\r\n|\r\n', ' ', text)
    # One character class for all punctuation; brackets are escaped so the
    # pattern does not trigger the "possible nested set" FutureWarning.
    text = re.sub(r"""[.,:;_%©?*!@#$^&(){}+=«»<>'\[\]/"-]|\s{2,}""", ' ', text)
    return text

def clean_text(text, words_count=100):
    """Normalize a ticket text into at most ``words_count`` lemmatized words.

    The text first goes through ``light_clean_text`` (lower-casing and
    punctuation removal). Words are then dropped when they are two
    characters or shorter, purely numeric, or contain more than two digits.
    The surviving words are truncated to ``words_count`` and replaced by the
    normal form reported by the module-level pymorphy2 analyzer ``ma``.
    (Stop-word removal was prototyped here but is currently disabled.)

    Args:
        text: input string; ``None`` is treated as empty text (Spark passes
            ``None`` for null cells).
        words_count: maximum number of words kept, applied as a slice.

    Returns:
        The kept words joined by single spaces ('' when nothing survives).
    """
    if text is None:
        return ''
    kept_words = [
        word
        for word in light_clean_text(text).split()
        if len(word) > 2 and not word.isnumeric() and num_digits(word) <= 2
    ]
    # Truncate before lemmatizing so the morphological analyzer only runs on
    # the words that are actually returned.
    kept_words = kept_words[:words_count]
    return ' '.join(ma.parse(word)[0].normal_form for word in kept_words)
    

# Spark UDF wrapper around clean_text(); produces a string column.
clean_text_udf = F.udf(clean_text, returnType=T.StringType())



def seq_subarray(phrase, keyword):
    """Tell whether the words of ``keyword`` appear consecutively in ``phrase``.

    Both arguments are split on whitespace and compared word by word, so
    "b c" is found in "a b c d" but not in "ab c".

    Returns:
        True when the keyword's word sequence is a contiguous run inside the
        phrase's words; False otherwise, including when either argument is
        ``None`` or contains no words.
    """
    if phrase is None or keyword is None:
        return False
    haystack = phrase.split()
    needle = keyword.split()
    if not haystack or not needle:
        return False
    width = len(needle)
    # Slide a window of the keyword's length over the phrase.
    return any(
        haystack[start:start + width] == needle
        for start in range(len(haystack) - width + 1)
    )

# Spark UDF wrapper around seq_subarray(); produces a boolean column.
seq_subarray_udf = F.udf(seq_subarray, returnType=T.BooleanType())

2021-09-08 15:51:19,539 - INFO - pymorphy2.opencorpora_dict.wrapper - Loading dictionaries from /usr/local/lib/python3.7/dist-packages/pymorphy2_dicts_ru/data
2021-09-08 15:51:19,595 - INFO - pymorphy2.opencorpora_dict.wrapper - format: 2.4, revision: 417127, updated: 2020-10-11T15:05:51.070345


In [86]:
 def Accuracy(y_true, y_pred):
     """Sample-averaged Jaccard score for multi-label indicator matrices.

     For every row the score is |true AND pred| / |true OR pred|; the row
     scores are summed and divided by the number of rows.

     Rows where both the true and the predicted label sets are empty have
     no defined ratio (0/0). They contribute 0 to the sum but still count
     in the denominator, matching how the sibling metrics skip undefined
     rows.

     Args:
         y_true: (n_samples, n_labels) array; non-zero means "label set".
         y_pred: array of the same shape, 0/1 or boolean.

     Returns:
         The averaged score as a float.
     """
     truth = np.asarray(y_true).astype(bool)
     pred = np.asarray(y_pred).astype(bool)
     intersection = np.logical_and(truth, pred).sum(axis=1)
     union = np.logical_or(truth, pred).sum(axis=1)
     scored = union > 0  # rows with a well-defined ratio
     return float((intersection[scored] / union[scored]).sum()) / truth.shape[0]
    
def Hamming_Loss(y_true, y_pred):
    """Fraction of (sample, label) cells where prediction and truth differ.

    Args:
        y_true: (n_samples, n_labels) indicator array.
        y_pred: array of the same shape, 0/1 or boolean.

    Returns:
        mismatching cells / (n_samples * n_labels), as a float.
    """
    truth = np.asarray(y_true)
    pred = np.asarray(y_pred)
    # One vectorized comparison replaces the per-row size/count_nonzero loop.
    mismatches = int(np.count_nonzero(truth != pred))
    return mismatches / (truth.shape[0] * truth.shape[1])

def Precision(y_true, y_pred):
    """Sample-averaged |true AND pred| / |true|.

    Rows with no true labels are skipped in the sum but still count in the
    denominator (the number of rows).

    NOTE(review): dividing by the number of *true* labels is what the
    standard example-based definition (e.g. scikit-learn's
    ``recall_score(average='samples')``) calls recall. The computation is
    deliberately left as it was: the threshold search in this notebook
    raises the threshold until this value falls below 0.7, which only works
    with the current definition. Renaming needs a coordinated change there.

    Args:
        y_true: (n_samples, n_labels) array; non-zero means "label set".
        y_pred: array of the same shape, 0/1 or boolean.

    Returns:
        The averaged ratio as a float.
    """
    truth = np.asarray(y_true).astype(bool)
    pred = np.asarray(y_pred).astype(bool)
    hits = np.logical_and(truth, pred).sum(axis=1)
    n_true = truth.sum(axis=1)
    scored = n_true > 0  # rows with at least one true label
    return float((hits[scored] / n_true[scored]).sum()) / truth.shape[0]

def Recall(y_true, y_pred):
    """Sample-averaged |true AND pred| / |pred|.

    Rows with no predicted labels are skipped in the sum but still count in
    the denominator (the number of rows).

    NOTE(review): dividing by the number of *predicted* labels is what the
    standard example-based definition (e.g. scikit-learn's
    ``precision_score(average='samples')``) calls precision. The computation
    is deliberately left as it was so reported numbers stay comparable with
    earlier runs; renaming needs a coordinated change with ``Precision`` and
    its callers.

    Args:
        y_true: (n_samples, n_labels) array; non-zero means "label set".
        y_pred: array of the same shape, 0/1 or boolean.

    Returns:
        The averaged ratio as a float.
    """
    truth = np.asarray(y_true).astype(bool)
    pred = np.asarray(y_pred).astype(bool)
    hits = np.logical_and(truth, pred).sum(axis=1)
    n_pred = pred.sum(axis=1)
    scored = n_pred > 0  # rows with at least one predicted label
    return float((hits[scored] / n_pred[scored]).sum()) / truth.shape[0]

def F1Measure(y_true, y_pred):
    """Sample-averaged F1: 2 * |true AND pred| / (|true| + |pred|).

    Rows where both the true and the predicted label sets are empty are
    skipped in the sum but still count in the denominator (the number of
    rows).

    Args:
        y_true: (n_samples, n_labels) array; non-zero means "label set".
        y_pred: array of the same shape, 0/1 or boolean.

    Returns:
        The averaged score as a float.
    """
    truth = np.asarray(y_true).astype(bool)
    pred = np.asarray(y_pred).astype(bool)
    hits = np.logical_and(truth, pred).sum(axis=1)
    set_sizes = truth.sum(axis=1) + pred.sum(axis=1)
    scored = set_sizes > 0  # rows where at least one side has a label
    return float((2 * hits[scored] / set_sizes[scored]).sum()) / truth.shape[0]

In [87]:
def show_metrics(y_true, y_pred):
    """Print the multi-label evaluation report, one "<name>: <value>" per line.

    The exact-match ratio is computed inline; the other values come from the
    notebook's metric helpers. Each value is computed right before its line
    is printed.
    """
    report = (
        ("Exact Match Ratio", lambda: np.all(y_pred == y_true, axis=1).mean()),
        ("Accuracy", lambda: Accuracy(y_true, y_pred)),
        ("Hamming_Loss", lambda: Hamming_Loss(y_true, y_pred)),
        ("Precision", lambda: Precision(y_true, y_pred)),
        ("Recall", lambda: Recall(y_true, y_pred)),
        ("F1Measure", lambda: F1Measure(y_true, y_pred)),
    )
    for label, compute in report:
        print(f"{label}: {compute()}")

## Data collection

In [88]:
# YT table paths used by this notebook.
# Issues of the CLOUDSUPPORT queue; read below for `key` and `components`.
support_issues_path="//home/startrek/tables/prod/yandex-team/queue/CLOUDSUPPORT/issues"
# Exported support tickets; read below for summary, description, author and creation time.
tickets_prod_path="//home/cloud/billing/exported-support-tables/tickets_prod"
# Component dictionary; read below for `id`, `name` and `shortId`.
components_path="//home/startrek/tables/prod/yandex-team/queue/CLOUDSUPPORT/components"
# Target table for the cleaned tickets; only referenced by a commented-out write.
result_path="//home/cloud_analytics/ml/support_tickets_classification_v2/clean_tickets2"
# Tag/keyword table; only referenced by commented-out code in this notebook.
key_words_from_anton = "//home/cloud_analytics/ml/support_tickets_classification/tag_keywords"

In [89]:
# One row per (issue key, component id): `components` is read as an array of
# strings and exploded.
issues = (
    spark.read
    .schema_hint({'components': T.ArrayType(T.StringType())})
    .yt(support_issues_path)
    .select('key', F.explode('components').alias('components'))
)

# Ticket texts and metadata; `st_key` is the join key against issues.key.
tickets_prod = (
    spark.read
    .yt(tickets_prod_path)
    .select('description', 'summary', 'st_key', 'iam_user_id', 'created_at')
)

# Component dictionary: id -> human-readable name and short id.
components = (
    spark.read.yt(components_path)
    .select('id',
            col('name').alias('component_name'),
            col('shortId').alias('component_short_id'))
)

# One row per (ticket, component). No join type is passed, so Spark's default
# inner join applies: tickets without a matching issue or component drop out.
tickets_flat = (
    tickets_prod
    .join(issues, on=tickets_prod.st_key == issues.key)
    .join(components, on=issues.components == components.id)
)

# Collapse back to one row per (key, created_at) with the set of component names.
# NOTE(review): F.first() picks an arbitrary row of the group; that is only safe
# if every row of a group carries the same summary/description -- confirm.
# NOTE(review): F.concat returns null when any input is null, so a ticket with a
# missing summary or description gets a null `sum_description` -- confirm intended.
tickets_with_components = (
    tickets_flat
    .groupBy('key', 'created_at')
    .agg(
        F.first('iam_user_id').alias('iam_user_id'),
        F.first('summary').alias('summary'),
        F.first('description').alias('description'),
        F.collect_set('component_name').alias('component_names')
    )
    .withColumn('sum_description', F.concat(col('summary'), lit('. '), col('description')))
)

In [90]:
# Clean the ticket texts and keep tickets created strictly between
# 2020-01-01 and 2021-07-01, newest first. The result is cached because it is
# collected to pandas later.
cleaned_tickets = (
    tickets_with_components
    .filter(~F.isnull('created_at'))
    .withColumn('clean_text', clean_text_udf(col('sum_description').cast('string')))
    .withColumn('clean_summary', clean_text_udf(col('summary').cast('string')))
    # NOTE(review): from_unixtime expects seconds and yields a formatted string;
    # the two filters below then compare that string with datetime literals and
    # rely on Spark's implicit cast -- confirm the units and the comparison.
    .withColumn('creation_date', F.from_unixtime(col("created_at").cast(T.LongType())))
    .filter(col('creation_date') < datetime.strptime("01-07-2021", "%d-%m-%Y"))
    .filter(col('creation_date') > datetime.strptime("01-01-2020", "%d-%m-%Y"))
    .select('key', 'iam_user_id', 'creation_date', 'summary', 'description', 'clean_text', 'clean_summary', 'component_names')
    .orderBy('creation_date', ascending=False)
    .cache()
#     .limit(20000)
)

In [91]:
# cleaned_tickets.write.yt(result_path, mode='overwrite')

## Preprocessing

In [282]:
def is_english(text):
    """Return True when at least half of the characters are ASCII letters.

    Every character counts towards the total, including spaces, digits and
    punctuation. Empty or ``None`` text has no characters to judge and is
    reported as not English instead of dividing by zero.
    """
    if not text:
        return False
    ascii_letter_count = sum(ch in string.ascii_letters for ch in text)
    return ascii_letter_count / len(text) >= 0.5

In [283]:
# Collect the cached Spark result into a local pandas DataFrame on the driver.
data = cleaned_tickets.toPandas()

In [284]:
# Force every clean_text value to str so the tokenizer only sees strings
# (a missing value would become the literal text 'None').
data.clean_text = data.clean_text.apply(str)
# Raw, uncleaned text; used later for the human-readable prediction table.
# NOTE(review): presumably NaN wherever summary or description is missing -- confirm.
data['text'] = data['summary'] + ' ' + data['description']
# data = data[~data['text'].apply(is_english)]

In [285]:
# Token sequence length: passed to pad_sequences and to the Embedding layer's input_length.
maxlen=100

In [286]:
# Turn the cleaned texts into fixed-length integer sequences.
# num_words=5000 has to stay in sync with the Embedding(5000, ...) layer of the model.
# NOTE(review): the tokenizer is fitted on all rows before the train/val/test
# split, so validation and test texts influence the vocabulary.
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(data['clean_text'])
sequences = tokenizer.texts_to_sequences(data['clean_text'])
X = pad_sequences(sequences, maxlen=maxlen)

In [287]:
# Earlier variant that loaded the label list from the tag table instead:
# components = spark.read.yt(key_words_from_anton).select('tag').cache().toPandas().values
# components = dict(zip(components.ravel(), range(len(components))))

# Give every component name seen in the data a dense integer id, numbered in
# first-seen order. This rebinds `components`, which earlier held the Spark
# component table.
components = {}
new_comp_names = []
for ticket_components in data.component_names:
    for component_name in ticket_components:
        # len(components) is evaluated before insertion, i.e. it is the next free id.
        components.setdefault(component_name, len(components))
count = len(components)

In [288]:
def encode_components(component_list):
    """Translate component names into their integer ids.

    Ids come from the module-level ``components`` mapping; names missing
    from the mapping are skipped.

    Args:
        component_list: iterable of component names.

    Returns:
        1-D integer numpy array of ids in input order. The dtype is always
        integer, also for empty input or when unknown names were skipped.
    """
    return np.array(
        [components[name] for name in component_list if name in components],
        dtype=int,
    )

In [289]:
# Per-ticket array of component ids.
data['labels'] = data['component_names'].apply(encode_components)

In [290]:
# Build the multi-hot label matrix `y` (one row per ticket, one column per id).
mlb = MultiLabelBinarizer()
temp = list(data['labels'])
# Extra row holding every known id, so the binarizer creates a column for each
# component; it is sliced off again right after fitting.
temp.append(list(components.values()))
y = mlb.fit_transform(temp)[:-1, :]

In [291]:
# Two-stage split with a fixed seed: 10% of all rows become the validation set,
# then 10% of the remainder becomes the test set (about 81% / 9% / 10%
# train / test / val). The data_* Series hold the raw text of the same rows as
# the matching x_* / y_* arrays.
data_train, data_val, x_train, x_val, y_train, y_val = train_test_split(data['text'], X, y, test_size=0.1, random_state=42)
data_train, data_test, x_train, x_test, y_train, y_test = train_test_split(data_train, x_train, y_train, test_size=0.1, random_state=42)

## Training

In [292]:
# Embedding -> global max pooling -> one sigmoid output per component
# (multi-label). The vocabulary size 5000 matches the Tokenizer's num_words.
model = Sequential([
    Embedding(5000, 64, input_length=maxlen),
    GlobalMaxPool1D(),
    Dense(y.shape[1], activation='sigmoid'),
])
model.compile(
    optimizer=adam_v2.Adam(0.015),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC()],
)

In [293]:
# Train for a fixed 10 epochs, reporting validation loss/AUC after each epoch.
# NOTE(review): no random seed is set for TensorFlow/NumPy in the visible
# notebook, so the metrics below will differ between runs -- confirm acceptable.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [294]:
# Raw per-label sigmoid scores for the validation set. `y_pred` is rebound to a
# boolean matrix once the threshold has been chosen.
y_pred = model.predict(x_val)

In [295]:
# Scan thresholds 0.00, 0.05, ... upward and stop at the first one where
# Precision() on the validation set falls below 0.7. The loop variable
# `threshold` keeps that value and is reused by the cells below.
# NOTE(review): the value kept is the first threshold that *misses* the 0.7
# target, not the last one that meets it -- confirm this is intended.
# NOTE(review): if the loop never breaks, `threshold` is simply the last grid value.
# NOTE(review): Precision() in this notebook divides by the number of true
# labels (recall under the standard naming), which is why it decreases as the
# threshold grows.
for threshold in tqdm(np.arange(0, 1, 0.05)):
    if Precision(y_val, y_pred > threshold) < 0.7:
        break
threshold

 30%|███       | 6/20 [00:13<00:31,  2.22s/it]


0.30000000000000004

In [296]:
# Score the validation set once and derive the boolean predictions from the
# same array instead of running model.predict twice.
scores = model.predict(x_val)
y_pred = scores > threshold

In [297]:
# The threshold was selected on this same validation set, so these numbers are
# not an independent estimate.
print("Validation metrics")
show_metrics(y_val, y_pred)

Validation metrics
Exact Match Ratio: 0.3691004437273094
Accuracy: 0.5621823315853172
Hamming_Loss: 0.01111584500818108
Precision: 0.6944029850746268
Recall: 0.6304692752453956
F1Measure: 0.6295224328806339


In [298]:
# Held-out test set, scored with the threshold chosen on the validation set.
print("Test metrics")
show_metrics(y_test, model.predict(x_test) > threshold)

Test metrics
Exact Match Ratio: 0.3742716270730614
Accuracy: 0.5677386822052891
Hamming_Loss: 0.01099924959331987
Precision: 0.7114186463469303
Recall: 0.6270618556701034
F1Measure: 0.635776197755126


In [299]:
# Component names in dict insertion order, i.e. ordered by their integer id.
# This relies on column j of y_pred / scores belonging to the component with id j.
comps = np.array(list(components))
# Per validation row: names of the predicted components and their scores.
best_preds = [comps[row_mask] for row_mask in y_pred]
best_scores = [row_scores[row_mask] for row_scores, row_mask in zip(scores, y_pred)]

In [300]:
# Human-readable comparison table for the validation rows.
df = pd.DataFrame()
# Assigned first so the frame takes over data_val's index.
df['text'] = data_val.apply(light_clean_text)
# Plain lists are assigned by position; they are in x_val order, which is the
# row order of data_val because both come from the same split call.
df['best_prediction'] = best_preds
df['prediction_score'] = best_scores 
# True component names, looked up by the original row index.
df['true'] = data['component_names'][data_val.index]

In [301]:
# Display the comparison table.
df

Unnamed: 0,text,best_prediction,prediction_score,true
25604,переключение мастера между нодами redis кластер c9q9osgoht2tec0t980d\nвчера было два переключени...,[mdb.redis],[0.97871053],[mdb.redis]
29664,привязка карты ребзи добрадня не можем подвязать карту почему то что может быть,[billing.cards],[0.8275183],[billing.cards]
42233,не проходят запросы на https api telegram org здравствуйте вводная на митапе в октябре узн...,[],[],[vpc]
147,аккаунт заблокировал пришло сообщение что аккаунт заблокировали что это значит почему заблоки...,"[billing.trial, billing.antifraud]","[0.42928618, 0.62323695]",[billing]
20338,object storage сервис добрый день в хранилище object storage висят незавершённые загрузки кото...,[s3],[0.9976779],[s3]
...,...,...,...,...
3077,raise quota for b1gakusj0mfb4288jgdo compute cloud \n number of gpus compute instancegpus...,[квоты],[0.99986076],"[квоты, compute]"
1107,error 2 unknown error cluster id c9qg3aj2mg59vot5u0ht\nпри удалении топика возникла ошибка кл...,[mdb.kafka],[0.8692708],[mdb.kafka]
29618,обновление security groups зависло доброго времени суток \n2 sg повисли в статусе updating enp...,[],[],[vpc.security groups]
17561,поднять квоты для b1gbok6vibrvvm5fmpdu managed databases \n количество vcpu mdb cpu count...,"[квоты, mdb]","[0.9999039, 0.42406026]",[квоты]


In [295]:
# spark.createDataFrame(df).coalesce(1).write.mode('overwrite').yt('//home/cloud_analytics/ml/support_tickets_classification_v2/test', mode='overwrite')

In [303]:
# Validation tickets for which no component score exceeded the threshold.
df[df.best_prediction.apply(len) == 0]

Unnamed: 0,text,best_prediction,prediction_score,true
42233,не проходят запросы на https api telegram org здравствуйте вводная на митапе в октябре узн...,[],[],[vpc]
32962,не работает облако здравствуйте у облака b1gejqn23i5lpmv0uv3m стоит активный но оно не досту...,[],[],[iam]
34321,объем моего облока какой объем моего обложка,[],[],"[квоты, compute]"
10999,dataproc 2 0 ошибка при подключении к tez ui при открытии tez ui для hive сессии выдается сооб...,[],[],[dataproc]
30479,выпуск фикса для тераформ провайдера 0 44 добрый день просьба проинформировать о выходе фикса д...,[],[],[s3]
...,...,...,...,...
13747,эккаунт оплачен но не доступен здравствуйте мы не можем понять в одном месте эккаунта платежн...,[],[],[billing]
29517,не соответствие гранта и баланса здравствуйте я обучаюсь в яндекс практикуме и мне выдали грант...,[],[],[billing.trial]
16529,логи и графики в data transfer добрый день ид трансфера dttbfb90u85frvdrue2p заметил еще вче...,[],[],[data-transfer]
35145,кластер бд перешел в режим с дефолтными настройками sql mode просьба пояснить проверить по лога...,[],[],[Линия 2]


In [307]:
# Worst mismatches: predicted and true component sets differ in three or more
# labels (size of the symmetric difference).
df[df.apply(lambda row: len(set(row['best_prediction']) ^ set(row['true'])), axis=1) >= 3]

Unnamed: 0,text,best_prediction,prediction_score,true
147,аккаунт заблокировал пришло сообщение что аккаунт заблокировали что это значит почему заблоки...,"[billing.trial, billing.antifraud]","[0.42928618, 0.62323695]",[billing]
5865,проблемы с репликацией sql сервера добрый день коллеги \n18 05 2021 примерно в 13 00 возникла п...,[mdb.postgresql],[0.30953494],"[compute.nbs, compute]"
34957,физический хостинг сервера добрый день \nу вас есть услуга физического размещения сервера на ваш...,"[business, compute.vm]","[0.43538013, 0.38163733]",[other]
2432,практикум прошу разблокировать аккаунт для прохождения практикума по yandex cloud,"[billing.trial, billing.antifraud]","[0.6039027, 0.46206155]","[квоты, вернули]"
20916,ssh при попытке соединения по ssh часто встречаются такие случаи \n connection to 178 154 232 2...,"[compute.vm, compute.nbs]","[0.43364912, 0.44330454]",[compute.ssh]
...,...,...,...,...
47134,переход на платную версию добры вечер отвязал карту от платежного аккаунта и пробный период зав...,"[billing, billing.trial]","[0.47432628, 0.50881934]",[billing.antifraud]
16887,квота с gpu нужна квота для виртуальной машины с gpu у меня пробный период это возможно,"[billing.trial, квоты, billing.antifraud, compute.GPU]","[0.3485219, 0.8004202, 0.32868415, 0.8434789]","[квоты, compute.GPU, вернули]"
46742,статус вм заблокирован добрый день\nустановлен статус заблокирован\nуточните с чем это связано ...,"[billing, billing.trial, billing.antifraud]","[0.45238936, 0.45213693, 0.31806862]","[docs, billing.trial]"
16423,usergate добрый день я установил usergate из маркетплейса как к нему подключится по ip 8001 ut...,"[compute.windows, compute.vm]","[0.36117688, 0.4529421]",[marketplace]


21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_158 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_65 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_110 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_123 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_51 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_55 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_19 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_174 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_69 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_181 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No m

21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_190 !
21/09/08 16:38:08 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_41_194 !
