To-do:
1. Текст был очищен только от одного мусорного элемента в качестве примера. Исслудйте данные через ноутбук или чере веб-интерфейс BigQuery на предмет других мусорных элементов в тексте, которые не несут в себе никакого особого смысла, а только создают шум в данных. Доработайте функцию очистки тектосвых данных, чтобы в нее можно было передать список ненужного мусора и разом выполнялась очистка
2. Проведите стратифицировнную кросс-валидуцию нейросетевого классификатора https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
3. Поэксперементируйте с гиперпараметрами нейросетевого классификатора, постарайтесь повысить качество его работы
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
4. Попробуйте использовать не Word2Vec для получения векторого представления текста, а TF-IDF преобразование http://zabaykin.ru/?p=558 http://nlpx.net/archives/57
5. Попробуйте использовать более тонко настриваемые алгоритмы нейросетей, например из этого видео https://www.youtube.com/watch?v=cPkH1k3U1c8 

In [1]:
import warnings
warnings.filterwarnings('ignore')

from google.oauth2 import service_account
import pandas_gbq 

import numpy as np
import pandas as pd
import math as mt
import datetime as dt

from langdetect import detect
import re

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report,confusion_matrix
from gensim.models.word2vec import Word2Vec
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve,auc
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [3]:
#funtion for getting fresh data from DWH for workload model
"""[summary]
Funtion for getting fresh data from BigQuery for workload scoring model
[description]
Credentials - google service account object with credentials data for project
[example]
Input: Credentials = credentials_object
Output: description	                                        channel	 category	category_flag
        \nChat transcript:\nVisitor: I want to buy wit...	chat	 ps	        1
        \nChat transcript:\nVisitor: hell i had a prob...	chat	 ps	        1
        \nChat transcript:\nVisitor: لا استطيع الشراء ...	 chat	  ps	     1
"""
def getDwhData(Credentials):
    statement_bigquery_sql = " ".join(["select description, channel, case",
                                       "when manual_category in ('payment_problem','how_to_pay','howtopay','how_to_play','paystation_error','ps_problem','ps_declined') then 'ps'",
                                       "else 'other'",
                                       "end as category,",
                                       "case",
                                       "when manual_category in ('payment_problem','how_to_pay','howtopay','how_to_play','paystation_error','ps_problem','ps_declined') then 0",
                                       "else 1",
                                       "end as category_flag",
                                       "from `xsolla_summer_school.customer_support`",
                                       "where manual_category is not null and",
                                       "manual_category <> '' and",
                                       "description is not null and",
                                       "description <> '' and",
                                       "channel is not null and",
                                       "channel <> '' and",
                                       "channel in ('chat','facebook')"])
    
    dataframe_bigquery = pandas_gbq.read_gbq(statement_bigquery_sql,project_id='findcsystem', credentials=Credentials, dialect='standard')

    return dataframe_bigquery


"""[summary]
Function for transform text to lower case
[description]
Corpus - list or array object, with text data
[example]
Input: Corpus = ["Text_1","Text_2"]
Output: ["text_1","text_2"]
"""
def lowerCase(Corpus):
    corpus = [i.lower().replace('\n','') for i in Corpus]
    return corpus


"""[summary]
Function for getting language of text
[description]
Corpus - list or array object, with text data
[example]
Input: Corpus = ["Text_1","Text_2"]
Output: ["en","ru"]
"""
def getTextLanguage(Corpus):
    txt_lang = []
    for txt in Corpus:
        try:
            lang = detect(txt)
            txt_lang.append(lang)
        except:
            lang = 'error'
            txt_lang.append(lang)
    
    return txt_lang


"""[summary]
Function for tokenization text
[description]
Corpus - list or array object, with text data
[example]
Input: Corpus = ["word1 word2","word3 word4"]
Output: [["word1","word2"],["word3","word4"]]
"""  
def textToTokens(Corpus):
    corpus = [i.split() for i in Corpus]
    return corpus 


"""[summary]
Function for clear text after garbage
[description]
Corpus - list or array object, with text data
Substr - string, regular expression
[example]
Input: Corpus = [["word1","word2"],["word3","word4"]]
       Substr = r'word1
Output: [["word2"],["word3","word4"]]
"""  
def clearTextAfterGarbage(Corpus, garbage_words):
    clear_corpus = []
    for text in Corpus:
        indexes = []
        text_len = len(text)
        try:
            for i in range(0,text_len):
                for word in garbage_words:
                    res = re.search(word,text[i])                
                    if res != None:
                        indexes.append(i)
                        break
                
            #delete garbage word from text
            for index in indexes:
                del text[index]
        
            clear_corpus.append(text)
        except:
            clear_corpus.append("error")
        
    return clear_corpus


"""[summary]
Build word vector by using pre-trained Word2Vec model
[description]
Size - lenght of vector
Word2Vec_Model - gensim object
"""  
def buildWordVector(Text,Size,Word2Vec_Model):
    vec = np.zeros(Size).reshape((1,Size))
    count = 0.

    for word in Text:
        try:
            vec += Word2Vec_Model[word].reshape((1,Size))
            count += 1.
        except KeyError:
            continue
    
    if count != 0:
        vec /= count
    
    return vec

RAWDATA 

In [4]:
#getting data from dwh
SupportRawDataframe = getDwhData()
SupportRawDataframe.shape

Downloading: 100%|███████████████████████████████████████████████████████████| 23450/23450 [00:09<00:00, 2363.52rows/s]


(23450, 4)

In [5]:
SupportRawDataframe.head(10)

Unnamed: 0,description,channel,category,category_flag
0,\nChat transcript:\nVisitor: I want to buy wit...,chat,ps,0
1,\nChat transcript:\nVisitor: hell i had a prob...,chat,ps,0
2,\nChat transcript:\nVisitor: لا استطيع الشراء ...,chat,ps,0
3,\nChat transcript:\nVisitor: im having trouble...,chat,ps,0
4,\nChat transcript:\nVisitor: Hi\nAna: Hello. H...,chat,ps,0
5,\nChat transcript:\nVisitor: Здраствуйте\nAna:...,chat,ps,0
6,\nChat transcript:\nVisitor: hello\nVisitor: w...,chat,ps,0
7,\nChat transcript:\nVisitor: not letting me bu...,chat,ps,0
8,\nChat transcript:\nVisitor: مرحبا\nAna: Hello...,chat,ps,0
9,\nChat transcript:\nVisitor: hi im trying to b...,chat,ps,0


In [6]:
SupportRawDataframe.tail(10)

Unnamed: 0,description,channel,category,category_flag
23440,"\nChat transcript:\nVisitor: Hello, I bought p...",chat,other,1
23441,\nChat transcript:\nVisitor: Good day\n\nI acc...,facebook,other,1
23442,\nChat transcript:\nVisitor: payment not going...,chat,other,1
23443,\nChat transcript:\nVisitor: Paid for faceit s...,chat,other,1
23444,"\nChat transcript:\nVisitor: Hello, I am tryin...",chat,other,1
23445,"\nChat transcript:\nVisitor: Hi, i made a pruc...",chat,other,1
23446,"\nChat transcript:\nVisitor: Hi, how long will...",chat,other,1
23447,\nChat transcript:\nVisitor: I bought playerun...,chat,other,1
23448,\nChat transcript:\nVisitor: Good day i took t...,chat,other,1
23449,\nChat transcript:\nVisitor: hi\nVisitor: hell...,chat,other,1


DATA PREPROC

In [7]:
#transform text to lower case
corpus = SupportRawDataframe.description
corpus.astype('str')

corpus = lowerCase(corpus)

#getting language for text corpus
corpus_lang = getTextLanguage(corpus)

In [8]:
#new dataframe with texts in lower case, without /n symbol and with lang for text
SupportRawDataframe['description'] = corpus
SupportRawDataframe['lang'] = corpus_lang

In [9]:
SupportRawDataframe.head(10)

Unnamed: 0,description,channel,category,category_flag,lang
0,chat transcript:visitor: i want to buy with pa...,chat,ps,0,en
1,chat transcript:visitor: hell i had a problem ...,chat,ps,0,en
2,chat transcript:visitor: لا استطيع الشراء ومعل...,chat,ps,0,en
3,chat transcript:visitor: im having trouble wit...,chat,ps,0,en
4,chat transcript:visitor: hiana: hello. how can...,chat,ps,0,en
5,chat transcript:visitor: здраствуйтеana: здрав...,chat,ps,0,ru
6,chat transcript:visitor: hellovisitor: why i c...,chat,ps,0,en
7,chat transcript:visitor: not letting me buy an...,chat,ps,0,en
8,chat transcript:visitor: مرحباana: hello!visit...,chat,ps,0,en
9,chat transcript:visitor: hi im trying to buy v...,chat,ps,0,en


In [10]:
SupportRawDataframe.tail(10)

Unnamed: 0,description,channel,category,category_flag,lang
23440,"chat transcript:visitor: hello, i bought pubg ...",chat,other,1,en
23441,chat transcript:visitor: good dayi accidentall...,facebook,other,1,en
23442,chat transcript:visitor: payment not going thr...,chat,other,1,en
23443,chat transcript:visitor: paid for faceit subsc...,chat,other,1,en
23444,"chat transcript:visitor: hello, i am trying to...",chat,other,1,en
23445,"chat transcript:visitor: hi, i made a pruchase...",chat,other,1,en
23446,"chat transcript:visitor: hi, how long will it ...",chat,other,1,en
23447,chat transcript:visitor: i bought playerunknow...,chat,other,1,en
23448,chat transcript:visitor: good day i took the w...,chat,other,1,en
23449,chat transcript:visitor: hivisitor: hellovisit...,chat,other,1,en


In [11]:
#getting only en texts
SupportDataframe_eng = SupportRawDataframe[SupportRawDataframe.lang == 'en'][:]
SupportDataframe_eng

Unnamed: 0,description,channel,category,category_flag,lang
0,chat transcript:visitor: i want to buy with pa...,chat,ps,0,en
1,chat transcript:visitor: hell i had a problem ...,chat,ps,0,en
2,chat transcript:visitor: لا استطيع الشراء ومعل...,chat,ps,0,en
3,chat transcript:visitor: im having trouble wit...,chat,ps,0,en
4,chat transcript:visitor: hiana: hello. how can...,chat,ps,0,en
...,...,...,...,...,...
23445,"chat transcript:visitor: hi, i made a pruchase...",chat,other,1,en
23446,"chat transcript:visitor: hi, how long will it ...",chat,other,1,en
23447,chat transcript:visitor: i bought playerunknow...,chat,other,1,en
23448,chat transcript:visitor: good day i took the w...,chat,other,1,en


In [12]:
#text tekenization
tokenization = textToTokens(SupportDataframe_eng.description)
SupportDataframe_eng['description'] = tokenization

In [13]:
SupportDataframe_eng

Unnamed: 0,description,channel,category,category_flag,lang
0,"[chat, transcript:visitor:, i, want, to, buy, ...",chat,ps,0,en
1,"[chat, transcript:visitor:, hell, i, had, a, p...",chat,ps,0,en
2,"[chat, transcript:visitor:, لا, استطيع, الشراء...",chat,ps,0,en
3,"[chat, transcript:visitor:, im, having, troubl...",chat,ps,0,en
4,"[chat, transcript:visitor:, hiana:, hello., ho...",chat,ps,0,en
...,...,...,...,...,...
23445,"[chat, transcript:visitor:, hi,, i, made, a, p...",chat,other,1,en
23446,"[chat, transcript:visitor:, hi,, how, long, wi...",chat,other,1,en
23447,"[chat, transcript:visitor:, i, bought, playeru...",chat,other,1,en
23448,"[chat, transcript:visitor:, good, day, i, took...",chat,other,1,en


In [14]:
texts = list(SupportDataframe_eng.description)

In [15]:
#cleaning text after garbage
tests_clear = clearTextAfterGarbage(texts,['chat', 'transcript', 'http'])
tests_clear = clearTextAfterGarbage(texts,['chat', 'transcript', 'http'])

In [16]:
SupportDataframe_eng['description'] = tests_clear

In [17]:
SupportDataframe_eng

Unnamed: 0,description,channel,category,category_flag,lang
0,error,chat,ps,0,en
1,"[i, had, a, problem, yesterday, about, subscri...",chat,ps,0,en
2,"[استطيع, الشراء, ومعلومات, الشراء, صحيحة, ميه,...",chat,ps,0,en
3,"[having, trouble, with, the, new, xsolla, auth...",chat,ps,0,en
4,"[hello., how, can, i, help, you?visitor:, i, c...",chat,ps,0,en
...,...,...,...,...,...
23445,"[i, made, a, pruchase, of, $25, dollars, and, ...",chat,other,1,en
23446,"[how, long, will, it, take, my, friend, to, re...",chat,other,1,en
23447,"[bought, playerunknowns, battlegrounds, and, w...",chat,other,1,en
23448,"[day, i, took, the, wrong, package, i, wanted,...",chat,other,1,en


TRAIN TEST SPLIT DATAFRAMES

In [18]:
#list of unique categories
unique_categories = np.unique(SupportDataframe_eng.category)
unique_categories

array(['other', 'ps'], dtype=object)

In [19]:
descriptions = SupportDataframe_eng['description']
categories = SupportDataframe_eng['category_flag']
XTrain,XTest,YTrain,YTest = train_test_split(descriptions,
                                             categories,
                                             stratify = categories,
                                             test_size = 0.2,
                                             random_state = 40)

FROM TEXTS TO VECTORS

In [20]:
#initialize Word2Vec model for embedding words to vectors
NDim = 100
Imdb_w2v = Word2Vec(size = NDim,min_count = 10)
Imdb_w2v.build_vocab(XTrain)

In [21]:
Imdb_w2v.train(XTrain,total_examples = Imdb_w2v.corpus_count,epochs = Imdb_w2v.epochs)

(7075848, 10653800)

In [22]:
#embedding training messages to vectors for neutral classifier
TrainVecs = np.concatenate([buildWordVector(i,NDim,Imdb_w2v) for i in XTrain])

In [23]:
TrainVecs

array([[ 0.26713965, -0.02430039,  1.14577978, ..., -0.12226291,
        -0.02320078, -0.53031534],
       [ 0.86498847, -0.39941216,  1.51082796, ..., -0.49444451,
         0.20066925, -0.59581464],
       [-0.00892473,  0.47682804, -0.15390298, ..., -0.09104868,
        -0.03702818, -0.21842433],
       ...,
       [ 0.38941982, -0.50690582,  1.76651444, ..., -0.54050808,
         0.15787388, -0.34732198],
       [ 0.31112572, -0.41205394,  1.55583785, ..., -0.63075522,
        -0.39332119, -0.09802152],
       [ 0.27811198, -0.09237064,  0.96797952, ..., -0.52603677,
         0.07000475, -0.38455505]])

In [24]:
Imdb_w2v.train(XTest, total_examples = Imdb_w2v.corpus_count, epochs = Imdb_w2v.epochs)

(1791416, 2710360)

In [25]:
TestVecs = np.concatenate([buildWordVector(i,NDim,Imdb_w2v) for i in XTest])

In [26]:
TestVecs

array([[ 0.14708932, -0.41461549,  0.58614922, ..., -0.33346945,
         0.11808695,  0.03208903],
       [ 0.25415888, -0.31821898,  0.97694322, ...,  0.06805607,
        -0.00274074, -0.12002646],
       [ 0.10393134, -0.29185569,  0.77880547, ..., -0.39595893,
        -0.05174484,  0.01350011],
       ...,
       [ 0.22804779, -0.39228853,  0.84869796, ..., -0.3254749 ,
        -0.20782709,  0.14957677],
       [-0.0084122 , -0.47455555,  0.79889392, ..., -0.30080259,
        -0.02279157,  0.07051875],
       [ 0.0780004 , -0.34014294,  0.92080211, ..., -0.2328411 ,
         0.27581512,  0.01908303]])

CROSSVALIDATION AND BUILD CLASSIFIER

In [27]:
TextClassifier = MLPClassifier(hidden_layer_sizes = (20,10), max_iter = 1000, random_state = 40)

In [28]:
skf = StratifiedKFold(n_splits=5)
Scores = cross_val_score(TextClassifier, TrainVecs, YTrain, cv = skf)
print(Scores)
print(np.mean(Scores))

[0.87636505 0.87519501 0.87519501 0.88884555 0.87709715]
0.8785395551600448


In [29]:
TextClassifier.fit(TrainVecs,YTrain)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(20, 10), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=1000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=40, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [30]:
pred = TextClassifier.predict(TestVecs)
print(confusion_matrix(YTest,pred))
print(classification_report(YTest,pred))

[[ 463  173]
 [ 560 2009]]
              precision    recall  f1-score   support

           0       0.45      0.73      0.56       636
           1       0.92      0.78      0.85      2569

    accuracy                           0.77      3205
   macro avg       0.69      0.76      0.70      3205
weighted avg       0.83      0.77      0.79      3205



LOOKING AT WRONG CLASSIFIED EXAMPLES

In [31]:
CROSSVALIDATION AND BUILD CLASSIFIERx = pd.DataFrame({'real': YTest, 'pred': pred, 'is_correct': (YTest == pred)})
false_preds_indexes = x[x['is_correct'] == False].reset_index()['index']
false_preds_indexes

0      14198
1      14026
2       4760
3      10414
4      15068
       ...  
728     5937
729     9201
730    10060
731     7666
732    20514
Name: index, Length: 733, dtype: int64

In [32]:
false_classified = SupportDataframe_eng.loc[false_preds_indexes]
false_classified['description']

14198    [결제를, 했고, 카드사로부터, 문자도, 받았는데, 반영이, 안, 되어, 있네요.,...
14026    [just, bought, a, battle, ground, and, i, don'...
4760     [was, unable, to, purchase, escape, from, tark...
10414    [i, had, a, problem, with, my, resubscription,...
15068                                                error
                               ...                        
5937     [i, purchased, the, dauntless, slayer, pack, f...
9201     [there,, i, contacted, you, guys, last, week, ...
10060    [please, confrim, the, paymentapril:, hiapril:...
7666     [want, to, by, 2500gp, i, put, in, all, the, d...
20514    [not, sure, where, to, find, the, product, i, ...
Name: description, Length: 733, dtype: object

FROM TEXTS TO TF-IDF

In [54]:
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)  
TF_IDF_train = tfidf.fit_transform(XTrain)
TF_IDF_test = tfidf.transform(XTest)

BUILD NEW CLASSIFIER WITH TF-IDF

In [63]:
TextClassifier2 = MLPClassifier(hidden_layer_sizes = (20,10), max_iter = 10, random_state = 40)

In [64]:
TextClassifier2.fit(TF_IDF_train,YTrain)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(20, 10), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=10,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=40, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [65]:
pred = TextClassifier2.predict(TF_IDF_test)
print(confusion_matrix(YTest,pred))
print(classification_report(YTest,pred))

[[ 413  223]
 [  85 2484]]
              precision    recall  f1-score   support

           0       0.83      0.65      0.73       636
           1       0.92      0.97      0.94      2569

    accuracy                           0.90      3205
   macro avg       0.87      0.81      0.84      3205
weighted avg       0.90      0.90      0.90      3205

