In [365]:
import warnings

warnings.filterwarnings('ignore')

import re
import string
import itertools

import numpy as np
import pandas as pd

from scipy.sparse import vstack

import matplotlib.pyplot as plt

from tqdm import tqdm

from langdetect import detect

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD

from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize

# Shared seed: passed as random_state / seed to the models, the embedding
# training and train_test_split further down.
rs = 100

# NLTK English stop-word set and WordNet lemmatizer used by preprocess().
nltk_sw = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Hand-picked extra stop words; preprocess() drops them when the
# 'manual_stopwords' option is enabled.
manual_sw = {
    'chat', 'transcript', 'alexandra', 'visitor', 'xsolla',
    'please', 'thank', 'hello', 'ok', 'hi',
}


In [2]:
# data = pd.read_csv('datasets/data3.csv').drop(columns='Unnamed: 0')
# data = data.sample(150)

In [3]:
%%time

def filter_text(x):
    """Return True when ``x`` is detected as English text.

    Only the first 1000 characters are handed to ``detect``.

    Parameters
    ----------
    x : str
        Text to check. Any non-string value (for example a NaN coming from a
        missing description) is reported as not English.

    Returns
    -------
    bool
        True if ``detect`` returns ``'en'``; False otherwise, including when
        detection itself raises.
    """
    if not isinstance(x, str):
        return False

    try:
        return detect(x[:1000]) == 'en'
    except Exception:
        # Detection failures are treated as "not English". Catching Exception
        # (not a bare except) lets KeyboardInterrupt / SystemExit propagate so
        # a long .apply() run can still be interrupted.
        return False

# df = data[data['description'].apply(filter_text)]
# df.shape

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 13.1 µs


In [271]:
# Matches runs that contain a digit: optional leading letters, one or more
# digits, then any further word characters. Raw string so that \d and \w are
# regex escapes rather than (invalid) string escapes.
match_word_with_digits = re.compile(r'([A-Za-z]*[\d]+[\w]*|[\d]+[A-Za-z]+[\w]*)')

# Characters that make a token count as "punctuation only".
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def preprocess(text, options=()):
    """Normalise a raw text and return it as a space-joined token string.

    The text is always lower-cased, characters outside the ASCII code range
    32-127 are replaced with spaces, and the result is tokenised with
    ``word_tokenize``. The remaining steps are opt-in.

    Parameters
    ----------
    text : str
        Raw input text.
    options : iterable of str
        Any combination of:
        'word_with_digits'  - replace digit-containing runs with a space;
        'nltk_stopwords'    - drop tokens found in ``nltk_sw``;
        'manual_stopwords'  - drop tokens found in ``manual_sw``;
        'lemmatization'     - lemmatize every token with ``lemmatizer``;
        'punctuation'       - drop tokens made up only of punctuation.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    text = text.lower()

    # Replace every character outside ASCII codes 32-127 with a space.
    text = ''.join(ch if 31 < ord(ch) < 128 else ' ' for ch in text)

    if 'word_with_digits' in options:
        text = match_word_with_digits.sub(r' ', text)

    # Tokenising and re-joining below also collapses repeated whitespace.
    tokens = word_tokenize(text.strip())

    sw = set()

    if 'nltk_stopwords' in options:
        sw = sw.union(nltk_sw)

    if 'manual_stopwords' in options:
        sw = sw.union(manual_sw)

    if sw:
        # Stop words are removed before lemmatization, so only surface forms
        # that appear in the stop-word sets are dropped.
        tokens = [t for t in tokens if t not in sw]

    if 'lemmatization' in options:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    if 'punctuation' in options:
        # Drop tokens consisting solely of punctuation characters. The former
        # substring test against string.punctuation let multi-character
        # tokens such as '...' through.
        tokens = [t for t in tokens if not all(ch in _PUNCTUATION_CHARS for ch in t)]

    return ' '.join(tokens)

In [9]:
def get_model(model_name):
    """Build an unfitted classifier for the given short name.

    Parameters
    ----------
    model_name : str
        'svm' -> LinearSVC, 'gb' -> GradientBoostingClassifier,
        'ann' -> MLPClassifier. All are seeded with the global ``rs``.

    Returns
    -------
    A new, unfitted scikit-learn estimator.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. Previously the
        function fell through and returned None, which only failed later at
        ``.fit``.
    """
    if model_name == 'svm':
        return LinearSVC(
            C=0.5,
            random_state=rs,
            max_iter=10000,
        )

    if model_name == 'gb':
        return GradientBoostingClassifier(
            random_state=rs
        )

    if model_name == 'ann':
        return MLPClassifier(
            # One hidden layer of 100 units; written as a real one-element
            # tuple ("(100)" is just the int 100).
            hidden_layer_sizes=(100,),
            validation_fraction=0.2,
            learning_rate_init=0.0001,
            early_stopping=True,
            random_state=rs,
            alpha=0.01
        )

    raise ValueError(
        f"unknown model_name {model_name!r}; expected 'svm', 'gb' or 'ann'"
    )

In [278]:
class Corpus:
    """Re-iterable view over texts that yields each one as a token list.

    Every call to ``iter()`` starts a fresh pass over ``sentences`` and splits
    each item on whitespace.
    """

    def __init__(self, sentences):
        self.sentences = sentences

    def __iter__(self):
        yield from (sentence.split() for sentence in self.sentences)
            
def avg_pool(text, model):
    """Average the embedding vectors of the known tokens in ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.
    model : object
        Either a trained model exposing its vectors as ``model.wv`` or the
        keyed-vectors object itself. It must support ``token in vectors``,
        indexing with a list of tokens, and a ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of all tokens found in the vocabulary, or a zero
        vector of length ``vector_size`` when none is found.
    """
    # Prefer the keyed vectors when present; fall back to the object itself.
    vectors = getattr(model, 'wv', model)

    tokens = [t for t in text.split() if t in vectors]

    if not tokens:
        # The dimensionality comes from the model itself, not from looking up
        # one hard-coded word that may be absent from the vocabulary.
        return np.zeros(vectors.vector_size)

    return vectors[tokens].sum(axis=0) / len(tokens)

def get_text_repr(text_repr_name, X):
    """Fit a text representation on ``X`` and return the transformed texts.

    Parameters
    ----------
    text_repr_name : str
        'word2vec' or 'fasttext': train an embedding model on ``X`` and
        average the token vectors of every text (dense array, one row per
        text). 'tfidf' or 'bow': fit a TfidfVectorizer / CountVectorizer on
        ``X`` and return its ``fit_transform`` result.
    X : iterable of str
        Pre-processed, whitespace-tokenisable texts. It is iterated more than
        once, so it must not be a one-shot generator.

    Returns
    -------
    numpy.ndarray or the vectorizer's ``fit_transform`` output.

    Raises
    ------
    ValueError
        If ``text_repr_name`` is not supported. Previously the function fell
        through and returned None.

    Notes
    -----
    The representation is fitted on everything in ``X``; callers that split
    into train and test afterwards evaluate on texts the representation has
    already seen.
    """
    if text_repr_name in ['word2vec', 'fasttext']:
        # NOTE(review): 'size' is the gensim 3.x keyword; gensim 4 renamed it
        # to 'vector_size' - adjust when upgrading.
        params = {
            'size': 50,
            'seed': rs,
            'sentences': Corpus(X),
            'min_count': 10,
            'negative': 10,
            'sg': 0
        }

        model_class = Word2Vec if text_repr_name == 'word2vec' else FastText
        model_obj = model_class(**params)

        return np.array([avg_pool(text, model_obj) for text in X])

    if text_repr_name in ['tfidf', 'bow']:
        params = {
            # Texts are already lower-cased by preprocess().
            'lowercase': False,
            'max_df': 0.95
        }

        model_class = CountVectorizer if text_repr_name == 'bow' else TfidfVectorizer
        model_obj = model_class(**params)

        return model_obj.fit_transform(X)

    raise ValueError(
        f"unknown text_repr_name {text_repr_name!r}; "
        "expected 'word2vec', 'fasttext', 'tfidf' or 'bow'"
    )

In [11]:
# Optional cleaning steps understood by preprocess().
all_preprocess_options = ['nltk_stopwords', 'manual_stopwords', 'lemmatization', 'punctuation']

# Names accepted by get_text_repr().
text_repr_options = ['word2vec', 'fasttext', 'tfidf', 'bow']

# Names accepted by get_model().
model_options = ['svm', 'gb', 'ann']

# Every non-empty subset of the cleaning steps, smallest subsets first.
preprocess_options = []
for subset_size in range(1, len(all_preprocess_options) + 1):
    preprocess_options.extend(itertools.combinations(all_preprocess_options, subset_size))

# Full grid, minus the pairs where tf-idf / bag-of-words would be combined
# with the 'ann' or 'gb' model (those representations are only run with 'svm').
training_options_list = [
    combo
    for combo in itertools.product(preprocess_options, text_repr_options, model_options)
    if combo[1] not in ['tfidf', 'bow'] or combo[2] not in ['ann', 'gb']
]
len(training_options_list)

120

In [12]:
# about 3 hours!

def run_experiment():
    """Train and score every configuration in ``training_options_list``.

    For each (preprocess options, text representation, model name) triple:
    the global ``df`` gets its ``cleaned_text`` column recomputed when the
    preprocessing changes, the texts are vectorised with ``get_text_repr``,
    split 80/20 with ``train_test_split`` and a model from ``get_model`` is
    fitted. Recorded per configuration: mean 5-fold cross-validation score on
    the training part, plus precision / accuracy / ROC AUC / recall / F1 on
    both the training and the test part.

    Side effects
    ------------
    Overwrites ``df['cleaned_text']`` and writes 'datasets/experiment.csv'.

    Returns
    -------
    pandas.DataFrame
        One row per configuration (the same frame that is written to disk).
        Columns follow insertion order; access them by name.

    Notes
    -----
    * ``roc_auc_score`` is computed from hard class predictions, not from
      decision scores.
    * The text representation is fitted on all rows before the split.
    """
    scores = [
        metrics.precision_score,
        metrics.accuracy_score,
        metrics.roc_auc_score,
        metrics.recall_score,
        metrics.f1_score
    ]

    # The target does not depend on the preprocessing, so compute it once.
    y = 1 - df['category_flag'].to_numpy()

    # Collect plain dicts and build the frame once at the end; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas >= 2.0.
    records = []

    last_preprocess = None
    last_text_repr = None

    for cur_preprocess, cur_text_repr, model_name in tqdm(training_options_list):
        preprocess_changed = cur_preprocess != last_preprocess

        if preprocess_changed:
            df['cleaned_text'] = df['description'].apply(lambda x: preprocess(x, cur_preprocess))

        # Re-vectorise whenever the cleaned text OR the representation
        # changed; checking the representation alone would reuse stale
        # vectors if two consecutive configurations shared it.
        if preprocess_changed or cur_text_repr != last_text_repr:
            X_vector = get_text_repr(cur_text_repr, df['cleaned_text'])

        X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, random_state=rs)

        clf = get_model(model_name)
        clf.fit(X_train, y_train)

        # cross_val_score refits clones of clf on the training folds.
        cv_accuracy = np.mean(cross_val_score(clf, X_train, y_train, cv=5))

        predicted_test = clf.predict(X_test)
        predicted_train = clf.predict(X_train)

        test_scores = {f'test_{s.__name__}': s(y_test, predicted_test) for s in scores}
        train_scores = {f'train_{s.__name__}': s(y_train, predicted_train) for s in scores}

        records.append({
            'prepoccess': cur_preprocess,
            'text_repr': cur_text_repr,
            'model': model_name,
            'cv_accuracy': cv_accuracy,
            **test_scores,
            **train_scores
        })

        last_preprocess = cur_preprocess
        last_text_repr = cur_text_repr

    df_experiment = pd.DataFrame(records)
    df_experiment.to_csv('datasets/experiment.csv')

    return df_experiment

100%|██████████| 120/120 [3:17:43<00:00, 98.86s/it]  


In [125]:
# NOTE(review): df_experiment is not assigned at notebook level in the visible
# cells - presumably it is loaded from 'datasets/experiment.csv'; confirm.

# Boolean mask selecting the linear-SVM runs on TF-IDF features.
filter_ = df_experiment['model'].eq('svm') & df_experiment['text_repr'].eq('tfidf')

# Alternative view over all runs:
# df_experiment.sort_values(by='cv_accuracy', ascending=False).head(50)

# Selected runs ordered by test accuracy, best first (at most 50 rows).
df_experiment.loc[filter_].sort_values(by='test_accuracy_score', ascending=False).head(50)

Unnamed: 0,cv_accuracy,model,prepoccess,test_accuracy_score,test_f1_score,test_precision_score,test_recall_score,test_roc_auc_score,text_repr,train_accuracy_score,train_f1_score,train_precision_score,train_recall_score,train_roc_auc_score
14,0.921569,svm,"(manual_stopwords,)",0.908031,0.760656,0.814035,0.713846,0.835933,tfidf,0.96606,0.910674,0.946167,0.877747,0.932743
70,0.921569,svm,"(manual_stopwords, punctuation)",0.908031,0.760656,0.814035,0.713846,0.835933,tfidf,0.96606,0.910674,0.946167,0.877747,0.932743
30,0.921647,svm,"(punctuation,)",0.907717,0.760033,0.812609,0.713846,0.835735,tfidf,0.96606,0.910674,0.946167,0.877747,0.932743
22,0.921254,svm,"(lemmatization,)",0.907402,0.759411,0.811189,0.713846,0.835537,tfidf,0.965667,0.909656,0.944899,0.876948,0.932197
78,0.921254,svm,"(lemmatization, punctuation)",0.907402,0.759411,0.811189,0.713846,0.835537,tfidf,0.965667,0.909656,0.944899,0.876948,0.932197
62,0.921175,svm,"(manual_stopwords, lemmatization)",0.907087,0.758395,0.810858,0.712308,0.834768,tfidf,0.965667,0.909619,0.945282,0.876548,0.932046
110,0.921175,svm,"(manual_stopwords, lemmatization, punctuation)",0.907087,0.758395,0.810858,0.712308,0.834768,tfidf,0.965667,0.909619,0.945282,0.876548,0.932046
86,0.920624,svm,"(nltk_stopwords, manual_stopwords, lemmatization)",0.906772,0.757774,0.809441,0.712308,0.83457,tfidf,0.96669,0.912513,0.945969,0.881342,0.934492
118,0.920624,svm,"(nltk_stopwords, manual_stopwords, lemmatizati...",0.906772,0.757774,0.809441,0.712308,0.83457,tfidf,0.96669,0.912513,0.945969,0.881342,0.934492
6,0.921805,svm,"(nltk_stopwords,)",0.906457,0.756757,0.809107,0.710769,0.8338,tfidf,0.966218,0.911309,0.944302,0.880543,0.933897


In [14]:
# Mean of the headline metrics for every (model, text representation) pair,
# averaged over all preprocessing variants.
summary_metrics = [
    'cv_accuracy',
    'test_f1_score',
    'test_roc_auc_score',
    'train_f1_score',
    'train_roc_auc_score',
]

df_experiment.groupby(['model', 'text_repr'])[summary_metrics].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,cv_accuracy,test_f1_score,test_roc_auc_score,train_f1_score,train_roc_auc_score
model,text_repr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ann,fasttext,0.889467,0.669805,0.781514,0.717429,0.807849
ann,word2vec,0.896443,0.68915,0.792081,0.723498,0.811164
gb,fasttext,0.886154,0.645138,0.760428,0.741667,0.815228
gb,word2vec,0.890606,0.669958,0.774481,0.755918,0.824406
svm,bow,0.898963,0.720603,0.821418,0.989191,0.990254
svm,fasttext,0.886301,0.652032,0.768731,0.685414,0.786992
svm,tfidf,0.921311,0.757985,0.834548,0.910972,0.933292
svm,word2vec,0.892049,0.673883,0.780783,0.703768,0.798527


In [409]:
# Load the dataset and discard the index column that was saved into the CSV.
df = pd.read_csv('datasets/data3_english.csv').drop('Unnamed: 0', axis=1)

In [410]:
%%time

# Cleaning steps used for the final model; NLTK stop words are left disabled.
options = (
    'word_with_digits',
    # 'nltk_stopwords',
    'manual_stopwords',
    'lemmatization',
    'punctuation',
)

df['cleaned_text'] = df['description'].apply(lambda raw_text: preprocess(raw_text, options))
print(df.shape)

# Drop the rows whose text is empty after cleaning; shape is printed before and after.
df = df.loc[df['cleaned_text'].ne('')]
print(df.shape)

# Target vector: the category flag inverted (1 - flag).
y = 1 - df['category_flag'].to_numpy()

(15874, 5)
(15798, 5)
CPU times: user 1min 46s, sys: 1.36 s, total: 1min 47s
Wall time: 1min 54s


In [411]:
%%time

# TF-IDF over the cleaned texts. The vectorizer is fitted on all rows and the
# train/test split happens afterwards.
vectorizer = TfidfVectorizer(lowercase=False, max_df=0.95)
X = vectorizer.fit_transform(df['cleaned_text'])

# Disabled experiments (SVD reduction, concatenating word2vec features):
# svd = TruncatedSVD(n_components=100, random_state=rs)
# X = svd.fit_transform(X)

# X_w2v = get_text_repr('word2vec', df['cleaned_text'])

# X = np.concatenate([X, X_w2v], axis=1)

# 80/20 split, seeded with rs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rs,
)

CPU times: user 3.78 s, sys: 165 ms, total: 3.94 s
Wall time: 5.03 s


In [412]:
# Linear SVM with C=0.01 and balanced class weights, seeded with rs.
clf = LinearSVC(C=0.01, class_weight='balanced', max_iter=10000, random_state=rs)

# Kept as the last expression so the fitted estimator is displayed.
clf.fit(X_train, y_train)

LinearSVC(C=0.01, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=100, tol=0.0001,
          verbose=0)

In [413]:
%%time

# calc statistic
predicted_test = clf.predict(X_test)
predicted_train = clf.predict(X_train)

# Test report first, then the training report for comparison.
for y_true, y_pred in ((y_test, predicted_test), (y_train, predicted_train)):
    print(metrics.classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92      2514
           1       0.64      0.89      0.74       646

    accuracy                           0.88      3160
   macro avg       0.80      0.88      0.83      3160
weighted avg       0.90      0.88      0.88      3160

              precision    recall  f1-score   support

           0       0.97      0.88      0.92     10161
           1       0.64      0.90      0.75      2477

    accuracy                           0.88     12638
   macro avg       0.81      0.89      0.84     12638
weighted avg       0.91      0.88      0.89     12638

CPU times: user 60.3 ms, sys: 3.61 ms, total: 63.9 ms
Wall time: 119 ms


In [414]:
# Train-vs-test check: draw a random subset of the training rows of roughly
# test-set size, stack it with the test rows and label each row by origin.
overfit_size_test = X_test.shape[0]
overfit_size_train = X_train.shape[0]

# Keeping each training row with probability |test| / |train| makes the
# expected subset size equal to the test-set size.
prob = overfit_size_test / overfit_size_train

# Seeded generator so the sample, and therefore the report below, is
# reproducible across runs (the global np.random state was unseeded).
random_sample = np.random.RandomState(rs).choice(
    [True, False],
    size=overfit_size_train,
    p=[prob, 1 - prob]
)

X_overfit = X_train[random_sample]

# From here on this holds the size of the sampled subset, not of X_train.
overfit_size_train = X_overfit.shape[0]

# Origin labels: 0 = sampled training row, 1 = test row.
X_overfit = vstack((X_overfit, X_test), format='csr')
y_overfit = np.concatenate([np.zeros(overfit_size_train), np.ones(overfit_size_test)])

# NOTE(review): clf was fitted on y_train, so its predictions are category
# labels, while y_overfit marks train/test origin - the report compares the
# two directly. Confirm this is the intended check.
predicted_overfit = clf.predict(X_overfit)
print(metrics.classification_report(y_overfit, predicted_overfit))
print(metrics.confusion_matrix(y_overfit, predicted_overfit))

              precision    recall  f1-score   support

         0.0       0.50      0.73      0.59      3112
         1.0       0.51      0.28      0.36      3160

    accuracy                           0.50      6272
   macro avg       0.51      0.51      0.48      6272
weighted avg       0.51      0.50      0.48      6272

[[2267  845]
 [2267  893]]


In [353]:
def get_key_by_value(d, v):
    """Return the first key of ``d`` (in iteration order) whose value equals ``v``.

    Raises ValueError when no value matches.
    """
    for key, value in d.items():
        if value is v or value == v:
            return key
    raise ValueError(f'{v!r} is not in list')

def print_n_top(vocab, values, n=10, ascending=False):
    """Print up to ``n`` vocabulary terms with their value rounded to 2 decimals.

    Parameters
    ----------
    vocab : dict
        Maps term -> position in ``values``.
    values : numpy.ndarray
        One value per vocabulary position (e.g. idf weights or coefficients).
    n : int
        Maximum number of rows to print; fewer are printed when ``values`` is
        shorter (previously this raised IndexError).
    ascending : bool
        NOTE: despite the name, ``ascending=True`` prints the LARGEST values
        first and the default ``False`` prints the smallest first, because
        the ascending argsort order is reversed when the flag is set. The
        behaviour is kept unchanged since existing calls rely on it.
    """
    indexes = values.argsort()

    if ascending:
        indexes = indexes[::-1]

    # Invert the vocabulary once instead of scanning it for every printed
    # row; setdefault keeps the first term if two terms share a position.
    term_by_index = {}
    for term, position in vocab.items():
        term_by_index.setdefault(position, term)

    for idx in indexes[:max(n, 0)]:
        print(term_by_index[idx], round(values[idx], 2))
    

In [354]:
print_n_top(vectorizer.vocabulary_, vectorizer.idf_, n=20)

payment 1.26
day 1.54
help 1.63
contact 1.65
issue 1.68
account 1.74
transaction 1.75
email 1.77
wait 1.83
provide 1.83
refund 1.83
take 1.84
number 1.88
game 1.93
check 1.95
may 1.96
information 1.98
patience 1.99
purchase 2.0
happy 2.0


In [373]:
print_n_top(vectorizer.vocabulary_, clf.coef_[0], n=20, ascending=True)

card 1.02
error 0.84
try 0.77
bank 0.64
paypal 0.61
page 0.6
code 0.57
method 0.53
pay 0.52
browser 0.51
option 0.49
unfortunately 0.46
amazon 0.45
clear 0.45
retry 0.44
trying 0.42
another 0.42
buy 0.4
pop 0.39
choose 0.39
