In [1]:
import warnings

warnings.filterwarnings('ignore')

import re
import string

import numpy as np
import pandas as pd
import pickle

from scipy.sparse import vstack

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize

rs = 100

nltk_sw = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

manual_sw = set([
    'chat',
    'transcript',
    'alexandra',
    'visitor',
    'xsolla',
    'please',
    'thank',
    'hello',
    'ok',
    'hi'
])

In [2]:
match_word_with_digits = re.compile('([A-Za-z]*[\d]+[\w]*|[\d]+[A-Za-z]+[\w]*)')


def preprocess(text, options=()):
    text = text.lower()

    # remove chars has not 31-128 index in ascii table
    text = ''.join([i if 31 < ord(i) < 128 else ' ' for i in text])

    if 'word_with_digits' in options:
        text = match_word_with_digits.sub(r' ', text)

    # remove double spaces and apply lower transformation
    tokens = word_tokenize(text.strip())

    sw = set()

    if 'nltk_stopwords' in options:
        sw = sw.union(nltk_sw)

    if 'manual_stopwords' in options:
        sw = sw.union(manual_sw)

    if len(sw) > 0:
        tokens = [t for t in tokens if not t in sw]

    if 'lemmatization' in options:
        # apply lemmatizer
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    if 'punctuation' in options:
        # remove punctuation
        tokens = [t for t in tokens if t not in string.punctuation]

    return ' '.join(tokens)


In [4]:
df = pd.read_csv('datasets/3categories_50.csv')

In [5]:
df.dropna(inplace=True)
df.shape


(49999, 3)

In [6]:
df.groupby('category')['channel'].count()


category
afs      35172
other     8443
ps        6384
Name: channel, dtype: int64

In [8]:
%%time


def set_target(x):
    if x == 'afs':
        return 0

    if x == 'other':
        return 1

    return 2

options = (
    'word_with_digits',
    #'nltk_stopwords',
    'manual_stopwords',
    'lemmatization',
    'punctuation'
)

df['cleaned_text'] = df['message'].apply(lambda x: preprocess(x, options))
print(df.shape)
df = df[df['cleaned_text'] != '']
print(df.shape)

y = df['category'].apply(set_target)


(49999, 4)
(49281, 4)
CPU times: user 4min 45s, sys: 7.59 s, total: 4min 52s
Wall time: 5min 31s


In [13]:
%%time

vectorizer = TfidfVectorizer(
    max_features=3000,
    lowercase=False,
    max_df=0.95,
)

X = vectorizer.fit_transform(df['cleaned_text'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)


CPU times: user 7.92 s, sys: 333 ms, total: 8.25 s
Wall time: 8.79 s


In [14]:
clf = LinearSVC(
    C=0.01,
    random_state=rs, 
    max_iter=10000,
    class_weight='balanced',
)

clf.fit(X_train, y_train)

LinearSVC(C=0.01, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=100, tol=0.0001,
          verbose=0)

In [15]:
%%time

# calc statistic
predicted_test = clf.predict(X_test)
predicted_train = clf.predict(X_train)

print('Test', metrics.classification_report(y_test, predicted_test))
print('Train', metrics.classification_report(y_train, predicted_train))

Test               precision    recall  f1-score   support

           0       0.89      0.89      0.89      7006
           1       0.54      0.46      0.50      1630
           2       0.64      0.78      0.70      1221

    accuracy                           0.80      9857
   macro avg       0.69      0.71      0.70      9857
weighted avg       0.80      0.80      0.80      9857

Train               precision    recall  f1-score   support

           0       0.89      0.90      0.90     27972
           1       0.56      0.47      0.52      6423
           2       0.66      0.79      0.72      5029

    accuracy                           0.81     39424
   macro avg       0.71      0.72      0.71     39424
weighted avg       0.81      0.81      0.81     39424

CPU times: user 116 ms, sys: 7.66 ms, total: 123 ms
Wall time: 128 ms


In [16]:
pickle.dump(vectorizer, open("models/multiclf_tfidf.pickle", "wb"))

In [17]:
pickle.dump(clf, open('models/multiclf_model.pickle', 'wb'))

In [18]:
vectorizer_saved = pickle.load(open("./models/multiclf_tfidf.pickle", "rb"))
model_saved = pickle.load(open("./models/multiclf_model.pickle", "rb"))

In [24]:
test_text = ['hello i want to payment but it is blocked']

sample_vec = vectorizer_saved.transform(test_text).toarray()

In [25]:
model_saved.predict(sample_vec)

array([2])