# Этика бизнеса. Классификация

Смотрим на данные.

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

train_data = pd.read_csv("train_bank.csv", index_col=[0])

In [14]:
train_data.shape

(19361, 4)

In [32]:
train_data

Unnamed: 0,sentence,1category,2category,sentiment
4754,При этом всегда получал качественные услуги.,Communication,,+
4417,"Не вижу, за что хотя бы 2 поставить, сервис на 1!",?,,−
3629,"Вот так ""Мой любимый"" банк МКБ меня обманул.",?,,−
11640,Отвратительное отношение к клиентам.,Communication,,−
5571,"Всегда в любое время дня и ночи помогут, ответ...",Communication,,+
...,...,...,...,...
8004,Никогда и ни в коем случае не открывайте счет ...,Communication,,−
18182,ТИ откровенно забили на качество и развивают с...,Quality,,−
744,"Я считаю, это прорыв и лидерство финансовых ус...",?,,+
6220,"Писал мужчина очень доходчиво, не финансовым я...",Communication,,+


Предобработка:

In [7]:
corpus = list(train_data.sentence)

In [8]:
corpus_lemmatized = []

In [9]:
# delete punctuation
import string
from pymorphy2 import MorphAnalyzer

def string_preparation(text):
    translating = str.maketrans('', '', string.punctuation)
    text_without_punctuation = text.translate(translating)
    # Lemmatize words in the text
    morph = MorphAnalyzer()
    lemmas = [morph.parse(word)[0].normal_form for word in text_without_punctuation.split()]
    lemmatized_text = ' '.join(lemmas)

    return lemmatized_text

In [13]:
for text in corpus:
    corpus_lemmatized.append(string_preparation(text))

In [24]:
len(corpus_lemmatized)

19361

In [21]:
# with open("corpus_lemmatized.txt", "w") as text_file:
#     for line in corpus_lemmatized:
#         text_file.write(f"{line}\n")

In [75]:
print(train_data.sentence[4754],'\n')
print(corpus_lemmatized[0])

При этом всегда получал качественные услуги. 

при это всегда получать качественный услуга


In [90]:
# with open("corpus_lemmatized.txt", "r") as text_file:
#     lines = text_file.readlines()
#     for line in lines:
#         print(line)
#         break

In [64]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_data.sentiment)

Обучение модели:

In [77]:
# БЕЗ ЛЕММАТИЗАЦИИ:
X = train_data.sentence
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42, \
                                                    stratify=y_encoded)
vectorizer = TfidfVectorizer(max_features = 1000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
clf = RandomForestClassifier().fit(X_train_vectorized, y_train)
y_prob = clf.predict_proba(X_test_vectorized)
roc_auc_score(y_test, y_prob, multi_class='ovr')

0.9289697513469962

In [79]:
X = train_data.sentence
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42, \
                                                   stratify=y_encoded)
vectorizer = CountVectorizer(max_features = 1000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test) 
clf = RandomForestClassifier().fit(X_train_vectorized, y_train)
y_prob = clf.predict_proba(X_test_vectorized)
roc_auc_score(y_test, y_prob, multi_class='ovr')

0.9293175038630492

In [80]:
# С ЛЕММАТИЗАЦИЕЙ
X = corpus_lemmatized
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42, \
                                                    stratify=y_encoded)
vectorizer = TfidfVectorizer(max_features = 1000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test) 
clf = RandomForestClassifier().fit(X_train_vectorized, y_train)
y_prob = clf.predict_proba(X_test_vectorized)
roc_auc_score(y_test, y_prob, multi_class='ovr')

0.9314072548486864

In [81]:
X = corpus_lemmatized
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42, \
                                                    stratify=y_encoded)
vectorizer = CountVectorizer(max_features = 1000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test) 
clf = RandomForestClassifier().fit(X_train_vectorized, y_train)
y_prob = clf.predict_proba(X_test_vectorized)
roc_auc_score(y_test, y_prob, multi_class='ovr')

0.9332171780918238

Применение на тестовых данных:

In [86]:
test_data = pd.read_csv("test_for_participants.csv", index_col=[0])
test_data

Unnamed: 0,sentence
0,"Очень неприятная ситуация, надеюсь, банк либо ..."
1,За что выражаю благодарность и банку и данному...
2,"Вывод: информация полученная в смс от банка, и..."
3,Хочу по благодарить ее за чуткое отношение к н...
4,"Показал, что я и вклад могу свой пополнять пря..."
...,...
2147,Верная (по их мнению) ставка 13%.
2148,Спасибо Промсвязьбанку за гибкий и человечески...
2149,"Это говорит о том, что обслуживание находится ..."
2150,Без платежки ничего не принимают!


In [87]:
X = corpus_lemmatized
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42, \
                                                    stratify=y_encoded)
vectorizer = CountVectorizer(max_features = 1000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test) 
best_model = RandomForestClassifier().fit(X_train_vectorized, y_train)
y_prob = best_model.predict_proba(X_test_vectorized)
roc_auc_score(y_test, y_prob, multi_class='ovr')

0.9298115313201313

In [None]:
# preprocess test data
corpus_test = list(test_data.sentence)
X_test_prep = []
for text in corpus_test:
    X_test_prep.append(string_preparation(text))
X_test_prep

In [89]:
with open("test_lemmatized.txt", "w") as text_file:
    for line in X_test_prep:
        text_file.write(f"{line}\n")

In [None]:
X_test_prep_vectorized = vectorizer.fit_transform(X_test_prep)
y_prob = best_model.predict_proba(X_test_prep_vectorized)
y_prob