In [15]:
!pip install pymorphy2



In [16]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import pymorphy2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [17]:
pd.set_option('display.max_colwidth', 100)

###Предобработка

In [18]:
def remove_punctuation(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

def tokenise(text):
    tokens = re.split('\W+', text)
    return tokens

def remove_stopwords(tokenised_list):
    stopwords = nltk.corpus.stopwords.words('russian')
    filtered_text = [word for word in tokenised_list if word not in stopwords]
    return filtered_text

def stemming(tokenised_text):
    ps = nltk.SnowballStemmer('russian')
    processed_text = [ps.stem(word) for word in tokenised_text]
    return processed_text

def lemmatizing(tokenized_text):
    ma = pymorphy2.MorphAnalyzer()
    processed_text = [ma.parse(word)[0].normal_form for word in tokenized_text]
    return processed_text

In [19]:
def clean_text(text):
    text = remove_punctuation(text)
    tokens = tokenise(text)
    processed_text = lemmatizing(tokens)
    return processed_text

###Загрузка данных

In [20]:
data = pd.read_csv("requests_gpt.csv", encoding='windows-1251', sep = ';')
data.columns = ['label', 'requests']

###TF-IDF для векторизации

In [21]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)

X_tfidf = tfidf_vect.fit_transform(data['requests'])
X_features = pd.DataFrame(X_tfidf.toarray())

X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size = 0.3)

###GradientBoostingClassifier

Проверка на лучшее сочетание параметров метода GradientBoosting

In [22]:
def test_GB (n_est, depth):
  gb = GradientBoostingClassifier(n_estimators=n_est, max_depth=depth)
  gb_model = gb.fit(X_train, y_train)
  y_pred = gb_model.predict(X_test)

  precision, recall, fscore, support = score(y_test, y_pred)
  print('Est: {} / Depth: {} ----- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth,
        np.round(precision, 3), np.round(recall,3), round((y_pred==y_test).sum() / len(y_pred), 3)))

In [23]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        test_GB(n_est, depth)

Est: 10 / Depth: 10 ----- Precision: [0.625 0.75  1.   ] / Recall: [1.    0.6   0.714] / Accuracy: 0.765
Est: 10 / Depth: 20 ----- Precision: [0.714 0.8   1.   ] / Recall: [1.    0.8   0.714] / Accuracy: 0.824
Est: 10 / Depth: 30 ----- Precision: [0.714 0.8   1.   ] / Recall: [1.    0.8   0.714] / Accuracy: 0.824
Est: 10 / Depth: None ----- Precision: [0.714 0.8   1.   ] / Recall: [1.    0.8   0.714] / Accuracy: 0.824
Est: 50 / Depth: 10 ----- Precision: [0.714 0.8   1.   ] / Recall: [1.    0.8   0.714] / Accuracy: 0.824
Est: 50 / Depth: 20 ----- Precision: [0.714 0.8   1.   ] / Recall: [1.    0.8   0.714] / Accuracy: 0.824
Est: 50 / Depth: 30 ----- Precision: [0.714 0.8   1.   ] / Recall: [1.    0.8   0.714] / Accuracy: 0.824
Est: 50 / Depth: None ----- Precision: [0.714 0.8   1.   ] / Recall: [1.    0.8   0.714] / Accuracy: 0.824
Est: 100 / Depth: 10 ----- Precision: [0.714 0.8   1.   ] / Recall: [1.    0.8   0.714] / Accuracy: 0.824
Est: 100 / Depth: 20 ----- Precision: [0.714 0.8  

На момент тестирования алгоритма для имеющегося датасета после est = 10 и depth = 20 улучшений точности алгоритма при увеличении показателей не наблюдается

In [24]:
n_est = 10
depth = 20

In [None]:
gb = GradientBoostingClassifier(n_estimators=n_est, max_depth=depth)
gb_model = gb.fit(X_train, y_train)
#print(gb_model.predict(X_test))