In [None]:
!pip install pymorphy2

In [None]:
import pandas as pd
import string
import re
import nltk
import pymorphy2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

In [None]:
# Show up to 100 characters of each cell when a DataFrame is displayed.
pd.options.display.max_colwidth = 100

Методы предобработки

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is dropped;
    all other characters (letters, digits, whitespace, non-ASCII symbols)
    are kept in their original order.
    """
    kept = []
    for symbol in text:
        if symbol in string.punctuation:
            continue
        kept.append(symbol)
    return "".join(kept)

def tokenise(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The text to split.

    Returns
    -------
    list of str
        The non-empty tokens in their original order. An empty or
        separator-only input yields an empty list.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on modern Python versions.
    pieces = re.split(r'\W+', text)
    # re.split yields '' when the text starts or ends with a separator (or is
    # empty); such tokens carry no information and would otherwise become a
    # spurious vocabulary entry downstream.
    return [piece for piece in pieces if piece]

def remove_stopwords(tokenised_list, stopwords=None):
    """Drop stop words from a list of tokens.

    Parameters
    ----------
    tokenised_list : iterable of str
        Tokens to filter.
    stopwords : iterable of str, optional
        Words to remove. When omitted, the NLTK Russian stop-word list is
        used (the NLTK ``stopwords`` corpus must be available).

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
        Matching is exact (case-sensitive).
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('russian')
    # A set gives constant-time membership checks instead of scanning the
    # whole stop-word list for every token.
    stopword_set = frozenset(stopwords)
    return [word for word in tokenised_list if word not in stopword_set]

def stemming(tokenised_text):
    """Return the Russian Snowball stem of every token in ``tokenised_text``.

    A new ``nltk.SnowballStemmer('russian')`` is created on each call and
    applied to the tokens in order; the result is a list of the same length.
    """
    stem = nltk.SnowballStemmer('russian').stem
    return list(map(stem, tokenised_text))

# Shared pymorphy2 analyzer, created lazily on first use. Building a
# MorphAnalyzer is comparatively expensive, and this function is called once
# per document, so a single instance is reused across calls.
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the shared ``pymorphy2.MorphAnalyzer``, creating it if needed."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


def lemmatizing(tokenized_text):
    """Replace every token with the normal form of its first pymorphy2 parse.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens to lemmatise.

    Returns
    -------
    list of str
        ``parse(word)[0].normal_form`` for each token, in the original order.
    """
    analyzer = _get_morph_analyzer()
    return [analyzer.parse(word)[0].normal_form for word in tokenized_text]

def clean_text(text):
    """Turn raw text into a list of lemmas.

    The pipeline is: strip punctuation, split into tokens, lemmatise each
    token. Stop-word removal and stemming are not applied here.
    """
    return lemmatizing(tokenise(remove_punctuation(text)))

Тестирование метода random forest

In [None]:
# Read the semicolon-separated, windows-1251 encoded request file.
data = pd.read_csv(
    "requests_gpt.csv",
    sep=';',
    encoding='windows-1251',
)
# Replace whatever column names were read from the file with fixed ones.
data.columns = ['label', 'requests']

In [None]:
# TF-IDF vectorisation; clean_text is used as the analyzer, so every request
# is turned into a list of lemmas before term weights are computed.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['requests'])

# Dense copy of the TF-IDF matrix wrapped in a DataFrame.
X_features = pd.DataFrame(X_tfidf.toarray())

# A fixed random_state makes the 70/30 split identical on every run, so the
# metrics reported below are reproducible and comparable between runs.
# NOTE(review): the vectoriser is fitted on all rows before the split, so the
# vocabulary and IDF weights also see the test rows — consider fitting it on
# the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, data['label'], test_size=0.3, random_state=42)

In [None]:
# Print metrics for different hyperparameter combinations
def train_RF(n_est, depth, random_state=42):
    """Fit a random forest on the training split and print its test metrics.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` variables.

    Parameters
    ----------
    n_est : int
        Number of trees (``n_estimators``).
    depth : int or None
        Maximum tree depth (``max_depth``); ``None`` leaves it unlimited.
    random_state : int or None, default 42
        Seed passed to the classifier. With a fixed seed, differences between
        printed lines come from the hyperparameters rather than from random
        variation between fits. Pass ``None`` for an unseeded forest.

    Returns
    -------
    None
        Precision, recall (as returned by ``score``) and accuracy are printed.
    """
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth,
                                n_jobs=-1, random_state=random_state)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    # Only precision and recall are reported; f-score and support are unused.
    precision, recall, _, _ = score(y_test, y_pred)
    # Share of test rows whose predicted label equals the true label.
    accuracy = (y_pred == y_test).sum() / len(y_pred)
    print('Est: {} / Depth: {} ----- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, np.round(precision, 3), np.round(recall, 3),
        round(accuracy, 3)))
    
# Evaluate every (number of trees, max depth) pair; the tree count varies in
# the outer position and the depth in the inner one.
for n_est, depth in [(trees, limit)
                     for trees in (10, 50, 100)
                     for limit in (10, 20, 30, None)]:
    train_RF(n_est, depth)

In [None]:
# Work with the best configuration. A fixed random_state makes the fitted
# forest, and therefore the printed predictions, reproducible between runs.
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1,
                            random_state=42)
rf_model = rf.fit(X_train, y_train)
print(rf_model.predict(X_test))

