In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import load
from natasha import Segmenter, NewsEmbedding, NewsMorphTagger, MorphVocab, Doc
from nltk.corpus import stopwords

# Russian stop-word list from NLTK, kept as a set; clean_text() tests each
# token for membership in it.
stop_words = set(stopwords.words('russian'))

# Pre-fitted artifacts for the three tagging levels, loaded from paths relative
# to the notebook's working directory. For each level, predict_tags() calls
# .transform on the vectorizer, .predict on the model and .inverse_transform
# on the label encoder.
# NOTE(review): joblib artifacts are pickle-based as far as I know, so only
# load files from a trusted source -- confirm the provenance of these files.
vectorizer_1st = load('1st_level/tfidf_vectorizer_1st_level.joblib')
model_1st = load('1st_level/stacking_model_1st_level.joblib')
label_encoder_1st = load('1st_level/label_encoder_1st_level.joblib')

# Second level.
vectorizer_2nd = load('2nd_level/tfidf_vectorizer_2nd_level.joblib')
model_2nd = load('2nd_level/random_forest_model_2nd_level.joblib')
label_encoder_2nd = load('2nd_level/label_encoder_2nd_level.joblib')

# Third level.
vectorizer_3rd = load('3rd_level/tfidf_vectorizer_3rd_level.joblib')
model_3rd = load('3rd_level/stacking_model_3rd_level.joblib')
label_encoder_3rd = load('3rd_level/label_encoder_3rd_level.joblib')


In [2]:
!pip freeze

absl-py==2.1.0
accelerate==0.33.0
ace_tools==0.0
annotated-types==0.6.0
appnope==0.1.4
art==6.2
asgiref==3.8.1
asttokens==2.4.1
astunparse==1.6.3
attrs==24.1.0
blinker==1.7.0
blis==0.7.11
catalogue==2.0.10
catboost==1.2.5
certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.16.0
comm==0.2.2
confection==0.1.4
contourpy==1.2.0
cv==1.0.0
cycler==0.12.1
cymem==2.0.8
DAWG-Python==0.7.2
debugpy==1.8.1
decorator==4.4.2
Django==5.0.4
dm-tree==0.1.8
docopt==0.6.2
eli5==0.13.0
et-xmlfile==1.1.0
executing==2.0.1
ffmpeg==1.4
filelock==3.13.1
Flask==3.0.3
flatbuffers==23.5.26
fonttools==4.49.0
fsspec==2024.2.0
gast==0.5.4
google-pasta==0.2.0
graphviz==0.20.1
grpcio==1.62.0
h5py==3.10.0
huggingface-hub==0.21.4
idna==3.6
image==1.5.33
imageio==2.34.0
imageio-ffmpeg==0.4.9
imbalanced-learn==0.12.3
imblearn==0.0
install==1.3.5
intervaltree==3.1.0
ipykernel==6.29.4
ipymarkup==0.9.0
ipython==8.24.0
itsdangerous==2.1.2
jedi==0.19.1
Jinja2==3.1.3
joblib==1.3.2
jupyter_client==8.6.1
jupy

In [3]:
def clean_text(text):
    """Normalize raw text before lemmatization.

    Steps, in order:
      1. replace every ``<...>`` span (HTML-like tag) with a single space;
      2. delete every character that is neither a word character nor whitespace;
      3. lower-case the result;
      4. split on whitespace and drop tokens present in the module-level
         ``stop_words`` set.

    Args:
        text: the raw input string.

    Returns:
        The surviving tokens joined by single spaces (``''`` if none survive).
    """
    without_tags = re.sub(r'<.*?>', ' ', text)
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    without_punct = re.sub(r'[^\w\s]', '', without_tags)
    tokens = without_punct.lower().split()
    kept = (token for token in tokens if token not in stop_words)
    return ' '.join(kept)

# natasha pipeline objects, constructed once at module level and reused by
# every lemmatize_text() call instead of being rebuilt per text.
segmenter = Segmenter()
embedding = NewsEmbedding()
# The morphological tagger is built on top of the embedding created above.
morph_tagger = NewsMorphTagger(embedding)
morph_vocab = MorphVocab()

def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma.

    The text is wrapped in a natasha ``Doc``, segmented with the module-level
    ``segmenter`` and morphologically tagged with ``morph_tagger``; every
    token is then lemmatized against ``morph_vocab``.

    Args:
        text: the string to lemmatize.

    Returns:
        The token lemmas, in document order, joined by single spaces.
    """
    parsed = Doc(text)
    parsed.segment(segmenter)
    parsed.tag_morph(morph_tagger)

    lemmas = []
    for tok in parsed.tokens:
        tok.lemmatize(morph_vocab)
        lemmas.append(tok.lemma)
    return ' '.join(lemmas)

def preprocess_text(text):
    """Run the full preprocessing pipeline on ``text``.

    The text is first normalized by ``clean_text`` and the cleaned string is
    then passed through ``lemmatize_text``.

    Args:
        text: the raw input string.

    Returns:
        The cleaned, lemmatized string.
    """
    return lemmatize_text(clean_text(text))

In [4]:
def predict_tags(text):
    """Predict the three-level tag hierarchy for one text.

    The text is preprocessed once. Each level vectorizes its input, runs the
    level's model and decodes the prediction with the level's label encoder.
    The levels are chained: level 2 sees the preprocessed text followed by the
    level-1 tag, and level 3 sees the level-2 input followed by the level-2
    tag (each joined with a single space).

    Args:
        text: the raw input string.

    Returns:
        A dict with the keys ``"1st Level Tag"``, ``"2nd Level Tag"`` and
        ``"3rd Level Tag"`` mapped to the decoded tag of each level.
    """
    def decode_tag(vectorizer, model, encoder, document):
        # Vectorize a single document, predict, and map the encoded label
        # back to its original tag value.
        features = vectorizer.transform([document])
        return encoder.inverse_transform(model.predict(features))[0]

    input_1st = preprocess_text(text)
    first_tag = decode_tag(vectorizer_1st, model_1st, label_encoder_1st, input_1st)

    # Each later level gets the previous level's input plus its predicted tag.
    input_2nd = input_1st + " " + first_tag
    second_tag = decode_tag(vectorizer_2nd, model_2nd, label_encoder_2nd, input_2nd)

    input_3rd = input_2nd + " " + second_tag
    third_tag = decode_tag(vectorizer_3rd, model_3rd, label_encoder_3rd, input_3rd)

    return {
        "1st Level Tag": first_tag,
        "2nd Level Tag": second_tag,
        "3rd Level Tag": third_tag,
    }

In [5]:
# Smoke-test the full pipeline on one sample text and print the result
# under a header line.
example_text = "Дно, 6 минутный ролик смотрел минут 15 в качестве 480. Оператор МТС. Ютуб летает в качестве 720, а это ГГ есть желание удалить"
predicted_tags = predict_tags(example_text)
print("Предсказанные теги для введённого текста:", predicted_tags, sep="\n")

Предсказанные теги для введённого текста:
{'1st Level Tag': 'ОТСУТСТВУЕТ', '2nd Level Tag': 'Воспроизведение видео', '3rd Level Tag': 'Тормозит\\Лагает\\Зависает'}
