In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import re
import sqlite3
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
russian_stopwords = stopwords.words('russian')

conn = sqlite3.connect('yt_videos_for_train_labeled.db')
videos_df = pd.read_sql('SELECT * FROM videos', conn)

# Предобработка текста
def clean_text(text):
    # Удаление HTML-тегов, специальных символов и цифр
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

# Конкатенация названия видео и описания
videos_df['text'] = videos_df['video_title'].astype(str) + " " + videos_df['video_description'].astype(str)
videos_df['text'] = videos_df['text'].apply(clean_text)

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(videos_df['text'], videos_df['profession'], test_size=0.2, random_state=42)

# Создание конвейера с TF-IDF векторизацией и логистической регрессией
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=russian_stopwords)),
    ('clf', LogisticRegression(solver='liblinear'))
])

# Обучение модели
pipeline.fit(X_train, y_train)

# Оценка модели
predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yurik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                                       precision    recall  f1-score   support

                                                            AI-тренер       0.55      0.49      0.52        57
                                                   Digital-маркетолог       0.51      0.61      0.55        89
                                                 Frontend-разработчик       0.64      0.71      0.67       114
                                                          Автомеханик       0.60      0.87      0.71       127
                                                Агент в сфере туризма       0.70      0.75      0.72       102
                                                              Агроном       0.95      0.96      0.96       106
                                                              Адвокат       0.60      0.73      0.66       100
                                              Администратор ресторана       0.86      0.17      0.28        36
