In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import re
import sqlite3
import nltk
from nltk.corpus import stopwords
import pickle
from sklearn.multiclass import OneVsRestClassifier

nltk.download('stopwords')
russian_stopwords = stopwords.words('russian')

conn = sqlite3.connect('yt_videos_for_train_labeled.db')
videos_df = pd.read_sql('SELECT * FROM videos', conn)

# Предобработка текста
def clean_text(text):
    # Удаление HTML-тегов, специальных символов и цифр
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

# Конкатенация названия видео и описания
videos_df['text'] = videos_df['video_title'].astype(str) + " " + videos_df['video_description'].astype(str)
videos_df['text'] = videos_df['text'].apply(clean_text)

X = videos_df['text']
y = videos_df['profession']

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yurik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Создание конвейера с TF-IDF векторизацией и OneVsRestClassifier с логистической регрессией
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=russian_stopwords)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear')))
])

# Предполагаем, что 'y_train' и 'y_test' теперь имеют мультилейбловый формат
pipeline.fit(X_train, y_train)

# Оценка модели
predictions = pipeline.predict(X_test)

# save the model to disk
filename = 'text_model.sav'
pickle.dump(pipeline, open(filename, 'wb'))

print(classification_report(y_test, predictions))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                                       precision    recall  f1-score   support

                                                            AI-тренер       0.55      0.49      0.52        57
                                                   Digital-маркетолог       0.51      0.61      0.55        89
                                                 Frontend-разработчик       0.64      0.71      0.67       114
                                                          Автомеханик       0.60      0.87      0.71       127
                                                Агент в сфере туризма       0.70      0.75      0.72       102
                                                              Агроном       0.95      0.96      0.96       106
                                                              Адвокат       0.60      0.73      0.66       100
                                              Администратор ресторана       0.86      0.17      0.28        36


In [13]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Предобработка данных и векторизация
tfidf_vectorizer = TfidfVectorizer(stop_words=russian_stopwords)
X_tfidf = tfidf_vectorizer.fit_transform(X)
y_mlb = MultiLabelBinarizer().fit_transform(y)

# Разбиение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_mlb, test_size=0.2, random_state=42)

def sparse_to_dense_generator(X, y, batch_size):
    num_samples = X.shape[0]
    for start_idx in range(0, num_samples, batch_size):
        end_idx = min(start_idx + batch_size, num_samples)
        X_batch = X[start_idx:end_idx].toarray()  # Преобразование в плотный формат
        y_batch = y[start_idx:end_idx]
        yield X_batch, y_batch

# Использование генератора для создания объектов tf.data.Dataset
batch_size = 32
train_dataset = tf.data.Dataset.from_generator(
    lambda: sparse_to_dense_generator(X_train, y_train, batch_size),
    output_types=(tf.float32, tf.int32),
    output_shapes=([None, X_train.shape[1]], [None, y_train.shape[1]])
).prefetch(tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_generator(
    lambda: sparse_to_dense_generator(X_test, y_test, batch_size),
    output_types=(tf.float32, tf.int32),
    output_shapes=([None, X_test.shape[1]], [None, y_test.shape[1]])
).prefetch(tf.data.AUTOTUNE)

# Создание модели нейронной сети
model = Sequential()
model.add(Dense(512, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y_train.shape[1], activation='sigmoid'))

# Компиляция модели
model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])


# Обучение модели
model.fit(train_dataset, epochs=10, validation_data=test_dataset)

# Оценка модели на тестовых данных
score = model.evaluate(test_dataset)

# Вывод результатов оценки
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Num GPUs Available:  0
Epoch 1/10
      1/Unknown - 1s 882ms/step - loss: 0.6930 - accuracy: 0.0625

KeyboardInterrupt: 

: 

In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Разбиение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Векторизация данных с помощью TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=russian_stopwords)  # Используйте 'english' для английского текста
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Преобразование разреженных матриц в плотные
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

# Создание и обучение модели LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_dense, y_train)

# Прогнозирование на тестовом наборе
y_pred = lda.predict(X_test_dense)

# Оценка точности
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

MemoryError: Unable to allocate 486. MiB for an array with shape (772, 82443) and data type float64