<a href="https://colab.research.google.com/github/CodeHunterOfficial/ABC_DataMining/blob/main/SentimentTrendAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gensim

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Seed data: one tuple per comment, fields ordered as in _SEED_COLUMNS.
_SEED_COLUMNS = (
    "video_id", "comment_id", "text", "author",
    "published_at", "like_count", "sentiment",
)
_SEED_ROWS = [
    ("vid1", "cid1", "I love this video, it's amazing!", "user1", "2023-10-01T12:00:00Z", 10, "positive"),
    ("vid1", "cid2", "This is the worst video I've ever seen.", "user2", "2023-10-02T13:00:00Z", 2, "negative"),
    ("vid2", "cid3", "Nice content, keep it up!", "user3", "2023-10-03T14:00:00Z", 15, "positive"),
    ("vid2", "cid4", "I don't like this, it's boring.", "user4", "2023-10-04T15:00:00Z", 1, "negative"),
    ("vid3", "cid5", "Great job, very informative.", "user5", "2023-10-05T16:00:00Z", 20, "positive"),
    ("vid4", "cid6", "The video was okay, but could be better.", "user6", "2023-10-06T17:00:00Z", 5, "neutral"),
    ("vid4", "cid7", "Absolutely fantastic! Can't get enough of this.", "user7", "2023-10-07T18:00:00Z", 30, "positive"),
    ("vid5", "cid8", "Not my cup of tea, too slow-paced.", "user8", "2023-10-08T19:00:00Z", 3, "negative"),
    ("vid5", "cid9", "Loved every second of it, please make more!", "user9", "2023-10-09T20:00:00Z", 25, "positive"),
    ("vid6", "cid10", "Terrible quality, not worth my time.", "user10", "2023-10-10T21:00:00Z", 1, "negative"),
    ("vid7", "cid11", "Incredible visuals and sound design!", "user11", "2023-10-11T22:00:00Z", 18, "positive"),
    ("vid8", "cid12", "Meh, nothing special about it.", "user12", "2023-10-12T23:00:00Z", 4, "neutral"),
    ("vid9", "cid13", "Highly recommend this to everyone!", "user13", "2023-10-13T00:00:00Z", 35, "positive"),
    ("vid10", "cid14", "Boring and predictable, didn't enjoy it.", "user14", "2023-10-14T01:00:00Z", 2, "negative"),
    ("vid10", "cid15", "One of the best videos I've seen this year!", "user15", "2023-10-15T02:00:00Z", 40, "positive"),
]

# Transpose the records into the column-oriented mapping (column name -> list).
data = {
    column: list(values)
    for column, values in zip(_SEED_COLUMNS, zip(*_SEED_ROWS))
}

# Build the seed DataFrame
df = pd.DataFrame(data)

# Synthetic data generator
def generate_new_data(original_df, target_size, seed=None):
    """Build a synthetic comments DataFrame by resampling ``original_df``.

    Each generated row draws a video id and an author independently, but the
    text is drawn *together with its sentiment label* so that labels stay
    consistent with the text they describe.

    :param original_df: DataFrame with at least the columns ``video_id``,
        ``author``, ``text`` and ``sentiment``.
    :param target_size: Number of rows to generate; values <= 0 yield an
        empty frame that still carries the expected columns.
    :param seed: Optional seed for reproducible output.  When ``None`` the
        module-level ``random`` generator is used, so a caller's
        ``random.seed(...)`` is still honoured.
    :return: DataFrame with columns ``video_id``, ``comment_id``, ``text``,
        ``author``, ``published_at``, ``like_count``, ``sentiment``.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    rng = random if seed is None else random.Random(seed)
    columns = [
        "video_id", "comment_id", "text", "author",
        "published_at", "like_count", "sentiment",
    ]

    video_ids = original_df['video_id'].unique().tolist()
    authors = original_df['author'].unique().tolist()
    # Keep (text, sentiment) pairs together; sampling them independently
    # would attach random labels to the texts.
    labelled_texts = list(
        original_df[['text', 'sentiment']]
        .drop_duplicates()
        .itertuples(index=False, name=None)
    )

    # The "Z" suffix means UTC, so the timestamp must be taken in UTC.
    now_utc = datetime.now(timezone.utc)

    new_data = []
    while len(new_data) < target_size:
        text, sentiment = rng.choice(labelled_texts)
        published = now_utc - timedelta(days=rng.randint(0, 365))
        new_data.append({
            "video_id": rng.choice(video_ids),
            "comment_id": f"cid{len(new_data) + 1}",
            "text": text,
            "author": rng.choice(authors),
            # Second precision, same layout as the seed data timestamps.
            "published_at": published.strftime("%Y-%m-%dT%H:%M:%SZ"),
            "like_count": rng.randint(0, 50),
            "sentiment": sentiment,
        })

    return pd.DataFrame(new_data, columns=columns)

# Generate a new 200-row DataFrame from the seed frame
new_df = generate_new_data(df, target_size=200)

# Persist it as CSV (without the index column)
new_df.to_csv(path_or_buf='comments_dataset.csv', index=False)

# Sanity check: show the first 5 rows
print(new_df.head(n=5))

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.base import BaseEstimator, TransformerMixin

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that cleans and normalises raw text.

    Steps, in order: HTML tag removal, lower-casing, punctuation removal,
    tokenisation, stop-word removal, lemmatisation and stemming.  Every
    optional step is controlled by a constructor flag.
    """

    # Accepted language codes/aliases -> NLTK resource language name.
    _LANGUAGES = {
        'en': 'english', 'english': 'english',
        'ru': 'russian', 'russian': 'russian',
    }
    _HTML_TAG_RE = re.compile(r'<[^>]*>')
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')

    def __init__(self, language='en', remove_html=True, remove_stopwords=True,
                 lemmatize=True, stem=False, lowercase=True):
        """
        Store the preprocessing options and load the NLTK resources.

        :param language: Text language ('en' or 'ru'; the full names
            'english' / 'russian' are accepted as aliases).
        :param remove_html: Whether to strip HTML tags.
        :param remove_stopwords: Whether to drop stop words.
        :param lemmatize: Whether to lemmatise tokens (English only).
        :param stem: Whether to stem tokens.
        :param lowercase: Whether to lower-case the text.
        :raises ValueError: If ``language`` is not supported.
        """
        if language not in self._LANGUAGES:
            # Previously every value other than 'en' silently selected Russian.
            raise ValueError(
                f"Unsupported language: {language!r}. "
                f"Supported values: {sorted(self._LANGUAGES)}"
            )
        self.language = language
        self.remove_html = remove_html
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        self.stem = stem
        self.lowercase = lowercase

        self._nltk_language = self._LANGUAGES[language]
        self.lemmatizer = WordNetLemmatizer()
        if self._nltk_language == 'english':
            self.stemmer = PorterStemmer()
        else:
            # Porter's rules are English-only; Snowball ships a Russian stemmer.
            from nltk.stem import SnowballStemmer
            self.stemmer = SnowballStemmer('russian')

        self._ensure_nltk_data()
        self.stop_words = set(stopwords.words(self._nltk_language))

    def _ensure_nltk_data(self):
        """Download the NLTK data packages this instance needs if missing.

        Each resource is probed by actually using it; only a ``LookupError``
        (NLTK's "resource not found" signal) triggers a download.  If the
        download fails, the later real use raises ``LookupError`` as before.
        """
        probes = [
            (lambda: stopwords.words(self._nltk_language), ('stopwords',)),
            (lambda: nltk.word_tokenize('probe', language=self._nltk_language),
             ('punkt', 'punkt_tab')),
        ]
        if self.lemmatize and self._nltk_language == 'english':
            probes.append(
                (lambda: self.lemmatizer.lemmatize('probe'), ('wordnet', 'omw-1.4'))
            )
        for probe, packages in probes:
            try:
                probe()
            except LookupError:
                for package in packages:
                    nltk.download(package, quiet=True)

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X):
        """
        Apply the preprocessing to a collection of texts.

        :param X: Iterable of texts to preprocess.
        :return: List of preprocessed texts, one per input item.
        """
        return [self._preprocess_text(text) for text in X]

    def _preprocess_text(self, text):
        """
        Preprocess a single text.

        :param text: Source text; ``None`` and NaN are treated as empty text,
            other non-string values are converted with ``str``.
        :return: Preprocessed text with tokens joined by single spaces.
        """
        if not isinstance(text, str):
            # Missing values read by pandas arrive as None / float NaN.
            is_nan = isinstance(text, float) and text != text
            text = '' if text is None or is_nan else str(text)

        # 1. Strip HTML tags (replace with a space so neighbours don't merge)
        if self.remove_html:
            text = self._HTML_TAG_RE.sub(' ', text)

        # 2. Lower-case
        if self.lowercase:
            text = text.lower()

        # 3. Replace punctuation with a space: "don't" -> "don t" (both parts
        #    are in the NLTK stop-word list), "slow-paced" -> "slow paced".
        text = self._PUNCTUATION_RE.sub(' ', text)

        # 4. Tokenise
        tokens = nltk.word_tokenize(text, language=self._nltk_language)

        # 5. Drop stop words (case-insensitively, so it works with lowercase=False)
        if self.remove_stopwords:
            tokens = [word for word in tokens if word.lower() not in self.stop_words]

        # 6. Lemmatise -- WordNet only covers English, so other languages
        #    are left untouched here.
        if self.lemmatize and self._nltk_language == 'english':
            tokens = [self.lemmatizer.lemmatize(word) for word in tokens]

        # 7. Stem
        if self.stem:
            tokens = [self.stemmer.stem(word) for word in tokens]

        return ' '.join(tokens)

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, Doc2Vec, FastText
from gensim.models.doc2vec import TaggedDocument
from transformers import BertTokenizer, BertModel

class UniversalTextVectorizer:
    """Turn a collection of texts into vectors using a selectable technique.

    Supported ``vectorization_type`` values:

    * ``'one_hot'``, ``'bow'``, ``'tf_idf'`` -- scikit-learn vectorizers,
      limited to ``max_features`` terms;
    * ``'word2vec'``, ``'fasttext_gensim'`` -- gensim models trained on the
      input, documents are the mean of their token vectors;
    * ``'doc2vec'`` -- gensim Doc2Vec trained on the input;
    * ``'bert'``, ``'multilingual_bert'`` -- mean-pooled last hidden state of
      a pretrained BERT model (no fitting required);
    * ``'use'`` -- placeholder, raises ``NotImplementedError``.
    """

    _SPARSE_TYPES = ('one_hot', 'bow', 'tf_idf')
    _AVERAGED_TYPES = ('word2vec', 'fasttext_gensim')
    _PRETRAINED_MODELS = {
        'bert': 'bert-base-uncased',
        'multilingual_bert': 'bert-base-multilingual-cased',
    }
    _USE_MESSAGE = "Universal Sentence Encoder implementation is not provided in this version"

    def __init__(self, vectorization_type='tf_idf', max_features=1000, language='en'):
        """
        :param vectorization_type: One of the technique names listed on the class.
        :param max_features: Vocabulary cap for the scikit-learn vectorizers.
        :param language: Stored on the instance; not read by this class.
        """
        self.vectorization_type = vectorization_type
        self.max_features = max_features
        self.language = language
        self.vectorizer = None
        # model name -> (tokenizer, model); avoids reloading weights per call
        self._pretrained = {}

    def fit_transform(self, X):
        """Fit the selected vectorizer on ``X`` and return the vectors for ``X``.

        :raises NotImplementedError: For ``'use'``.
        :raises ValueError: For an unknown ``vectorization_type``.
        """
        kind = self.vectorization_type
        if kind in self._PRETRAINED_MODELS:
            return self._embed_with_pretrained(X, self._PRETRAINED_MODELS[kind])
        if kind == 'use':
            raise NotImplementedError(self._USE_MESSAGE)

        if kind in self._SPARSE_TYPES:
            if kind == 'one_hot':
                self.vectorizer = CountVectorizer(max_features=self.max_features, binary=True)
            elif kind == 'bow':
                self.vectorizer = CountVectorizer(max_features=self.max_features)
            else:
                self.vectorizer = TfidfVectorizer(max_features=self.max_features)
            return self.vectorizer.fit_transform(X)

        # The embedding models read X twice (training, then inference), so a
        # one-shot iterable must be materialised first.
        X = list(X)
        if kind == 'word2vec':
            self.vectorizer = self._train_word2vec(X)
        elif kind == 'doc2vec':
            self.vectorizer = self._train_doc2vec(X)
        elif kind == 'fasttext_gensim':
            self.vectorizer = self._train_fasttext(X)
        else:
            raise ValueError(f"Unsupported vectorization type: {kind}")
        return self.transform(X)

    def transform(self, X):
        """Vectorize ``X`` with the already fitted (or pretrained) model.

        :raises NotImplementedError: For ``'use'``.
        :raises ValueError: If a trainable vectorizer has not been fitted, or
            the vectorization type is unknown.
        """
        kind = self.vectorization_type
        # Pretrained encoders need no fitting; the "not fitted" check used to
        # reject them because fit_transform never sets self.vectorizer for BERT.
        if kind in self._PRETRAINED_MODELS:
            return self._embed_with_pretrained(X, self._PRETRAINED_MODELS[kind])
        if kind == 'use':
            raise NotImplementedError(self._USE_MESSAGE)

        if getattr(self, 'vectorizer', None) is None:
            raise ValueError("Vectorizer has not been fitted yet. Call fit_transform first.")

        if kind in self._SPARSE_TYPES:
            return self.vectorizer.transform(X)
        if kind in self._AVERAGED_TYPES:
            return np.array([self._mean_token_vector(doc) for doc in X])
        if kind == 'doc2vec':
            return np.array([self.vectorizer.infer_vector(doc.split()) for doc in X])
        raise ValueError(f"Unsupported vectorization type: {kind}")

    def _train_word2vec(self, X):
        """Train a gensim Word2Vec model on whitespace-tokenised ``X``."""
        tokenized_X = [doc.split() for doc in X]
        return Word2Vec(sentences=tokenized_X, vector_size=100, window=5, min_count=1, workers=4)

    def _train_doc2vec(self, X):
        """Train a gensim Doc2Vec model; each document is tagged with its index."""
        tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(X)]
        return Doc2Vec(tagged_data, vector_size=100, window=5, min_count=1, workers=4, epochs=40)

    def _train_fasttext(self, X):
        """Train a gensim FastText (skip-gram) model on whitespace-tokenised ``X``."""
        tokenized_X = [doc.split() for doc in X]
        return FastText(sentences=tokenized_X, vector_size=100, window=5, min_count=1, workers=4, sg=1)

    def _mean_token_vector(self, doc):
        """Return the mean vector of the known tokens in ``doc``.

        Falls back to a zero vector of ``vector_size`` when no token of the
        document is present in the model.
        """
        keyed_vectors = self.vectorizer.wv
        vectors = [keyed_vectors[token] for token in doc.split() if token in keyed_vectors]
        if not vectors:
            return np.zeros(self.vectorizer.vector_size)
        return np.mean(vectors, axis=0)

    def _get_word2vec_vector(self, doc):
        """
        Compute the average Word2Vec vector for a document.
        """
        return self._mean_token_vector(doc)

    def _get_fasttext_vector(self, doc):
        """
        Compute the average FastText vector for a document.
        """
        return self._mean_token_vector(doc)

    def _load_pretrained(self, model_name):
        """Return a cached ``(tokenizer, model)`` pair for ``model_name``."""
        cache = self.__dict__.setdefault('_pretrained', {})
        if model_name not in cache:
            cache[model_name] = (
                BertTokenizer.from_pretrained(model_name),
                BertModel.from_pretrained(model_name),
            )
        return cache[model_name]

    def _embed_with_pretrained(self, X, model_name):
        """Embed each text as the mean of the model's last hidden states.

        :return: Array with one row per text; an empty input yields an array
            of shape ``(0, hidden_size)``.
        """
        import torch  # local import: only needed for the BERT code path

        tokenizer, model = self._load_pretrained(model_name)
        embeddings = []
        # Inference only: skip gradient bookkeeping to save memory.
        with torch.no_grad():
            for text in X:
                inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
                outputs = model(**inputs)
                embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
        if not embeddings:
            return np.empty((0, model.config.hidden_size), dtype=np.float32)
        return np.vstack(embeddings)

    def _transform_with_bert(self, X):
        """Embed ``X`` with the English ``bert-base-uncased`` model."""
        return self._embed_with_pretrained(X, self._PRETRAINED_MODELS['bert'])

    def _transform_with_multilingual_bert(self, X):
        """Embed ``X`` with the ``bert-base-multilingual-cased`` model."""
        return self._embed_with_pretrained(X, self._PRETRAINED_MODELS['multilingual_bert'])

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import os
import base64
from sklearn.datasets import fetch_20newsgroups
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import label_binarize
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve,
    auc,
)
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

class DLTextClassificationPipeline:
    def __init__(self, X=None, y=None, datasetPath=None, label_column=None, text_column=None,
                 vectorization_type='tf_idf', output_dir="dl_classification_models",
                 max_features=1000, max_sequence_length=200, language='en', sample_size=50,
                 model_list=None, return_type='path'):
        self.X = X
        self.y = y
        self.datasetPath = datasetPath
        self.label_column = label_column
        self.text_column = text_column
        self.vectorization_type = vectorization_type
        self.output_dir = output_dir
        self.max_features = max_features
        self.max_sequence_length = max_sequence_length
        self.language = language
        self.sample_size = sample_size
        self.return_type = return_type
        os.makedirs(self.output_dir, exist_ok=True)
        self.model_paths = {}
        self.graph_paths = {}

        # Доступные модели глубокого обучения
        self.all_models = {
            "MLP": self.build_mlp,
            "CNN": self.build_cnn,
            "RNN": self.build_rnn,
            "LSTM": self.build_lstm,
            "GRU": self.build_gru,
            "BiRNN": self.build_birnn,
            "BiLSTM": self.build_bilstm,
            "CNN_LSTM": self.build_cnn_lstm,
            "Transformer": self.build_transformer
        }
        # Параметры моделей
        self.model_params = {
            "MLP": {"embedding_dim": 64, "hidden_units": [128, 64], "dropout_rate": 0.3},
            "CNN": {"embedding_dim": 64, "filters": 64, "kernel_size": 5, "pool_size": 4},
            "RNN": {"embedding_dim": 64, "rnn_units": 64, "dropout_rate": 0.2},
            "LSTM": {"embedding_dim": 64, "lstm_units": 64, "dropout_rate": 0.2},
            "GRU": {"embedding_dim": 64, "gru_units": 64, "dropout_rate": 0.2},
            "BiRNN": {"embedding_dim": 64, "rnn_units": 64, "dropout_rate": 0.2},
            "BiLSTM": {"embedding_dim": 64, "lstm_units": 64, "dropout_rate": 0.2},
            "CNN_LSTM": {
                "embedding_dim": 64,
                "filters": 64,
                "kernel_size": 5,
                "pool_size": 4,
                "lstm_units": 64
            },
            "Transformer": {
                "model_name": "bert-base-uncased",
                "max_length": self.max_sequence_length,
                "trainable": False
            }
        }

        # Фильтрация моделей по списку
        if model_list is None:
            self.models = self.all_models
        else:
            available = set(self.all_models.keys())
            for model in model_list:
                if model not in available:
                    raise ValueError(f"Модель '{model}' недоступна. Доступные модели: {available}")
            self.models = {name: self.all_models[name] for name in model_list}

        self.results = []
        self.confusion_matrices = {}
        self.roc_curves = {}
        self.tokenizer = None
        self.label_encoder = None

    def preprocess_data(self, df):
        df = df.dropna(subset=[self.label_column])
        df = df.drop_duplicates()
        return df

    def load_and_split_data(self):
        """Load the data, sub-sample it, encode the labels and split it.

        Data source, in order of precedence: ``X``/``y`` given to the
        constructor, the 20 Newsgroups corpus (when no ``datasetPath`` is
        set), or the CSV at ``datasetPath`` with ``label_column``.

        Sets ``X``, ``y``, ``label_encoder``, ``label_mapping``, ``n_classes``,
        ``X_train``/``X_test``, ``y_train``/``y_test`` and their one-hot
        versions ``y_train_cat``/``y_test_cat``.

        :raises FileNotFoundError: If ``datasetPath`` does not exist.
        :raises ValueError: If the configuration is incomplete, a column is
            missing, ``sample_size`` is outside (0, 100], or no data remains.
        """
        # Optional mapping from raw label value to a human-readable name.
        class_names = None

        if self.X is not None and self.y is not None:
            print("Данные уже загружены через X и y.")
        elif self.datasetPath is None:
            print("Загрузка данных из fetch_20newsgroups...")
            newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
            self.X = newsgroups.data
            self.y = newsgroups.target
            class_names = dict(enumerate(newsgroups.target_names))
        elif self.label_column is not None:
            if not os.path.exists(self.datasetPath):
                raise FileNotFoundError(f"Файл не найден: {self.datasetPath}")
            print(f"Загрузка данных из {self.datasetPath}...")
            df = pd.read_csv(self.datasetPath)
            print(df.head(5))
            print(df.columns)
            if self.label_column not in df.columns:
                raise ValueError(f"Целевой столбец '{self.label_column}' не найден в датасете.")
            if self.text_column is not None and self.text_column not in df.columns:
                raise ValueError(f"Текстовый столбец '{self.text_column}' не найден в датасете.")

            if self.text_column:
                self.X = df[self.text_column].values
            else:
                # Without an explicit text column exactly one feature column
                # must remain; flattening several would misalign X and y.
                features = df.drop(columns=[self.label_column])
                if features.shape[1] != 1:
                    raise ValueError(
                        "Если text_column не задан, датасет должен содержать "
                        "ровно один столбец помимо целевого."
                    )
                self.X = features.iloc[:, 0].values
            self.y = df[self.label_column].values
        else:
            raise ValueError("Необходимо передать либо X и y, либо datasetPath и label_column.")

        # Keep the leading ``sample_size`` percent of the rows.
        if self.sample_size is not None:
            if not (0 < self.sample_size <= 100):
                raise ValueError("Параметр sample_size должен быть в диапазоне от 0 до 100.")
            sample_fraction = self.sample_size / 100.0
            sample_size = int(len(self.X) * sample_fraction)
            self.X = self.X[:sample_size]
            self.y = self.y[:sample_size]

        if len(self.X) == 0:
            raise ValueError(
                "После выборки не осталось данных: увеличьте sample_size или проверьте датасет."
            )

        # Encode AFTER sampling, for every data source: class ids are then
        # always the contiguous range 0..n_classes-1 that to_categorical
        # needs, even for string labels or when sampling dropped a class.
        self.label_encoder = LabelEncoder()
        self.y = self.label_encoder.fit_transform(self.y)
        self.label_mapping = {
            index: (class_names.get(raw, raw) if class_names else raw)
            for index, raw in enumerate(self.label_encoder.classes_)
        }
        print(f"Маппинг меток: {self.label_mapping}")

        # Number of distinct classes
        unique_classes = np.unique(self.y)
        self.n_classes = len(unique_classes)
        print(f"Обнаружено классов: {self.n_classes} ({unique_classes})")

        # Stratified train/test split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
        )

        # One-hot encode the targets
        self.y_train_cat = tf.keras.utils.to_categorical(self.y_train, num_classes=self.n_classes)
        self.y_test_cat = tf.keras.utils.to_categorical(self.y_test, num_classes=self.n_classes)

    def train_model(self, name, model, X_train=None, y_train=None, epochs=10, batch_size=32):
        """Fit one model, save it, and record its test-set metrics.

        :param name: Model name; ``"Transformer"`` selects the dataset-based path.
        :param model: A compiled Keras model, or for ``"Transformer"`` the
            tuple ``(model, train_dataset, test_dataset)``.
        :param X_train: Training inputs (unused for ``"Transformer"``).
        :param y_train: One-hot training targets (unused for ``"Transformer"``).
        :param epochs: Maximum number of epochs.
        :param batch_size: Batch size for fitting and prediction.
        :return: The Keras ``History`` object returned by ``fit``.

        Side effects: writes ``<name>_best.keras`` and ``<name>_model.keras``
        into ``output_dir`` and updates ``model_paths``, ``results``,
        ``confusion_matrices`` and ``roc_curves``.
        """
        start_time = time.time()

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
            ModelCheckpoint(
                os.path.join(self.output_dir, f"{name}_best.keras"),
                monitor='val_accuracy',
                save_best_only=True,
                mode='max'
            ),
        ]

        is_transformer = name == "Transformer"
        if is_transformer:
            model, train_dataset, test_dataset = model
            # NOTE(review): the test split doubles as validation data here, so
            # early stopping / checkpointing see the test set -- confirm intended.
            history = model.fit(
                train_dataset.shuffle(1000).batch(batch_size),
                epochs=epochs,
                validation_data=test_dataset.batch(batch_size),
                callbacks=callbacks,
                verbose=1
            )
        else:
            history = model.fit(
                X_train, y_train,
                batch_size=batch_size,
                epochs=epochs,
                validation_split=0.2,
                callbacks=callbacks,
                verbose=1
            )

        # Stop the clock right after fitting so "Training Time" excludes
        # model saving and test-set inference.
        elapsed_time = time.time() - start_time

        model_path = os.path.join(self.output_dir, f"{name}_model.keras")
        model.save(model_path)
        self.model_paths[name] = model_path

        # Predict once; the class labels are derived from the probabilities.
        eval_inputs = test_dataset.batch(batch_size) if is_transformer else self.X_test_pad
        y_prob = model.predict(eval_inputs)
        y_pred = np.argmax(y_prob, axis=1)

        # Metrics
        self.results.append({
            "Model": name,
            "Accuracy": accuracy_score(self.y_test, y_pred),
            "Precision": precision_score(self.y_test, y_pred, average='weighted', zero_division=0),
            "Recall": recall_score(self.y_test, y_pred, average='weighted', zero_division=0),
            "F1-Score": f1_score(self.y_test, y_pred, average='weighted', zero_division=0),
            "Training Time (s)": elapsed_time,
            "Epochs": len(history.history['loss'])
        })

        # Confusion matrix
        self.confusion_matrices[name] = confusion_matrix(self.y_test, y_pred)

        # One-vs-rest ROC curve per class
        if self.n_classes > 1:
            fpr, tpr, roc_auc = {}, {}, {}
            for i in range(y_prob.shape[1]):
                # A ROC curve needs both positives and negatives in the test split.
                if len(np.unique(self.y_test_cat[:, i])) < 2:
                    continue
                try:
                    class_fpr, class_tpr, _ = roc_curve(self.y_test_cat[:, i], y_prob[:, i])
                    class_auc = auc(class_fpr, class_tpr)
                except (ValueError, IndexError) as e:
                    print(f"Не удалось вычислить ROC для класса {i}: {str(e)}")
                    continue
                # Store all three together so the dicts always share their keys.
                fpr[i], tpr[i], roc_auc[i] = class_fpr, class_tpr, class_auc
            self.roc_curves[name] = (fpr, tpr, roc_auc)

        return history

    def preprocess_text(self):
        """Replace ``X_train`` and ``X_test`` with their preprocessed versions.

        A ``TextPreprocessor`` for ``self.language`` is used with HTML removal,
        stop-word removal, lemmatisation and lower-casing enabled and stemming
        disabled.
        """
        options = {
            "language": self.language,
            "remove_html": True,
            "remove_stopwords": True,
            "lemmatize": True,
            "stem": False,
            "lowercase": True,
        }
        cleaner = TextPreprocessor(**options)

        self.X_train = cleaner.transform(self.X_train)
        self.X_test = cleaner.transform(self.X_test)

    def vectorize_text(self):
        """Turn the train/test texts into padded integer sequences.

        For ``vectorization_type == "transformer"`` only the Hugging Face
        tokenizer named in ``model_params["Transformer"]`` is loaded and the
        method returns.  Otherwise a Keras ``Tokenizer`` limited to
        ``max_features`` words is fitted on ``X_train``; the resulting
        sequences go to ``X_train_seq``/``X_test_seq`` and their padded forms
        (length ``max_sequence_length``) to ``X_train_pad``/``X_test_pad``.
        The fitted tokenizer is dumped with joblib to ``tokenizer.pkl`` in
        ``output_dir`` and its path recorded under ``model_paths['Tokenizer']``.
        """
        if self.vectorization_type == "transformer":
            pretrained_name = self.model_params["Transformer"]["model_name"]
            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
            return

        keras_preprocessing = tf.keras.preprocessing
        self.tokenizer = keras_preprocessing.text.Tokenizer(num_words=self.max_features)
        self.tokenizer.fit_on_texts(self.X_train)

        to_sequences = self.tokenizer.texts_to_sequences
        self.X_train_seq = to_sequences(self.X_train)
        self.X_test_seq = to_sequences(self.X_test)

        pad = keras_preprocessing.sequence.pad_sequences
        self.X_train_pad = pad(self.X_train_seq, maxlen=self.max_sequence_length)
        self.X_test_pad = pad(self.X_test_seq, maxlen=self.max_sequence_length)

        # Persist the tokenizer
        tokenizer_path = os.path.join(self.output_dir, "tokenizer.pkl")
        joblib.dump(self.tokenizer, tokenizer_path)
        self.model_paths['Tokenizer'] = tokenizer_path

    def build_mlp(self):
        """Build and compile the MLP classifier.

        Architecture: Embedding -> GlobalAveragePooling1D -> Dropout ->
        [Dense(relu) -> Dropout] for each entry of ``hidden_units`` ->
        Dense(softmax) over ``n_classes``.

        :return: Compiled ``tf.keras.Sequential`` model.
        """
        cfg = self.model_params["MLP"]
        layers = tf.keras.layers

        model = tf.keras.Sequential()
        model.add(layers.Embedding(self.max_features, cfg["embedding_dim"], input_length=self.max_sequence_length))
        # Collapse the sequence axis: without this the Dense layers act per
        # time step and the output is (batch, seq_len, n_classes), which does
        # not match the (batch, n_classes) one-hot targets.
        model.add(layers.GlobalAveragePooling1D())
        model.add(layers.Dropout(cfg["dropout_rate"]))
        for units in cfg["hidden_units"]:
            model.add(layers.Dense(units, activation='relu'))
            model.add(layers.Dropout(cfg["dropout_rate"]))
        model.add(layers.Dense(self.n_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def build_cnn(self):
        """Build and compile the 1-D CNN classifier.

        Architecture: Embedding -> Conv1D(relu) -> GlobalMaxPooling1D ->
        Dense(softmax) over ``n_classes``.  Only ``embedding_dim``,
        ``filters`` and ``kernel_size`` are read from ``model_params["CNN"]``.

        :return: Compiled ``tf.keras.Sequential`` model.
        """
        cfg = self.model_params["CNN"]
        layers = tf.keras.layers
        network = tf.keras.Sequential([
            layers.Embedding(self.max_features, cfg["embedding_dim"],
                             input_length=self.max_sequence_length),
            layers.Conv1D(cfg["filters"], cfg["kernel_size"], activation='relu'),
            layers.GlobalMaxPooling1D(),
            layers.Dense(self.n_classes, activation='softmax'),
        ])
        network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return network

    def build_rnn(self):
        """Build and compile the SimpleRNN classifier.

        Architecture: Embedding -> SimpleRNN (with input dropout) ->
        Dense(softmax) over ``n_classes``.

        :return: Compiled ``tf.keras.Sequential`` model.
        """
        cfg = self.model_params["RNN"]
        layers = tf.keras.layers
        network = tf.keras.Sequential([
            layers.Embedding(self.max_features, cfg["embedding_dim"],
                             input_length=self.max_sequence_length),
            layers.SimpleRNN(cfg["rnn_units"], dropout=cfg["dropout_rate"]),
            layers.Dense(self.n_classes, activation='softmax'),
        ])
        network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return network

    def build_lstm(self):
        """Build and compile the LSTM classifier.

        Architecture: Embedding -> LSTM (with input dropout) ->
        Dense(softmax) over ``n_classes``.

        :return: Compiled ``tf.keras.Sequential`` model.
        """
        cfg = self.model_params["LSTM"]
        layers = tf.keras.layers
        network = tf.keras.Sequential([
            layers.Embedding(self.max_features, cfg["embedding_dim"],
                             input_length=self.max_sequence_length),
            layers.LSTM(cfg["lstm_units"], dropout=cfg["dropout_rate"]),
            layers.Dense(self.n_classes, activation='softmax'),
        ])
        network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return network

    def build_gru(self):
        """Build and compile the GRU classifier.

        Architecture: Embedding -> GRU (with input dropout) ->
        Dense(softmax) over ``n_classes``.

        :return: Compiled ``tf.keras.Sequential`` model.
        """
        cfg = self.model_params["GRU"]
        layers = tf.keras.layers
        network = tf.keras.Sequential([
            layers.Embedding(self.max_features, cfg["embedding_dim"],
                             input_length=self.max_sequence_length),
            layers.GRU(cfg["gru_units"], dropout=cfg["dropout_rate"]),
            layers.Dense(self.n_classes, activation='softmax'),
        ])
        network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return network

    def build_birnn(self):
        """Build and compile the bidirectional SimpleRNN classifier.

        Architecture: Embedding -> Bidirectional(SimpleRNN with input
        dropout) -> Dense(softmax) over ``n_classes``.

        :return: Compiled ``tf.keras.Sequential`` model.
        """
        cfg = self.model_params["BiRNN"]
        layers = tf.keras.layers
        recurrent = layers.SimpleRNN(cfg["rnn_units"], dropout=cfg["dropout_rate"])
        network = tf.keras.Sequential([
            layers.Embedding(self.max_features, cfg["embedding_dim"],
                             input_length=self.max_sequence_length),
            layers.Bidirectional(recurrent),
            layers.Dense(self.n_classes, activation='softmax'),
        ])
        network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return network

    def build_bilstm(self):
        """Build and compile the bidirectional LSTM classifier.

        Architecture: Embedding -> Bidirectional(LSTM with input dropout) ->
        Dense(softmax) over ``n_classes``.

        :return: Compiled ``tf.keras.Sequential`` model.
        """
        cfg = self.model_params["BiLSTM"]
        layers = tf.keras.layers
        recurrent = layers.LSTM(cfg["lstm_units"], dropout=cfg["dropout_rate"])
        network = tf.keras.Sequential([
            layers.Embedding(self.max_features, cfg["embedding_dim"],
                             input_length=self.max_sequence_length),
            layers.Bidirectional(recurrent),
            layers.Dense(self.n_classes, activation='softmax'),
        ])
        network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return network

    def build_cnn_lstm(self):
        """Build and compile the combined CNN + LSTM classifier.

        Architecture: Embedding -> Conv1D(relu) -> LSTM -> Dense(softmax)
        over ``n_classes``.  Only ``embedding_dim``, ``filters``,
        ``kernel_size`` and ``lstm_units`` are read from
        ``model_params["CNN_LSTM"]``; no pooling layer is inserted.

        :return: Compiled ``tf.keras.Sequential`` model.
        """
        cfg = self.model_params["CNN_LSTM"]
        layers = tf.keras.layers
        network = tf.keras.Sequential([
            layers.Embedding(self.max_features, cfg["embedding_dim"],
                             input_length=self.max_sequence_length),
            layers.Conv1D(cfg["filters"], cfg["kernel_size"], activation='relu'),
            layers.LSTM(cfg["lstm_units"]),
            layers.Dense(self.n_classes, activation='softmax'),
        ])
        network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return network

    def build_transformer(self):
        """Build a classifier on top of a pretrained transformer encoder.

        The train/test texts are tokenised to exactly ``max_sequence_length``
        tokens and wrapped into ``tf.data.Dataset`` objects paired with the
        one-hot targets.  The model feeds ``input_ids`` and ``attention_mask``
        through the encoder named in ``model_params["Transformer"]`` and
        classifies from the first token's hidden state with a softmax layer.

        :return: Tuple ``(model, train_dataset, test_dataset)``; the datasets
            are unbatched.
        """
        cfg = self.model_params["Transformer"]

        # Load the pretrained encoder and its tokenizer
        transformer_model = TFAutoModel.from_pretrained(cfg["model_name"])
        tokenizer = AutoTokenizer.from_pretrained(cfg["model_name"])

        # Only these keys have matching Input layers below; anything else the
        # tokenizer returns (e.g. token_type_ids) is not fed to the model.
        model_inputs = ("input_ids", "attention_mask")

        def encode(texts):
            # list(...) accepts both NumPy arrays and plain lists (the splits
            # are lists after preprocess_text).  padding='max_length' makes
            # every sequence match the fixed Input shape declared below.
            encodings = tokenizer(
                [str(text) for text in texts],
                truncation=True,
                padding='max_length',
                max_length=self.max_sequence_length,
            )
            return {key: encodings[key] for key in model_inputs}

        # Build the tf.data datasets
        train_dataset = tf.data.Dataset.from_tensor_slices((encode(self.X_train), self.y_train_cat))
        test_dataset = tf.data.Dataset.from_tensor_slices((encode(self.X_test), self.y_test_cat))

        # Input layers
        input_ids = tf.keras.layers.Input(shape=(self.max_sequence_length,), dtype=tf.int32, name="input_ids")
        attention_mask = tf.keras.layers.Input(shape=(self.max_sequence_length,), dtype=tf.int32, name="attention_mask")

        # Encoder outputs (frozen unless configured as trainable)
        transformer_model.trainable = cfg["trainable"]
        sequence_output = transformer_model(input_ids=input_ids, attention_mask=attention_mask)[0]

        # Classification head on the first token's hidden state
        cls_token = sequence_output[:, 0, :]
        output = tf.keras.layers.Dense(self.n_classes, activation='softmax')(cls_token)

        model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        return model, train_dataset, test_dataset

    def train_models(self, epochs=10, batch_size=32):
        """Этот метод остается БЕЗ ИЗМЕНЕНИЙ"""
        for name, builder in self.models.items():
            print(f"\nTraining {name} model...")
            if name == "Transformer":
                model, train_dataset, test_dataset = builder()
                history = self.train_model(name, (model, train_dataset, test_dataset),
                          epochs=epochs, batch_size=batch_size)
            else:
                model = builder()
                history = self.train_model(name, model, self.X_train_pad, self.y_train_cat,
                          epochs=epochs, batch_size=batch_size)

            self.plot_training_history(name, history)

    def plot_training_history(self, name, history):
        """Plot train/validation accuracy and loss per epoch side by side.

        With ``return_type == 'path'`` the figure is saved as
        ``<name>_training_history.png`` in ``output_dir``, registered in
        ``graph_paths`` and closed; otherwise it is shown.

        :param name: Model name used in titles and the file name.
        :param history: Keras ``History`` with ``accuracy``, ``val_accuracy``,
            ``loss`` and ``val_loss`` entries.
        """
        plt.figure(figsize=(12, 5))

        # (train key, validation key, axis label) for the left and right panel
        panels = (
            ('accuracy', 'val_accuracy', 'Accuracy'),
            ('loss', 'val_loss', 'Loss'),
        )
        for position, (train_key, val_key, label) in enumerate(panels, start=1):
            plt.subplot(1, 2, position)
            plt.plot(history.history[train_key], label=f'Train {label}')
            plt.plot(history.history[val_key], label=f'Validation {label}')
            plt.title(f'{name} Model {label}')
            plt.ylabel(label)
            plt.xlabel('Epoch')
            plt.legend()

        plt.tight_layout()

        if self.return_type != 'path':
            plt.show()
            return

        graph_path = os.path.join(self.output_dir, f"{name}_training_history.png")
        plt.savefig(graph_path)
        self.graph_paths[f'{name}_history'] = graph_path
        plt.close()

    def save_results(self):
        self.results_df = pd.DataFrame(self.results)
        results_path = os.path.join(self.output_dir, "dl_model_results.csv")
        self.results_df.to_csv(results_path, index=False)
        self.graph_paths['results_csv'] = results_path

    def plot_confusion_matrices(self):
        """Draw one confusion-matrix heatmap per trained model, stacked vertically.

        :return: With ``return_type == 'path'`` the path of the saved
            ``dl_confusion_matrices.png`` (also stored in ``graph_paths``);
            otherwise the matplotlib figure.
        """
        model_count = len(self.confusion_matrices)
        # squeeze=False always yields a 2-D axes array, so a single model
        # needs no special case.
        fig, axes = plt.subplots(model_count, 1, figsize=(10, 5 * model_count), squeeze=False)
        for (name, conf_matrix), ax in zip(self.confusion_matrices.items(), axes[:, 0]):
            sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
            ax.set_title(f"Confusion Matrix for {name}")
            ax.set_xlabel("Predicted Labels")
            ax.set_ylabel("True Labels")
        plt.tight_layout()

        if self.return_type != 'path':
            return fig

        graph_path = os.path.join(self.output_dir, "dl_confusion_matrices.png")
        plt.savefig(graph_path)
        self.graph_paths['confusion_matrices'] = graph_path
        plt.close(fig)
        return graph_path

    def plot_roc_curves(self):
        """Overlay the per-class ROC curves of every model on one figure.

        ``self.roc_curves`` maps a model name to an ``(fpr, tpr, roc_auc)``
        triple, each indexable by class index ``0 .. n_classes - 1``.

        Returns:
            The path of the saved PNG when ``self.return_type == 'path'``
            (also stored in ``graph_paths['roc_curves']``); otherwise the
            current matplotlib figure, left open.
        """
        plt.figure(figsize=(10, 8))

        for model_name, curve_data in self.roc_curves.items():
            false_pos, true_pos, auc_by_class = curve_data
            for cls in range(self.n_classes):
                xs, ys = false_pos[cls], true_pos[cls]
                plt.plot(
                    xs, ys,
                    label=f"{model_name} (Class {cls}, AUC={auc_by_class[cls]:.2f})",
                )

        # Diagonal reference line from (0, 0) to (1, 1).
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curves for All Models")
        plt.legend(loc="lower right")

        if self.return_type != 'path':
            return plt.gcf()

        target = os.path.join(self.output_dir, "dl_roc_curves.png")
        plt.savefig(target)
        self.graph_paths['roc_curves'] = target
        plt.close()
        return target

    def plot_metric_comparison(self, metric_name):
        """Draw a bar chart comparing one metric across all models.

        Args:
            metric_name: Column of ``self.results_df`` to plot; it is also
                embedded verbatim in the output file name and the
                ``graph_paths`` key.

        Returns:
            The path of the saved PNG when ``self.return_type == 'path'``
            (also stored in ``graph_paths['<metric_name>_comparison']``);
            otherwise the matplotlib figure, left open.
        """
        metric_df = self.results_df[["Model", metric_name]].set_index("Model")

        # Draw on an explicit Axes: DataFrame.plot() without ``ax`` opens a
        # figure of its own, which used to leave a blank, never-closed figure
        # behind on every call.
        fig, ax = plt.subplots(figsize=(12, 6))
        metric_df.plot(kind='bar', ax=ax, rot=45)
        ax.set_title(f"Comparison of Model {metric_name}")
        ax.set_ylabel(metric_name)
        ax.set_xlabel("Model")
        fig.tight_layout()

        if self.return_type == 'path':
            graph_path = os.path.join(self.output_dir, f"dl_{metric_name}_comparison.png")
            fig.savefig(graph_path)
            self.graph_paths[f'{metric_name}_comparison'] = graph_path
            plt.close(fig)
            return graph_path
        return fig

    def generate_html_report(self):
        """Build the self-contained HTML classification report and save it.

        The report contains, in order: dataset information, the metrics table
        of all models, one section per model in ``self.models`` (metrics,
        training history, confusion matrix, ROC curves) and a highlighted
        section for the model with the highest ``F1-Score``. All images are
        embedded as base64 PNG data URIs.

        Side effects: per-model confusion-matrix and ROC PNGs are rendered
        into ``self.output_dir``, and the report itself is written to
        ``<output_dir>/classification_report.html``.

        The best-model section is omitted when ``results_df`` has no usable
        ``F1-Score`` value, and any image whose file is missing is skipped
        instead of aborting the report.

        Returns:
            The complete HTML document as a string.
        """

        def encode_png(path):
            """Return the file at ``path`` as base64 text, or None if absent."""
            if not os.path.exists(path):
                return None
            with open(path, "rb") as f:
                return base64.b64encode(f.read()).decode("utf-8")

        def plot_container(title, encoded, pad):
            """Wrap an embedded PNG in a titled ``plot-container`` div."""
            inner = pad + " " * 4
            return (
                f"\n{pad}<div class='plot-container'>\n"
                f"{inner}<h4>{title}</h4>\n"
                f'{inner}<img src="data:image/png;base64,{encoded}" alt="{title}">\n'
                f"{pad}</div>\n{pad}"
            )

        header = """
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Deep Learning Text Classification Report</title>
            <style>
                body { font-family: Arial, sans-serif; margin: 20px; }
                h1, h2, h3, h4 { color: #333; }
                table { border-collapse: collapse; width: 100%; margin-bottom: 20px; }
                th, td { border: 1px solid #ddd; padding: 8px; text-align: center; }
                th { background-color: #f4f4f4; }
                img { max-width: 100%; height: auto; margin-bottom: 10px; border: 1px solid #eee; }
                .model-section {
                    margin-bottom: 40px;
                    padding: 15px;
                    background: #f9f9f9;
                    border-radius: 5px;
                }
                .best-model {
                    background-color: #e9f7ef;
                    padding: 20px;
                    border-left: 5px solid #28a745;
                    margin-bottom: 30px;
                    border-radius: 5px;
                }
                .plot-grid {
                    display: grid;
                    grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
                    gap: 20px;
                    margin-top: 20px;
                }
                .plot-container {
                    padding: 10px;
                    background: white;
                    border-radius: 5px;
                    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
                }
                @media (max-width: 600px) {
                    .plot-grid { grid-template-columns: 1fr; }
                }
            </style>
        </head>
        <body>
            <h1>Deep Learning Text Classification Report</h1>
        """

        # Fragments are collected in a list and joined once at the end; the
        # base64 payloads make repeated string concatenation wasteful.
        parts = [header]

        # 1. General information about the dataset
        parts.append("<div class='model-section'>")
        parts.append("<h2>Dataset Information</h2>")
        parts.append(f"<p><strong>Total samples:</strong> {len(self.X)}</p>")
        parts.append(f"<p><strong>Number of classes:</strong> {self.n_classes}</p>")
        parts.append(f"<p><strong>Class distribution:</strong> {dict(zip(self.label_encoder.classes_, np.bincount(self.y)))}</p>")
        parts.append("</div>")

        # 2. Metrics table for all models
        parts.append("<div class='model-section'>")
        parts.append("<h2>Model Performance Summary</h2>")
        parts.append(self.results_df.to_html(index=False, classes="metrics-table"))
        parts.append("</div>")

        # 3. Details for every model
        section_pad = " " * 16
        for model_name in self.models:
            parts.append("<div class='model-section'>")
            parts.append(f"<h3>{model_name} Model</h3>")

            # Metrics of this model only
            model_metrics = self.results_df[self.results_df['Model'] == model_name]
            parts.append(model_metrics.to_html(index=False, classes="metrics-table"))

            parts.append("<div class='plot-grid'>")

            # 3.1 Training history (only if the plot was saved earlier)
            history_path = os.path.join(self.output_dir, f"{model_name}_training_history.png")
            history_base64 = encode_png(history_path)
            if history_base64 is not None:
                parts.append(plot_container("Training History", history_base64, section_pad))

            # 3.2 Confusion matrix, rendered here for this single model
            if model_name in self.confusion_matrices:
                conf_matrix_path = os.path.join(self.output_dir, f"{model_name}_confusion_matrix.png")
                plt.figure(figsize=(8, 6))
                sns.heatmap(self.confusion_matrices[model_name], annot=True, fmt="d", cmap="Blues")
                plt.title(f"Confusion Matrix - {model_name}")
                plt.savefig(conf_matrix_path)
                plt.close()
                parts.append(plot_container("Confusion Matrix", encode_png(conf_matrix_path), section_pad))

            # 3.3 ROC curves (if available)
            if hasattr(self, 'roc_curves') and model_name in self.roc_curves:
                roc_path = os.path.join(self.output_dir, f"{model_name}_roc_curve.png")
                plt.figure(figsize=(8, 6))
                fpr, tpr, roc_auc = self.roc_curves[model_name]
                for i in fpr:
                    plt.plot(fpr[i], tpr[i], label=f'Class {i} (AUC = {roc_auc[i]:.2f})')
                plt.plot([0, 1], [0, 1], 'k--')
                plt.title(f'ROC Curves - {model_name}')
                plt.legend()
                plt.savefig(roc_path)
                plt.close()
                parts.append(plot_container("ROC Curves", encode_png(roc_path), section_pad))

            parts.append("</div></div>")  # closes plot-grid and model-section

        # 4. Best model (highest F1-Score). Skipped when there is no usable
        # score: idxmax() fails on an empty or all-NaN column.
        f1_scores = self.results_df['F1-Score'] if 'F1-Score' in self.results_df.columns else None
        if f1_scores is not None and f1_scores.notna().any():
            best_model = self.results_df.loc[f1_scores.idxmax()]
            best_model_name = best_model['Model']
            parts.append("<div class='best-model'>")
            parts.append(f"<h2>🌟 Best Performing Model: {best_model_name} 🌟</h2>")

            # Metrics table of the best model
            parts.append(best_model.to_frame().to_html(header=False, classes="metrics-table"))

            parts.append("<div class='plot-grid'>")

            # Reuse the PNGs written above (or by plot_training_history).
            best_plots = [("Training History", f"{best_model_name}_training_history.png")]
            if best_model_name in self.confusion_matrices:
                best_plots.append(("Confusion Matrix", f"{best_model_name}_confusion_matrix.png"))
            if hasattr(self, 'roc_curves') and best_model_name in self.roc_curves:
                best_plots.append(("ROC Curves", f"{best_model_name}_roc_curve.png"))

            best_pad = " " * 12
            for title, filename in best_plots:
                encoded = encode_png(os.path.join(self.output_dir, filename))
                if encoded is not None:
                    parts.append(plot_container(title, encoded, best_pad))

            parts.append("</div></div>")  # closes plot-grid and best-model

        # Close the HTML document
        parts.append("""
        </body>
        </html>
        """)
        html_content = "".join(parts)

        # Save the report
        report_path = os.path.join(self.output_dir, "classification_report.html")
        with open(report_path, "w", encoding="utf-8") as f:
            f.write(html_content)

        print(f"Report generated: {report_path}")
        return html_content

    def run_pipeline(self, epochs=10, batch_size=32):
        """Run the complete workflow and collect its artefacts.

        Stages, in order: load/split the data, preprocess and vectorise the
        text, train the models, save the metrics, draw the confusion-matrix,
        ROC and metric-comparison plots, and generate the HTML report.

        Args:
            epochs: Forwarded to ``train_models``.
            batch_size: Forwarded to ``train_models``.

        Returns:
            dict with the keys ``results_df``, ``model_paths``,
            ``tokenizer_path`` (``model_paths.get('Tokenizer')``),
            ``graph_paths`` and ``html_content``.
        """
        # Data preparation: these stages take no arguments and run in order.
        for prepare in (self.load_and_split_data, self.preprocess_text, self.vectorize_text):
            prepare()

        # Training and persistence of the metrics table.
        self.train_models(epochs=epochs, batch_size=batch_size)
        self.save_results()

        # Diagnostic plots.
        self.plot_confusion_matrices()
        self.plot_roc_curves()

        # One comparison chart per reported metric.
        compared_metrics = ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'Training Time (s)')
        for metric in compared_metrics:
            self.plot_metric_comparison(metric)

        report_html = self.generate_html_report()

        artefacts = {
            'results_df': self.results_df,
            'model_paths': self.model_paths,
        }
        artefacts['tokenizer_path'] = self.model_paths.get('Tokenizer')
        artefacts['graph_paths'] = self.graph_paths
        artefacts['html_content'] = report_html
        return artefacts

In [None]:
import nltk

# Fetch the NLTK resources this notebook asks for. Each resource is requested
# once; the original cell downloaded 'stopwords' twice.
for resource in ('stopwords', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

In [None]:
# Initialise the pipeline with the dataset
pipeline = DLTextClassificationPipeline(
    datasetPath="/content/comments_dataset.csv",  # passed on to the parent class
    text_column="text",           # passed on to the parent class
    label_column="sentiment",     # passed on to the parent class
    model_list=["LSTM"],  # which models to use
    output_dir="results"   # where to save the results
)

# Run the pipeline
results = pipeline.run_pipeline(epochs=10, batch_size=32)

In [None]:
class SocialMediaAnalysisPipeline(DLTextClassificationPipeline):
    """Text-classification pipeline extended with temporal sentiment analysis.

    On top of the parent pipeline this class keeps the full input DataFrame
    (with a parsed ``published_at`` column), counts sentiment labels per day,
    per hour and per weekday, plots those counts and assembles everything
    into one HTML report.

    Note that ``run_pipeline`` here returns the report HTML as a string,
    whereas the parent's ``run_pipeline`` returns a dict.
    """

    # Calendar order for the weekday chart; groupby() would otherwise sort the
    # day names alphabetically.
    _WEEK_ORDER = ['Monday', 'Tuesday', 'Wednesday',
                   'Thursday', 'Friday', 'Saturday', 'Sunday']

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent and initialise temporal state."""
        super().__init__(*args, **kwargs)
        self.temporal_analysis_results = {}
        self.full_dataset = None

    def load_and_split_data(self):
        """Load the CSV, validate its columns and build the train/test split.

        When ``self.datasetPath`` is falsy the parent implementation is used
        instead (``full_dataset`` then stays ``None`` and the temporal
        analysis is skipped later on).

        Sets ``full_dataset``, ``X``, ``y`` (integer-encoded labels),
        ``label_encoder``, ``label_mapping``, the train/test arrays,
        ``n_classes`` and the one-hot encoded ``y_train_cat`` / ``y_test_cat``.

        Raises:
            ValueError: If the text, label or ``published_at`` column is
                missing from the CSV.
            Exception: Any other loading error is printed and re-raised.
        """
        try:
            if not self.datasetPath:
                super().load_and_split_data()
                return

            # Load the data
            self.full_dataset = pd.read_csv(self.datasetPath)

            # Check that all required columns are present
            required_columns = {self.text_column, self.label_column, 'published_at'}
            missing = required_columns - set(self.full_dataset.columns)
            if missing:
                raise ValueError(f"Missing required columns: {missing}")

            # Parse the timestamps; unparseable values become NaT ...
            self.full_dataset['published_at'] = pd.to_datetime(
                self.full_dataset['published_at'],
                errors='coerce'
            )
            # ... and the affected rows are dropped
            self.full_dataset = self.full_dataset.dropna(subset=['published_at'])

            # Extract texts and labels
            self.X = self.full_dataset[self.text_column].values
            self.y = self.full_dataset[self.label_column].values

            # Encode the labels as integers
            self.label_encoder = LabelEncoder()
            self.y = self.label_encoder.fit_transform(self.y)
            self.label_mapping = dict(zip(self.label_encoder.classes_,
                                          range(len(self.label_encoder.classes_))))

            # Stratified 80/20 split with a fixed seed
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
            )

            # One-hot encoding
            self.n_classes = len(np.unique(self.y))
            self.y_train_cat = tf.keras.utils.to_categorical(self.y_train, num_classes=self.n_classes)
            self.y_test_cat = tf.keras.utils.to_categorical(self.y_test, num_classes=self.n_classes)

        except Exception as e:
            print(f"Error loading data: {str(e)}")
            raise

    def analyze_temporal_trends(self):
        """Count sentiment labels per date, per hour and per weekday.

        Fills ``temporal_analysis_results`` with three count tables keyed
        ``'daily'``, ``'hourly'`` and ``'weekly'`` (rows: period, columns:
        encoded sentiment label). Best effort: when no timestamp data is
        available the step is skipped, and any error is printed rather than
        raised.
        """
        try:
            if self.full_dataset is None or 'published_at' not in self.full_dataset.columns:
                print("Skipping temporal analysis - no datetime data available")
                return

            df = self.full_dataset.copy()
            # self.y holds the integer-encoded labels in the same row order.
            df['sentiment'] = self.y

            # Extract the date components
            df['date'] = df['published_at'].dt.date
            df['hour'] = df['published_at'].dt.hour
            df['day_of_week'] = df['published_at'].dt.day_name()

            # Counts per period and sentiment
            self.temporal_analysis_results = {
                'daily': df.groupby(['date', 'sentiment']).size().unstack(fill_value=0),
                'hourly': df.groupby(['hour', 'sentiment']).size().unstack(fill_value=0),
                'weekly': df.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0)
            }

        except Exception as e:
            print(f"Error in temporal analysis: {str(e)}")

    def plot_temporal_trends(self):
        """Save one chart per temporal table and register it in ``graph_paths``.

        ``'daily'`` is drawn as a line chart, ``'hourly'`` and ``'weekly'`` as
        stacked bars. Files are written to
        ``<output_dir>/<trend_type>_trends.png``. Best effort: errors are
        printed rather than raised.
        """
        try:
            if not self.temporal_analysis_results:
                print("No temporal data to visualize")
                return

            os.makedirs(self.output_dir, exist_ok=True)

            for trend_type, data in self.temporal_analysis_results.items():
                # Draw on an explicit Axes: DataFrame.plot() without ``ax``
                # opens its own figure, which left a blank figure unclosed
                # and ignored the requested figsize.
                fig, ax = plt.subplots(figsize=(12, 6))
                try:
                    if trend_type == 'daily':
                        data.plot(kind='line', ax=ax,
                                  title=f'{trend_type.capitalize()} Sentiment Trends')
                    else:
                        if trend_type == 'weekly':
                            # reindex (not .loc) so weekdays without any
                            # comments appear as zero bars instead of
                            # raising KeyError.
                            data = data.reindex(self._WEEK_ORDER, fill_value=0)
                        data.plot(kind='bar', stacked=True, ax=ax,
                                  title=f'{trend_type.capitalize()} Sentiment Distribution')

                    ax.set_ylabel('Number of Comments')
                    fig.tight_layout()
                    path = os.path.join(self.output_dir, f"{trend_type}_trends.png")
                    fig.savefig(path)
                finally:
                    plt.close(fig)
                self.graph_paths[f"{trend_type}_trends"] = path

        except Exception as e:
            print(f"Error plotting temporal trends: {str(e)}")

    def save_model_results(self):
        """Build ``results_df`` from ``self.results`` and save it as CSV.

        The file is written to ``<output_dir>/model_results.csv`` and its path
        is stored in ``graph_paths['results_csv']``. A missing ``results``
        attribute is treated as an empty result list. Errors are printed and
        re-raised.
        """
        try:
            if not hasattr(self, 'results'):
                self.results = []

            # Create the DataFrame with the results
            self.results_df = pd.DataFrame(self.results)

            # Save it to a file
            results_path = os.path.join(self.output_dir, "model_results.csv")
            self.results_df.to_csv(results_path, index=False)
            self.graph_paths['results_csv'] = results_path

        except Exception as e:
            print(f"Error saving model results: {str(e)}")
            raise

    @staticmethod
    def _encode_png(path):
        """Return the file at ``path`` as base64 text, or None if it is absent."""
        if not path or not os.path.exists(path):
            return None
        with open(path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    @staticmethod
    def _plot_container(title, encoded_png, pad):
        """Wrap an embedded PNG in a titled ``plot-container`` div."""
        inner = pad + " " * 4
        return (
            f'\n{pad}<div class="plot-container">\n'
            f'{inner}<h3>{title}</h3>\n'
            f'{inner}<img src="data:image/png;base64,{encoded_png}" style="width:100%">\n'
            f'{pad}</div>\n{pad}'
        )

    def _confusion_matrix_png(self, model_name):
        """Return the path of a confusion-matrix PNG for ``model_name``, or None.

        The image is rendered from ``self.confusion_matrices`` when the model
        has an entry there, using the same file name as the parent's HTML
        report (``<model>_confusion_matrix.png``). Otherwise a pre-existing
        ``<model>_confusion.png`` is used if present.
        """
        matrices = getattr(self, 'confusion_matrices', {})
        if model_name in matrices:
            path = os.path.join(self.output_dir, f"{model_name}_confusion_matrix.png")
            fig, ax = plt.subplots(figsize=(8, 6))
            try:
                sns.heatmap(matrices[model_name], annot=True, fmt="d", cmap="Blues", ax=ax)
                ax.set_title(f"Confusion Matrix - {model_name}")
                fig.savefig(path)
            finally:
                plt.close(fig)
            return path

        legacy_path = os.path.join(self.output_dir, f"{model_name}_confusion.png")
        return legacy_path if os.path.exists(legacy_path) else None

    def generate_full_report(self):
        """Generate the complete HTML report, including the best-model section.

        Saves the model results first, then assembles: dataset information,
        the classification results table, the training-history plots, the
        temporal trend plots (if any were produced) and a section for the
        model with the highest ``F1-Score``. The report is written to
        ``<output_dir>/full_report.html``.

        Returns:
            The complete HTML document as a string.

        Raises:
            Exception: Any error is printed and re-raised.
        """
        try:
            # 1. Save the model results
            self.save_model_results()

            # 2. Basic structure of the report
            html_content = """
            <!DOCTYPE html>
            <html>
            <head>
                <title>Complete Social Media Analysis Report</title>
                <style>
                    body { font-family: Arial, sans-serif; margin: 20px; }
                    h1, h2, h3 { color: #333; }
                    .section { margin-bottom: 30px; padding: 15px;
                              background: #f9f9f9; border-radius: 5px; }
                    .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
                            gap: 20px; margin-bottom: 20px; }
                    .plot-container { background: white; padding: 10px;
                                    border-radius: 5px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
                    table { width: 100%; border-collapse: collapse; margin-bottom: 20px; }
                    th, td { border: 1px solid #ddd; padding: 8px; text-align: center; }
                    th { background-color: #f2f2f2; }
                    .best-model { background: #e8f5e9; padding: 20px;
                                border-left: 5px solid #2e7d32; margin: 30px 0; }
                    .model-metrics { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
                                  gap: 20px; }
                </style>
            </head>
            <body>
                <h1>Complete Social Media Analysis Report</h1>
            """

            # 3. Dataset information
            html_content += f"""
            <div class="section">
                <h2>Dataset Information</h2>
                <p><strong>Total samples:</strong> {len(self.X)}</p>
                <p><strong>Number of classes:</strong> {self.n_classes}</p>
            </div>
            """

            # 4. Classification results
            if hasattr(self, 'results_df'):
                results_table = self.results_df.to_html(index=False, classes="metrics-table")
                html_content += f"""
                <div class="section">
                    <h2>Classification Results</h2>
                    {results_table}
                </div>
                """

            # 5. Training-history plots
            html_content += """
            <div class="section">
                <h2>Model Training History</h2>
                <div class="grid">
            """
            for model_name in self.models:
                history_path = os.path.join(self.output_dir, f"{model_name}_training_history.png")
                img = self._encode_png(history_path)
                if img is not None:
                    html_content += self._plot_container(model_name, img, " " * 20)
            html_content += "</div></div>"

            # 6. Temporal trends (if any were plotted)
            trend_types = ['daily', 'hourly', 'weekly']
            if any(f"{t}_trends" in self.graph_paths for t in trend_types):
                html_content += """
                <div class="section">
                    <h2>Temporal Sentiment Analysis</h2>
                    <div class="grid">
                """
                for trend_type in trend_types:
                    img = self._encode_png(self.graph_paths.get(f"{trend_type}_trends"))
                    if img is not None:
                        html_content += self._plot_container(
                            f"{trend_type.capitalize()} Trends", img, " " * 24
                        )
                html_content += "</div></div>"

            # 7. Best-model section
            if hasattr(self, 'results_df') and not self.results_df.empty:
                best_model = self.results_df.loc[self.results_df['F1-Score'].idxmax()]
                best_model_name = best_model['Model']

                html_content += f"""
                <div class="best-model">
                    <h2>Best Performing Model: {best_model_name}</h2>
                    <div class="model-metrics">
                        <div>
                            <h3>Metrics</h3>
                            <table>
                                <tr><th>F1-Score</th><td>{best_model['F1-Score']:.4f}</td></tr>
                                <tr><th>Accuracy</th><td>{best_model['Accuracy']:.4f}</td></tr>
                                <tr><th>Precision</th><td>{best_model['Precision']:.4f}</td></tr>
                                <tr><th>Recall</th><td>{best_model['Recall']:.4f}</td></tr>
                                <tr><th>Training Time</th><td>{best_model['Training Time (s)']:.2f} sec</td></tr>
                            </table>
                        </div>
                """

                # Training history of the best model
                history_path = os.path.join(self.output_dir, f"{best_model_name}_training_history.png")
                img = self._encode_png(history_path)
                if img is not None:
                    html_content += self._plot_container("Training History", img, " " * 20)

                # Confusion matrix of the best model. It is rendered here
                # because nothing earlier in this pipeline writes that image.
                img = self._encode_png(self._confusion_matrix_png(best_model_name))
                if img is not None:
                    html_content += self._plot_container("Confusion Matrix", img, " " * 20)

                html_content += "</div></div>"

            # 8. Close the HTML document
            html_content += "</body></html>"

            # 9. Save the report
            report_path = os.path.join(self.output_dir, "full_report.html")
            with open(report_path, "w", encoding="utf-8") as f:
                f.write(html_content)

            print(f"Report successfully generated: {report_path}")
            return html_content

        except Exception as e:
            print(f"Error generating report: {str(e)}")
            raise

    def run_pipeline(self, epochs=10, batch_size=32):
        """Run the full analysis: load, train, analyse trends, build the report.

        Args:
            epochs: Forwarded to ``train_models``.
            batch_size: Forwarded to ``train_models``.

        Returns:
            The HTML of the full report as a string (the parent's
            ``run_pipeline`` returns a dict instead).

        Raises:
            Exception: Any failure is printed and re-raised.
        """
        try:
            # 1. Load the data
            self.load_and_split_data()

            # 2. Preprocess and vectorise the text
            self.preprocess_text()
            self.vectorize_text()

            # 3. Train the models
            self.train_models(epochs=epochs, batch_size=batch_size)

            # 4. Analyse temporal trends
            self.analyze_temporal_trends()

            # 5. Visualise the results
            self.plot_temporal_trends()

            # 6. Generate the report
            return self.generate_full_report()

        except Exception as e:
            print(f"Pipeline failed: {str(e)}")
            raise

In [None]:
# Initialise the pipeline
pipeline = SocialMediaAnalysisPipeline(
    datasetPath="/content/comments_dataset.csv",
    text_column="text",
    label_column="sentiment",
    model_list=['CNN','RNN', 'LSTM','GRU','BiRNN']
)
# Longer model list kept for reference (presumably every supported name -- confirm):
#        ['MLP', 'CNN','RNN', 'LSTM','GRU','BiRNN','BiLSTM', 'CNN_LSTM',  'Transformer']
# Run the full analysis
results = pipeline.run_pipeline(epochs=10)