<a href="https://colab.research.google.com/github/CodeHunterOfficial/ABC_DataMining/blob/main/NV/%D0%90%D0%BD%D0%B0%D0%BB%D0%B8%D0%B7_%D0%B8_%D0%BE%D0%B1%D1%80%D0%B0%D0%B1%D0%BE%D1%82%D0%BA%D0%B0_%D1%81%D0%BE%D1%86%D0%B8%D0%B0%D0%BB%D1%8C%D0%BD%D1%8B%D1%85_%D0%BC%D0%B5%D0%B4%D0%B8%D0%B0_%D1%82%D0%B5%D0%BA%D1%81%D1%82%D0%BE%D0%B2_%D0%B4%D0%BB%D1%8F_%D0%B2%D1%8B%D1%8F%D0%B2%D0%BB%D0%B5%D0%BD%D0%B8%D1%8F_%D1%82%D0%B5%D0%BD%D0%B4%D0%B5%D0%BD%D1%86%D0%B8%D0%B9_%D0%B8_%D0%BD%D0%B0%D1%81%D1%82%D1%80%D0%BE%D0%B5%D0%BD%D0%B8%D0%B9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Импортируем необходимые библиотеки
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from transformers import pipeline

# Download the NLTK resources used below (tokenizer models, stop-word lists, WordNet)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the data set (https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment/data)
# NOTE(review): the recorded output of this cell ends with
# "HTTPError: HTTP Error 404: Not Found", presumably raised by read_csv below —
# confirm that this mirror URL is still alive or point it at another copy of Tweets.csv.
data_source_url = "https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv"
airline_tweets = pd.read_csv(data_source_url)

# Text preprocessing
# Lazily-filled cache so the stop-word set and the lemmatizer are built once,
# not once per tweet.
_ENGLISH_NLP_CACHE = {}


def preprocess_text(text):
    """Clean an English tweet and return its lemmatized tokens joined by spaces.

    Steps: strip URLs, @mentions and '#' signs, replace every non-Latin-letter
    character with a space, lower-case, tokenize, drop English stop words and
    lemmatize the remaining tokens.

    :param text: raw tweet text; any non-string value (e.g. NaN from a
        missing CSV cell, or None) is treated as an empty document.
    :return: cleaned text, or "" when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None coming from pandas.
        return ""

    # Remove links, mentions, hash signs and special characters
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r"[^a-zA-Z]", " ", text)  # keep letters only
    text = text.lower()

    if not text.strip():
        # Nothing to tokenize; skip the NLTK round trip.
        return ""

    if not _ENGLISH_NLP_CACHE:
        _ENGLISH_NLP_CACHE['stop_words'] = set(stopwords.words('english'))
        _ENGLISH_NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    stop_words = _ENGLISH_NLP_CACHE['stop_words']
    lemmatizer = _ENGLISH_NLP_CACHE['lemmatizer']

    # Tokenize, drop stop words and lemmatize
    tokens = word_tokenize(text)
    return " ".join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)

# Apply the preprocessing to the "text" column
airline_tweets['clean_lemmatized_text'] = airline_tweets['text'].apply(preprocess_text)

# Vectorize the cleaned text with TF-IDF (at most 3000 features);
# .toarray() converts the result into a dense array
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(airline_tweets['clean_lemmatized_text']).toarray()

# Encode the class labels as integers (sentiments missing from the mapping become NaN)
y = airline_tweets['airline_sentiment'].map({'positive': 2, 'neutral': 1, 'negative': 0})

# Split into training and test sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to compare, as (display name, estimator) pairs
models = [
    ('Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('SVM', SVC(kernel='linear', random_state=42))
]

# Model evaluation helper
def evaluate_model(model, name, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split, then print and plot test metrics.

    Prints accuracy, the confusion matrix and the classification report, and
    draws the confusion matrix as a heatmap. The tick labels are hard-coded
    in the order negative / neutral / positive.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    matrix = confusion_matrix(y_test, predicted)

    print(f"\n{name} Accuracy: {accuracy_score(y_test, predicted):.4f}")
    print("Confusion Matrix:")
    print(matrix)
    print("\nClassification Report:")
    print(classification_report(y_test, predicted))

    # Confusion-matrix heatmap
    class_names = ['negative', 'neutral', 'positive']
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix ({name})')
    plt.show()

# Learning-curve plotting helper
def plot_learning_curve(model, X, y, name):
    """Plot training and validation accuracy of ``model`` against training-set size.

    Scores come from ``learning_curve`` with 5-fold cross-validation at ten
    training sizes from 10% to 100%; the shaded bands show one standard
    deviation across folds.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        model, X, y, cv=5, scoring='accuracy', train_sizes=np.linspace(0.1, 1.0, 10)
    )

    plt.figure(figsize=(10, 6))
    curves = (
        (fit_scores, 'Training Accuracy', 'blue', 'o'),
        (cv_scores, 'Validation Accuracy', 'green', 's'),
    )
    for fold_scores, legend_label, colour, marker in curves:
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        plt.plot(sizes, centre, label=legend_label, color=colour, marker=marker)
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.15, color=colour)

    plt.title(f'Learning Curve ({name})')
    plt.xlabel('Training Set Size')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

# Topic modeling (LDA)
def topic_modeling():
    """Fit a 5-topic LDA model on the cleaned tweets and print 10 top words per topic."""
    tfidf = TfidfVectorizer(max_features=3000)
    doc_term = tfidf.fit_transform(airline_tweets['clean_lemmatized_text'])

    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(doc_term)

    vocabulary = tfidf.get_feature_names_out()
    n_top_words = 10
    for topic_idx, weights in enumerate(lda.components_):
        # argsort is ascending, so walk the tail backwards to get the heaviest terms first
        heaviest = weights.argsort()[:-n_top_words - 1:-1]
        print(f"Topic #{topic_idx}:")
        print(" ".join([vocabulary[term_id] for term_id in heaviest]))

# Sentiment trends over time
def sentiment_trends_over_time():
    """Plot the daily number of tweets for each sentiment class.

    Converts ``airline_tweets['tweet_created']`` to datetimes in place before
    grouping by day and sentiment.
    """
    airline_tweets['tweet_created'] = pd.to_datetime(airline_tweets['tweet_created'])

    by_day = pd.Grouper(key='tweet_created', freq='D')
    tweet_counts = airline_tweets.groupby([by_day, 'airline_sentiment']).size()
    daily_counts = tweet_counts.unstack(fill_value=0)

    plt.figure(figsize=(12, 6))
    for mood in ('positive', 'neutral', 'negative'):
        plt.plot(daily_counts.index, daily_counts[mood], label=mood.capitalize())

    plt.title('Sentiment Trends Over Time')
    plt.xlabel('Date')
    plt.ylabel('Number of Tweets')
    plt.legend()
    plt.show()

# Sentiment comparison across airlines
def sentiment_by_airline():
    """Draw a stacked bar chart of each airline's sentiment proportions."""
    counts = airline_tweets.groupby(['airline', 'airline_sentiment']).size().unstack(fill_value=0)
    # Normalize every airline's row so its sentiment shares sum to 1
    per_airline_total = counts.sum(axis=1)
    shares = counts.div(per_airline_total, axis=0)

    shares.plot(kind='bar', stacked=True, figsize=(12, 6))
    plt.title('Sentiment Distribution by Airline')
    plt.xlabel('Airline')
    plt.ylabel('Proportion of Sentiment')
    plt.show()

# Interactive visualization
def interactive_visualization():
    """Show an interactive Plotly bar chart of sentiment proportions per airline.

    The proportions are computed here from ``airline_tweets``. The module-level
    name ``sentiment_by_airline`` is a plotting function, not a DataFrame, so
    it cannot be used as the data source.
    """
    counts = airline_tweets.groupby(['airline', 'airline_sentiment']).size().unstack(fill_value=0)
    proportions = counts.div(counts.sum(axis=1), axis=0)

    # Only plot sentiment columns that actually occur in the data.
    sentiment_columns = [name for name in ['positive', 'neutral', 'negative']
                         if name in proportions.columns]

    fig = px.bar(proportions.reset_index(), x='airline', y=sentiment_columns,
                 title='Sentiment Distribution by Airline', labels={'value': 'Proportion'})
    fig.show()

# Prediction on new data
def predict_sentiment(new_texts, vectorizer, model):
    """Return the predicted sentiment name for each text in ``new_texts``.

    Each text is cleaned with ``preprocess_text``, transformed with the given
    ``vectorizer`` and classified by ``model``; the numeric predictions are
    mapped to 'negative' / 'neutral' / 'positive'.
    """
    label_names = {2: 'positive', 1: 'neutral', 0: 'negative'}
    cleaned = list(map(preprocess_text, new_texts))
    features = vectorizer.transform(cleaned).toarray()
    return [label_names[code] for code in model.predict(features)]

# Main script
if __name__ == "__main__":
    # Evaluate every model and plot its learning curve on the training split
    for name, model in models:
        print(f"Evaluating {name}...")
        evaluate_model(model, name, X_train, X_test, y_train, y_test)
        plot_learning_curve(model, X_train, y_train, name)

    # Topic modeling
    print("\nTopic Modeling (LDA):")
    topic_modeling()

    # Sentiment trends over time
    print("\nSentiment Trends Over Time:")
    sentiment_trends_over_time()

    # Sentiment comparison across airlines
    print("\nSentiment Distribution by Airline:")
    sentiment_by_airline()

    # Interactive visualization
    print("\nInteractive Visualization:")
    interactive_visualization()

    # Try the classifier on new data
    new_tweets = [
        "I love the service provided by Virgin America!",
        "The flight was delayed and the staff was rude.",
        "The seats were comfortable but the food was terrible."
    ]
    svm_model = models[3][1]  # the SVM entry of `models` (already fitted by the loop above)
    predicted_sentiments = predict_sentiment(new_tweets, vectorizer, svm_model)
    print("\nPredicted Sentiments for New Tweets:")
    for tweet, sentiment in zip(new_tweets, predicted_sentiments):
        print(f"Tweet: \"{tweet}\" → Sentiment: {sentiment}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


HTTPError: HTTP Error 404: Not Found

In [None]:
# Импортируем необходимые библиотеки
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Download the NLTK resources needed for Russian text (tokenizer models and stop-word lists)
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load the data
# NOTE(review): these are /kaggle/input paths — presumably this cell is meant
# to run inside a Kaggle kernel with the data set attached; confirm.
train_data = pd.read_csv('/kaggle/input/russian-social-media-text-classification/train.csv')
test_data = pd.read_csv('/kaggle/input/russian-social-media-text-classification/test.csv')

# Text preprocessing
# Lazily-filled cache so the stop-word set and the stemmer are built once,
# not once per document.
_RUSSIAN_NLP_CACHE = {}


def preprocess_text(text):
    """Clean a Russian text and return its stemmed tokens joined by spaces.

    Steps: strip URLs, @mentions and '#' signs, replace every character that
    is not a Russian letter with a space, lower-case, tokenize, drop Russian
    stop words and stem the remaining tokens with the Snowball stemmer.

    :param text: raw text; any non-string value (e.g. NaN from a missing CSV
        cell, or None) is treated as an empty document.
    :return: cleaned text, or "" when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None coming from pandas.
        return ""

    # Remove links, mentions, hash signs and special characters
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r"[^а-яА-ЯёЁ]", " ", text)  # keep Russian letters only
    text = text.lower()

    if not text.strip():
        # Nothing to tokenize; skip the NLTK round trip.
        return ""

    if not _RUSSIAN_NLP_CACHE:
        _RUSSIAN_NLP_CACHE['stop_words'] = set(stopwords.words('russian'))
        _RUSSIAN_NLP_CACHE['stemmer'] = SnowballStemmer('russian')
    stop_words = _RUSSIAN_NLP_CACHE['stop_words']
    stemmer = _RUSSIAN_NLP_CACHE['stemmer']

    # Tokenize, drop stop words and stem
    tokens = word_tokenize(text, language='russian')
    return " ".join(stemmer.stem(word) for word in tokens if word not in stop_words)

# Apply the preprocessing to the "text" column of both data sets
train_data['clean_text'] = train_data['text'].apply(preprocess_text)
test_data['clean_text'] = test_data['text'].apply(preprocess_text)

# Vectorize with TF-IDF: the vocabulary is fitted on the training texts only
# and then reused to transform the test texts
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(train_data['clean_text']).toarray()
X_test = vectorizer.transform(test_data['clean_text']).toarray()

# Class labels are taken as-is from the "category" column (no numeric encoding)
y_train = train_data['category']
y_test = test_data['category']

# Models to compare, as (display name, estimator) pairs
models = [
    ('Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('SVM', SVC(kernel='linear', random_state=42))
]

# Model evaluation helper
def evaluate_model(model, name, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data, then print and plot test metrics.

    Prints accuracy, the confusion matrix and the classification report, and
    draws the confusion matrix as a heatmap.

    The heatmap tick labels are the sorted union of true and predicted labels,
    passed explicitly to ``confusion_matrix``. This keeps rows and columns
    aligned with the ticks even when the model predicts a category that is
    absent from ``y_test``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    labels = sorted(set(y_test) | set(y_pred))
    matrix = confusion_matrix(y_test, y_pred, labels=labels)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:")
    print(matrix)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion-matrix heatmap
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix ({name})')
    plt.show()

# Topic modeling (LDA)
def topic_modeling():
    """Fit a 5-topic LDA model on the cleaned training texts and print 10 top words per topic."""
    tfidf = TfidfVectorizer(max_features=3000)
    doc_term = tfidf.fit_transform(train_data['clean_text'])

    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(doc_term)

    vocabulary = tfidf.get_feature_names_out()
    n_top_words = 10
    for topic_idx, weights in enumerate(lda.components_):
        # argsort is ascending, so walk the tail backwards to get the heaviest terms first
        heaviest = weights.argsort()[:-n_top_words - 1:-1]
        print(f"Topic #{topic_idx}:")
        print(" ".join([vocabulary[term_id] for term_id in heaviest]))

# Category distribution
def category_distribution():
    """Draw a bar chart with the number of training texts in each category."""
    per_category = train_data['category'].value_counts()
    names, totals = per_category.index, per_category.values

    plt.figure(figsize=(10, 6))
    sns.barplot(x=names, y=totals, palette='viridis')
    plt.title('Category Distribution')
    plt.xlabel('Category')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()

# Interactive category distribution
def interactive_category_distribution():
    """Show an interactive Plotly bar chart of training texts per category."""
    per_category = train_data['category'].value_counts()
    table = per_category.reset_index()
    table.columns = ['Category', 'Count']

    chart = px.bar(table, x='Category', y='Count', title='Category Distribution', color='Category')
    chart.show()

# Prediction on new data
def predict_new_texts(new_texts, vectorizer, model):
    """Clean ``new_texts``, vectorize them and return the model's predictions."""
    cleaned = list(map(preprocess_text, new_texts))
    features = vectorizer.transform(cleaned).toarray()
    return model.predict(features)

# Main script
if __name__ == "__main__":
    # Evaluate every model
    for name, model in models:
        print(f"Evaluating {name}...")
        evaluate_model(model, name, X_train, X_test, y_train, y_test)

    # Topic modeling
    print("\nTopic Modeling (LDA):")
    topic_modeling()

    # Category distribution
    print("\nCategory Distribution:")
    category_distribution()

    # Interactive category distribution
    print("\nInteractive Category Distribution:")
    interactive_category_distribution()

    # Try the classifier on new data
    new_texts = [
        "Хоккей — это отличный вид спорта!",
        "Футболисты показали невероятную игру вчера.",
        "Гонщики выступили на высшем уровне в этом сезоне."
    ]
    svm_model = models[3][1]  # the SVM entry of `models` (already fitted by the loop above)
    predicted_categories = predict_new_texts(new_texts, vectorizer, svm_model)
    print("\nPredicted Categories for New Texts:")
    for text, category in zip(new_texts, predicted_categories):
        print(f"Text: \"{text}\" → Category: {category}")


### Возможные улучшения:
1. **Балансировка данных**: Если данные несбалансированы, можно применить методы oversampling или undersampling.
2. **Настройка гиперпараметров**: Использовать GridSearchCV или RandomizedSearchCV для подбора гиперпараметров моделей.
3. **Другие методы векторизации**: Попробовать использовать Word2Vec, GloVe или BERT для получения более качественных эмбеддингов.
4. **Ансамблирование моделей**: Использовать ансамбли моделей для улучшения качества предсказаний.
5. **Кросс-валидация**: Вместо простого разделения на train/test использовать кросс-валидацию для более надежной оценки моделей.
6. **Логирование и сохранение моделей**: Добавить логирование процесса обучения и сохранение обученных моделей для последующего использования.


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
import joblib
import logging
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from transformers import BertTokenizer, BertModel
import torch

# Download the NLTK resources needed for Russian text (tokenizer models and stop-word lists)
nltk.download('punkt')
nltk.download('stopwords')

class TextClassifier:
    """Clean Russian texts, vectorize them and train a hard-voting ensemble.

    The vectorization backend ('tfidf', 'word2vec' or 'bert') is chosen at
    construction time. Metrics and progress are reported through the root
    ``logging`` logger.
    """

    # Lazily-created NLTK helpers, built on the first preprocess_text() call
    # that needs them instead of once per document.
    _stop_words = None
    _stemmer = None
    # Word2Vec model trained on the first corpus given to word2vec_vectorization();
    # later calls reuse it so train and test vectors share one embedding space.
    word2vec_model = None

    def __init__(self, vectorization_method='tfidf', balance_data_flag=True, log_file='model_training.log'):
        """Set up the candidate models and logging.

        :param vectorization_method: 'tfidf', 'word2vec' or 'bert'.
        :param balance_data_flag: when true, ``run`` oversamples the training
            set with SMOTE.
        :param log_file: file name passed to ``logging.basicConfig``. That call
            is a no-op if the root logger already has handlers.
        """
        self.vectorization_method = vectorization_method
        self.balance_data_flag = balance_data_flag
        self.vectorizer = None
        self.models = [
            ('Naive Bayes', MultinomialNB()),
            ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
            ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('SVM', SVC(kernel='linear', random_state=42))
        ]
        self.best_models = {}
        self.ensemble_model = None
        self.log_file = log_file
        logging.basicConfig(filename=log_file, level=logging.INFO,
                            format='%(asctime)s - %(levelname)s - %(message)s')

    def preprocess_text(self, text):
        """Clean one text and return its stemmed tokens joined by spaces.

        Strips URLs, @mentions and '#', keeps Russian letters only,
        lower-cases, tokenizes, removes Russian stop words and stems.

        :param text: raw text; non-string values (NaN, None) yield "".
        :return: cleaned text, or "" when nothing is left after cleaning.
        """
        if not isinstance(text, str):
            # re.sub would raise TypeError on NaN/None coming from pandas.
            return ""
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
        text = re.sub(r'\@\w+|\#', '', text)
        text = re.sub(r"[^а-яА-ЯёЁ]", " ", text)
        text = text.lower()
        if not text.strip():
            return ""
        if self._stop_words is None or self._stemmer is None:
            self._stop_words = set(stopwords.words('russian'))
            self._stemmer = SnowballStemmer('russian')
        tokens = word_tokenize(text, language='russian')
        return " ".join(self._stemmer.stem(word) for word in tokens if word not in self._stop_words)

    def vectorize_text(self, texts, method='tfidf'):
        """Turn ``texts`` into a 2-D feature array with the chosen method.

        For 'tfidf' the first call fits the vectorizer and later calls only
        transform, so call it on the training texts first.

        :raises ValueError: if ``method`` is not 'tfidf', 'word2vec' or 'bert'.
        """
        if method == 'tfidf':
            if self.vectorizer is None:
                self.vectorizer = TfidfVectorizer(max_features=3000)
                return self.vectorizer.fit_transform(texts).toarray()
            return self.vectorizer.transform(texts).toarray()
        elif method == 'word2vec':
            return self.word2vec_vectorization(texts)
        elif method == 'bert':
            return self.bert_vectorization(texts)
        else:
            raise ValueError("Неверный метод векторизации. Выберите 'tfidf', 'word2vec' или 'bert'.")

    def word2vec_vectorization(self, texts, vector_size=100, window=5, min_count=1):
        """Represent each text as the mean of its Word2Vec word vectors.

        The model is trained on the first corpus passed in and reused for
        every later call. Retraining on each corpus would put train and test
        vectors in unrelated spaces. The training parameters therefore only
        take effect on the first call. Texts without any known word map to a
        zero vector.
        """
        tokenized_texts = [word_tokenize(text) for text in texts]
        if self.word2vec_model is None:
            self.word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size,
                                           window=window, min_count=min_count)
        word_vectors = self.word2vec_model.wv
        dimension = word_vectors.vector_size
        vectors = np.array([
            np.mean([word_vectors[word] for word in words if word in word_vectors]
                    or [np.zeros(dimension)], axis=0)
            for words in tokenized_texts
        ])
        return vectors

    def bert_vectorization(self, texts, batch_size=32):
        """Embed texts with multilingual BERT using mask-aware mean pooling.

        ``texts`` is converted to a plain list of strings because the
        tokenizer does not accept a pandas Series. Texts are encoded in
        batches of ``batch_size`` to bound memory. Padding positions are
        excluded from the mean, so a text's embedding does not depend on the
        other texts in its batch.
        """
        documents = [str(text) for text in texts]
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        model = BertModel.from_pretrained('bert-base-multilingual-cased')
        model.eval()
        pooled_batches = []
        with torch.no_grad():
            for start in range(0, len(documents), batch_size):
                inputs = tokenizer(documents[start:start + batch_size], return_tensors='pt',
                                   padding=True, truncation=True, max_length=512)
                hidden = model(**inputs).last_hidden_state
                mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
                pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
                pooled_batches.append(pooled.numpy())
        if not pooled_batches:
            return np.empty((0, model.config.hidden_size))
        return np.vstack(pooled_batches)

    def balance_data(self, X, y):
        """Oversample minority classes with SMOTE and return ``(X, y)``."""
        smote = SMOTE(random_state=42)
        X_balanced, y_balanced = smote.fit_resample(X, y)
        return X_balanced, y_balanced

    def hyperparameter_tuning(self, model, param_grid, X, y):
        """Grid-search ``param_grid`` with 5-fold CV and return the best estimator."""
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X, y)
        logging.info(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
        logging.info(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
        return grid_search.best_estimator_

    def train_models(self, X_train, y_train):
        """Tune the tunable models and fill ``self.best_models``.

        Logistic Regression and Random Forest are grid-searched; the other
        models are kept with their constructor settings. MultinomialNB is left
        out when the features contain negative values (as word2vec/BERT
        embeddings do), because it rejects such input.
        """
        has_negative_features = np.size(X_train) > 0 and np.min(X_train) < 0
        for name, model in self.models:
            if has_negative_features and isinstance(model, MultinomialNB):
                logging.warning("Skipping %s: features contain negative values", name)
                self.best_models.pop(name, None)
                continue
            if name == 'Logistic Regression':
                param_grid = {'C': [0.1, 1, 10], 'penalty': ['l2']}
                best_model = self.hyperparameter_tuning(model, param_grid, X_train, y_train)
            elif name == 'Random Forest':
                param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
                best_model = self.hyperparameter_tuning(model, param_grid, X_train, y_train)
            else:
                best_model = model
            self.best_models[name] = best_model

    def ensemble_models(self, X_train, y_train):
        """Fit a hard-voting ensemble over every model in ``self.best_models``."""
        estimators = list(self.best_models.items())
        self.ensemble_model = VotingClassifier(estimators=estimators, voting='hard')
        self.ensemble_model.fit(X_train, y_train)

    def evaluate_model(self, model, name, X_test, y_test):
        """Log accuracy, confusion matrix and classification report; return accuracy."""
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        logging.info(f"{name} Accuracy: {accuracy:.4f}")
        logging.info("Confusion Matrix:")
        logging.info(confusion_matrix(y_test, y_pred))
        logging.info("Classification Report:")
        logging.info(classification_report(y_test, y_pred))
        return accuracy

    def save_model(self, model, filename):
        """Serialize ``model`` to ``filename`` with joblib."""
        joblib.dump(model, filename)
        logging.info(f"Model saved to {filename}")

    def run(self, train_data, test_data):
        """Run the whole pipeline: clean, vectorize, balance, train, evaluate, save.

        Adds a 'clean_text' column to both DataFrames in place. Both must have
        'text' and 'category' columns.
        """
        # Text preprocessing
        train_data['clean_text'] = train_data['text'].apply(self.preprocess_text)
        test_data['clean_text'] = test_data['text'].apply(self.preprocess_text)

        # Vectorization (training texts first, so fitted state is reused for the test texts)
        X_train = self.vectorize_text(train_data['clean_text'], method=self.vectorization_method)
        X_test = self.vectorize_text(test_data['clean_text'], method=self.vectorization_method)
        y_train = train_data['category']
        y_test = test_data['category']

        # Class balancing (training set only)
        if self.balance_data_flag:
            X_train, y_train = self.balance_data(X_train, y_train)

        # Training and tuning
        self.train_models(X_train, y_train)

        # Ensembling
        self.ensemble_models(X_train, y_train)

        # Ensemble evaluation
        ensemble_accuracy = self.evaluate_model(self.ensemble_model, "Ensemble Model", X_test, y_test)
        logging.info(f"Ensemble Model Accuracy: {ensemble_accuracy:.4f}")

        # Persist the ensemble
        self.save_model(self.ensemble_model, 'best_ensemble_model.pkl')

# Usage example
if __name__ == "__main__":
    # Load the data
    train_data = pd.read_csv('/kaggle/input/russian-social-media-text-classification/train.csv')
    test_data = pd.read_csv('/kaggle/input/russian-social-media-text-classification/test.csv')

    # Create the classifier and run the full pipeline
    classifier = TextClassifier(vectorization_method='tfidf', balance_data_flag=True)
    classifier.run(train_data, test_data)

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
import joblib
import logging
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from transformers import BertTokenizer, BertModel
import torch
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

# Download the NLTK resources needed for Russian text (tokenizer models and stop-word lists)
nltk.download('punkt')
nltk.download('stopwords')

class TextClassifier:
    """Clean Russian texts, vectorize them, train a hard-voting ensemble and plot diagnostics.

    The vectorization backend ('tfidf', 'word2vec' or 'bert') is chosen at
    construction time. Metrics and progress are reported through the root
    ``logging`` logger.
    """

    # Lazily-created NLTK helpers, built on the first preprocess_text() call
    # that needs them instead of once per document.
    _stop_words = None
    _stemmer = None
    # Word2Vec model trained on the first corpus given to word2vec_vectorization();
    # later calls reuse it so train, test and new texts share one embedding space.
    word2vec_model = None

    def __init__(self, vectorization_method='tfidf', balance_data_flag=True, log_file='model_training.log'):
        """Set up the candidate models and logging.

        :param vectorization_method: 'tfidf', 'word2vec' or 'bert'.
        :param balance_data_flag: when true, ``run`` oversamples the training
            set with SMOTE.
        :param log_file: file name passed to ``logging.basicConfig``. That call
            is a no-op if the root logger already has handlers.
        """
        self.vectorization_method = vectorization_method
        self.balance_data_flag = balance_data_flag
        self.vectorizer = None
        self.models = [
            ('Naive Bayes', MultinomialNB()),
            ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
            ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('SVM', SVC(kernel='linear', random_state=42))
        ]
        self.best_models = {}
        self.ensemble_model = None
        self.log_file = log_file
        logging.basicConfig(filename=log_file, level=logging.INFO,
                            format='%(asctime)s - %(levelname)s - %(message)s')

    def preprocess_text(self, text):
        """Clean one text and return its stemmed tokens joined by spaces.

        Strips URLs, @mentions and '#', keeps Russian letters only,
        lower-cases, tokenizes, removes Russian stop words and stems.

        :param text: raw text; non-string values (NaN, None) yield "".
        :return: cleaned text, or "" when nothing is left after cleaning.
        """
        if not isinstance(text, str):
            # re.sub would raise TypeError on NaN/None coming from pandas.
            return ""
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
        text = re.sub(r'\@\w+|\#', '', text)
        text = re.sub(r"[^а-яА-ЯёЁ]", " ", text)
        text = text.lower()
        if not text.strip():
            return ""
        if self._stop_words is None or self._stemmer is None:
            self._stop_words = set(stopwords.words('russian'))
            self._stemmer = SnowballStemmer('russian')
        tokens = word_tokenize(text, language='russian')
        return " ".join(self._stemmer.stem(word) for word in tokens if word not in self._stop_words)

    def vectorize_text(self, texts, method='tfidf'):
        """Turn ``texts`` into a 2-D feature array with the chosen method.

        For 'tfidf' the first call fits the vectorizer and later calls only
        transform, so call it on the training texts first.

        :raises ValueError: if ``method`` is not 'tfidf', 'word2vec' or 'bert'.
        """
        if method == 'tfidf':
            if self.vectorizer is None:
                self.vectorizer = TfidfVectorizer(max_features=3000)
                return self.vectorizer.fit_transform(texts).toarray()
            return self.vectorizer.transform(texts).toarray()
        elif method == 'word2vec':
            return self.word2vec_vectorization(texts)
        elif method == 'bert':
            return self.bert_vectorization(texts)
        else:
            raise ValueError("Неверный метод векторизации. Выберите 'tfidf', 'word2vec' или 'bert'.")

    def word2vec_vectorization(self, texts, vector_size=100, window=5, min_count=1):
        """Represent each text as the mean of its Word2Vec word vectors.

        The model is trained on the first corpus passed in and reused for
        every later call. Retraining on each corpus would put train and test
        vectors in unrelated spaces. The training parameters therefore only
        take effect on the first call. Texts without any known word map to a
        zero vector.
        """
        tokenized_texts = [word_tokenize(text) for text in texts]
        if self.word2vec_model is None:
            self.word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size,
                                           window=window, min_count=min_count)
        word_vectors = self.word2vec_model.wv
        dimension = word_vectors.vector_size
        vectors = np.array([
            np.mean([word_vectors[word] for word in words if word in word_vectors]
                    or [np.zeros(dimension)], axis=0)
            for words in tokenized_texts
        ])
        return vectors

    def bert_vectorization(self, texts, batch_size=32):
        """Embed texts with multilingual BERT using mask-aware mean pooling.

        ``texts`` is converted to a plain list of strings because the
        tokenizer does not accept a pandas Series. Texts are encoded in
        batches of ``batch_size`` to bound memory. Padding positions are
        excluded from the mean, so a text's embedding does not depend on the
        other texts in its batch.
        """
        documents = [str(text) for text in texts]
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        model = BertModel.from_pretrained('bert-base-multilingual-cased')
        model.eval()
        pooled_batches = []
        with torch.no_grad():
            for start in range(0, len(documents), batch_size):
                inputs = tokenizer(documents[start:start + batch_size], return_tensors='pt',
                                   padding=True, truncation=True, max_length=512)
                hidden = model(**inputs).last_hidden_state
                mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
                pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
                pooled_batches.append(pooled.numpy())
        if not pooled_batches:
            return np.empty((0, model.config.hidden_size))
        return np.vstack(pooled_batches)

    def balance_data(self, X, y):
        """Oversample minority classes with SMOTE and return ``(X, y)``."""
        smote = SMOTE(random_state=42)
        X_balanced, y_balanced = smote.fit_resample(X, y)
        return X_balanced, y_balanced

    def hyperparameter_tuning(self, model, param_grid, X, y):
        """Grid-search ``param_grid`` with 5-fold CV and return the best estimator."""
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X, y)
        logging.info(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
        logging.info(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
        return grid_search.best_estimator_

    def train_models(self, X_train, y_train):
        """Tune the tunable models and fill ``self.best_models``.

        Logistic Regression and Random Forest are grid-searched; the other
        models are kept with their constructor settings. MultinomialNB is left
        out when the features contain negative values (as word2vec/BERT
        embeddings do), because it rejects such input.
        """
        has_negative_features = np.size(X_train) > 0 and np.min(X_train) < 0
        for name, model in self.models:
            if has_negative_features and isinstance(model, MultinomialNB):
                logging.warning("Skipping %s: features contain negative values", name)
                self.best_models.pop(name, None)
                continue
            if name == 'Logistic Regression':
                param_grid = {'C': [0.1, 1, 10], 'penalty': ['l2']}
                best_model = self.hyperparameter_tuning(model, param_grid, X_train, y_train)
            elif name == 'Random Forest':
                param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
                best_model = self.hyperparameter_tuning(model, param_grid, X_train, y_train)
            else:
                best_model = model
            self.best_models[name] = best_model

    def ensemble_models(self, X_train, y_train):
        """Fit a hard-voting ensemble over every model in ``self.best_models``."""
        estimators = list(self.best_models.items())
        self.ensemble_model = VotingClassifier(estimators=estimators, voting='hard')
        self.ensemble_model.fit(X_train, y_train)

    def evaluate_model(self, model, name, X_test, y_test):
        """Log accuracy, confusion matrix and classification report; return accuracy."""
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        logging.info(f"{name} Accuracy: {accuracy:.4f}")
        logging.info("Confusion Matrix:")
        logging.info(confusion_matrix(y_test, y_pred))
        logging.info("Classification Report:")
        logging.info(classification_report(y_test, y_pred))
        return accuracy

    def save_model(self, model, filename):
        """Serialize ``model`` to ``filename`` with joblib."""
        joblib.dump(model, filename)
        logging.info(f"Model saved to {filename}")

    def topic_modeling(self, train_data):
        """Fit a 5-topic LDA model on ``train_data['clean_text']`` and print 10 top words per topic."""
        tfidf_vectorizer = TfidfVectorizer(max_features=3000)
        X_tfidf = tfidf_vectorizer.fit_transform(train_data['clean_text'])
        lda = LatentDirichletAllocation(n_components=5, random_state=42)
        lda.fit(X_tfidf)

        feature_names = tfidf_vectorizer.get_feature_names_out()
        n_top_words = 10
        for topic_idx, topic in enumerate(lda.components_):
            # argsort is ascending, so walk the tail backwards to get the heaviest terms first
            print(f"Topic #{topic_idx}:")
            print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

    def category_distribution(self, train_data):
        """Draw a bar chart with the number of texts in each category."""
        category_counts = train_data['category'].value_counts()
        plt.figure(figsize=(10, 6))
        sns.barplot(x=category_counts.index, y=category_counts.values, palette='viridis')
        plt.title('Category Distribution')
        plt.xlabel('Category')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.show()

    def interactive_category_distribution(self, train_data):
        """Show an interactive Plotly bar chart of texts per category."""
        category_counts = train_data['category'].value_counts().reset_index()
        category_counts.columns = ['Category', 'Count']
        fig = px.bar(category_counts, x='Category', y='Count', title='Category Distribution', color='Category')
        fig.show()

    def predict_new_texts(self, new_texts):
        """Predict categories for raw ``new_texts`` with the trained ensemble.

        The texts are vectorized with the same method the classifier was
        configured with, so 'word2vec' and 'bert' work as well as 'tfidf'.

        :raises RuntimeError: if the ensemble has not been trained yet. Without
            this guard, the TF-IDF vectorizer could be fitted on the new texts.
        """
        if self.ensemble_model is None:
            raise RuntimeError("The ensemble model is not trained; call run() first.")
        new_texts_preprocessed = [self.preprocess_text(text) for text in new_texts]
        new_texts_vectorized = self.vectorize_text(new_texts_preprocessed,
                                                   method=self.vectorization_method)
        predictions = self.ensemble_model.predict(new_texts_vectorized)
        return predictions

    def plot_learning_curve(self, model, X, y, name):
        """Plot training and validation accuracy of ``model`` against training-set size.

        Uses ``learning_curve`` with 5-fold CV at ten sizes from 10% to 100%;
        shaded bands show one standard deviation across folds.
        """
        train_sizes, train_scores, test_scores = learning_curve(
            model, X, y, cv=5, scoring='accuracy', train_sizes=np.linspace(0.1, 1.0, 10)
        )
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)

        plt.figure(figsize=(10, 6))
        plt.plot(train_sizes, train_mean, label='Training Accuracy', color='blue', marker='o')
        plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.15, color='blue')
        plt.plot(train_sizes, test_mean, label='Validation Accuracy', color='green', marker='s')
        plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.15, color='green')
        plt.title(f'Learning Curve ({name})')
        plt.xlabel('Training Set Size')
        plt.ylabel('Accuracy')
        plt.legend(loc='best')
        plt.grid()
        plt.show()

    def plot_tsne(self, X, y):
        """Project ``X`` to 2-D with t-SNE and scatter-plot it coloured by category."""
        tsne = TSNE(n_components=2, random_state=42)
        X_tsne = tsne.fit_transform(X)

        # Encode category names as integers so they can drive the colour map
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        plt.figure(figsize=(10, 8))
        scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_encoded, cmap='viridis', alpha=0.7)
        plt.title('t-SNE Visualization of Data')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.colorbar(scatter, ticks=range(len(le.classes_)), label='Category')
        plt.show()

    def run(self, train_data, test_data):
        """Run the whole pipeline: clean, vectorize, balance, train, evaluate, save, plot.

        Adds a 'clean_text' column to both DataFrames in place. Both must have
        'text' and 'category' columns.
        """
        # Text preprocessing
        train_data['clean_text'] = train_data['text'].apply(self.preprocess_text)
        test_data['clean_text'] = test_data['text'].apply(self.preprocess_text)

        # Vectorization (training texts first, so fitted state is reused for the test texts)
        X_train = self.vectorize_text(train_data['clean_text'], method=self.vectorization_method)
        X_test = self.vectorize_text(test_data['clean_text'], method=self.vectorization_method)
        y_train = train_data['category']
        y_test = test_data['category']

        # Class balancing (training set only)
        if self.balance_data_flag:
            X_train, y_train = self.balance_data(X_train, y_train)

        # Training and tuning
        self.train_models(X_train, y_train)

        # Ensembling
        self.ensemble_models(X_train, y_train)

        # Ensemble evaluation
        ensemble_accuracy = self.evaluate_model(self.ensemble_model, "Ensemble Model", X_test, y_test)
        logging.info(f"Ensemble Model Accuracy: {ensemble_accuracy:.4f}")

        # Persist the ensemble
        self.save_model(self.ensemble_model, 'best_ensemble_model.pkl')

        # Additional analyses
        self.topic_modeling(train_data)
        self.category_distribution(train_data)
        self.interactive_category_distribution(train_data)
        self.plot_learning_curve(self.ensemble_model, X_train, y_train, "Ensemble Model")
        self.plot_tsne(X_train, y_train)

# Usage example
if __name__ == "__main__":
    # Load the data
    train_data = pd.read_csv('/kaggle/input/russian-social-media-text-classification/train.csv')
    test_data = pd.read_csv('/kaggle/input/russian-social-media-text-classification/test.csv')

    # Create the classifier and run the full pipeline
    classifier = TextClassifier(vectorization_method='tfidf', balance_data_flag=True)
    classifier.run(train_data, test_data)

In [1]:
class HTMLReportGenerator:
    """Render a results dictionary as a standalone HTML report.

    Every key of ``results`` is optional. Recognised keys:

    * ``model_metrics`` - list of dicts with ``model_name`` and ``accuracy``.
    * ``learning_curves`` / ``confusion_matrices`` - ``{model name: base64 PNG}``.
    * ``best_model_metrics``, ``best_model_learning_curve``,
      ``best_model_confusion_matrix`` - details of the best model.
    * ``ensemble_accuracy``, ``topics``, ``category_distribution_image``,
      ``interactive_category_distribution_html``, ``learning_curve_image``,
      ``tsne_image``, ``pca_image``, ``temporal_analysis_image``,
      ``sentiments``, ``clusters`` - the keys stored by ``TextClassifier.run``.
    """

    def __init__(self, results):
        """
        Store the analysis results that will be rendered.

        :param results: Dictionary with the analysis results.
        """
        self.results = results

    @staticmethod
    def _image_section(title, image_base64, alt):
        """Return one report section embedding a base64-encoded PNG image."""
        from html import escape

        return (
            "<div class='section'>\n"
            f"  <h3>{escape(str(title))}</h3>\n"
            f"  <img src='data:image/png;base64,{escape(str(image_base64))}' "
            f"alt='{escape(str(alt))}'>\n"
            "</div>\n"
        )

    @staticmethod
    def _counts_section(title, label_header, values):
        """Return a section with a table counting how often each value occurs."""
        from collections import Counter
        from html import escape

        rows = "".join(
            f"    <tr><td>{escape(str(label))}</td><td>{count}</td></tr>\n"
            for label, count in Counter(values).most_common()
        )
        return (
            "<div class='section'>\n"
            f"  <h3>{escape(title)}</h3>\n"
            "  <table>\n"
            f"    <tr><th>{escape(label_header)}</th><th>Count</th></tr>\n"
            f"{rows}"
            "  </table>\n"
            "</div>\n"
        )

    def create_html_report(self, output_path="report.html"):
        """
        Build the HTML report from the stored results and write it to disk.

        All text values are HTML-escaped before being interpolated. The only
        value embedded verbatim is ``interactive_category_distribution_html``,
        which is expected to already be an HTML fragment.

        :param output_path: Path of the HTML file to write.
        """
        from html import escape

        results = self.results

        # Document head; the page is collected as parts and joined once at the end.
        parts = ["""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Text Classification Report</title>
            <style>
                body { font-family: Arial, sans-serif; margin: 20px; }
                h1, h2, h3 { color: #333; }
                table { width: 100%; border-collapse: collapse; margin-bottom: 20px; }
                th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
                th { background-color: #f4f4f4; }
                img { max-width: 100%; height: auto; }
                .section { margin-bottom: 20px; }
            </style>
        </head>
        <body>
        """]

        # Report title.
        parts.append("<h1>Text Classification Report</h1>\n")

        # Model metrics table (rendered even when there are no rows).
        parts.append("<div class='section'>\n")
        parts.append("  <h2>Model Metrics</h2>\n")
        parts.append("  <table>\n")
        parts.append("    <tr><th>Model Name</th><th>Accuracy</th></tr>\n")
        for metric in results.get('model_metrics', []):
            parts.append(
                f"    <tr><td>{escape(str(metric['model_name']))}</td>"
                f"<td>{escape(str(metric['accuracy']))}</td></tr>\n"
            )
        parts.append("  </table>\n")
        parts.append("</div>\n")

        # Accuracy of the ensemble, as stored by TextClassifier.run.
        ensemble_accuracy = results.get('ensemble_accuracy')
        if ensemble_accuracy is not None:
            parts.append(
                "<div class='section'>\n"
                "  <h2>Ensemble Model</h2>\n"
                f"  <p><strong>Accuracy:</strong> {escape(str(ensemble_accuracy))}</p>\n"
                "</div>\n"
            )

        # Per-model plots.
        for name, lc in results.get('learning_curves', {}).items():
            parts.append(
                self._image_section(f"Learning Curve for {name}", lc, f"{name} Learning Curve")
            )
        for name, cm in results.get('confusion_matrices', {}).items():
            parts.append(
                self._image_section(f"Confusion Matrix for {name}", cm, f"{name} Confusion Matrix")
            )

        # Best model summary.
        best_model_metrics = results.get('best_model_metrics', {})
        if best_model_metrics:
            best_name = escape(str(best_model_metrics.get('model_name', 'N/A')))
            best_accuracy = escape(str(best_model_metrics.get('accuracy', 'N/A')))
            parts.append(
                "<div class='section'>\n"
                "  <h2>Best Model</h2>\n"
                f"  <p><strong>Model Name:</strong> {best_name}</p>\n"
                f"  <p><strong>Accuracy:</strong> {best_accuracy}</p>\n"
                "</div>\n"
            )

        # Best model plots.
        best_model_learning_curve = results.get('best_model_learning_curve', '')
        if best_model_learning_curve:
            parts.append(self._image_section(
                "Learning Curve for Best Model",
                best_model_learning_curve,
                "Best Model Learning Curve",
            ))
        best_model_confusion_matrix = results.get('best_model_confusion_matrix', '')
        if best_model_confusion_matrix:
            parts.append(self._image_section(
                "Confusion Matrix for Best Model",
                best_model_confusion_matrix,
                "Best Model Confusion Matrix",
            ))

        # Topics found by topic modelling.
        topics = results.get('topics')
        if topics:
            parts.append("<div class='section'>\n  <h2>Topics</h2>\n  <ul>\n")
            parts.extend(f"    <li>{escape(str(topic))}</li>\n" for topic in topics)
            parts.append("  </ul>\n</div>\n")

        # Static plots stored under single-image keys.
        single_images = (
            ('category_distribution_image', 'Category Distribution'),
            ('learning_curve_image', 'Learning Curve'),
            ('tsne_image', 't-SNE Visualization'),
            ('pca_image', 'PCA Visualization'),
            ('temporal_analysis_image', 'Temporal Trends'),
        )
        for key, title in single_images:
            image = results.get(key)
            if image:
                parts.append(self._image_section(title, image, title))

        # Interactive chart: already an HTML fragment, so it must not be escaped.
        interactive = results.get('interactive_category_distribution_html')
        if interactive:
            parts.append(
                "<div class='section'>\n"
                "  <h3>Interactive Category Distribution</h3>\n"
                f"  {interactive}\n"
                "</div>\n"
            )

        # Per-item outputs are summarised as counts instead of being listed.
        sentiments = results.get('sentiments')
        if sentiments:
            parts.append(self._counts_section('Sentiment Distribution', 'Sentiment', sentiments))
        clusters = results.get('clusters')
        if clusters:
            parts.append(self._counts_section('Cluster Sizes', 'Cluster', clusters))

        # End of the document.
        parts.append("</body>\n</html>")

        # Write the report.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("".join(parts))

        print(f"HTML report generated: {output_path}")

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
import joblib
import logging
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from transformers import BertTokenizer, BertModel
import torch
from datetime import datetime
from io import BytesIO
import base64
from jinja2 import Environment, FileSystemLoader

# Download the NLTK resources needed for Russian tokenization and stop words.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load in
# word_tokenize; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

class TextClassifier:
    def __init__(self, vectorization_method='tfidf', balance_data_flag=True, log_file='model_training.log'):
        """Store the configuration, create the candidate models and set up logging.

        :param vectorization_method: Method name later passed to ``vectorize_text`` by ``run``.
        :param balance_data_flag: Whether ``run`` calls ``balance_data`` on the training split.
        :param log_file: File name handed to ``logging.basicConfig``.
        """
        # Configuration.
        self.log_file = log_file
        self.balance_data_flag = balance_data_flag
        self.vectorization_method = vectorization_method

        # State filled in later by the pipeline.
        self.results = {}  # holds every result produced by run()
        self.ensemble_model = None
        self.best_models = {}
        self.vectorizer = None

        # Candidate estimators as (display name, estimator) pairs.
        candidates = [
            ('Naive Bayes', MultinomialNB()),
            ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
            ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('SVM', SVC(kernel='linear', random_state=42)),
        ]
        self.models = candidates

        # INFO-level records go to the requested log file.
        logging.basicConfig(
            filename=log_file,
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
        )

    # Предобработка текста
    def preprocess_text(self, text):
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
        text = re.sub(r'\@\w+|\#', '', text)
        text = re.sub(r"[^а-яА-ЯёЁ]", " ", text)
        text = text.lower()
        tokens = word_tokenize(text, language='russian')
        stop_words = set(stopwords.words('russian'))
        stemmer = SnowballStemmer('russian')
        tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
        return " ".join(tokens)

    # Векторизация текста
    def vectorize_text(self, texts, method='tfidf'):
        if method == 'tfidf':
            if not self.vectorizer:
                self.vectorizer = TfidfVectorizer(max_features=3000)
                return self.vectorizer.fit_transform(texts).toarray()
            else:
                return self.vectorizer.transform(texts).toarray()
        elif method == 'word2vec':
            return self.word2vec_vectorization(texts)
        elif method == 'bert':
            return self.bert_vectorization(texts)
        else:
            raise ValueError("Неверный метод векторизации. Выберите 'tfidf', 'word2vec' или 'bert'.")

    # Векторизация с помощью Word2Vec
    def word2vec_vectorization(self, texts, vector_size=100, window=5, min_count=1):
        """Represent each text as the mean of its Word2Vec word vectors.

        The Word2Vec model is trained on the texts of the FIRST call and then
        kept on the instance, so later calls (for example for the test split)
        are embedded in the same vector space instead of a freshly trained,
        unrelated one.

        :param texts: Iterable of preprocessed texts.
        :param vector_size: Embedding size used when the model is trained.
        :param window: Context window used when the model is trained.
        :param min_count: Minimum word frequency used when the model is trained.
        :return: Array with one row per text; texts without any known word
            get a zero vector.
        """
        tokenized_texts = [word_tokenize(text) for text in texts]
        if not tokenized_texts:
            return np.empty((0, vector_size))

        model = getattr(self, '_word2vec_model', None)
        if model is None:
            model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size,
                             window=window, min_count=min_count)
            self._word2vec_model = model

        keyed_vectors = model.wv
        # Use the trained model's size: it may differ from this call's
        # ``vector_size`` when the model comes from an earlier call.
        dim = keyed_vectors.vector_size
        vectors = np.array([
            np.mean([keyed_vectors[word] for word in words if word in keyed_vectors]
                    or [np.zeros(dim)], axis=0)
            for words in tokenized_texts
        ])
        return vectors

    # Векторизация с помощью BERT
    def bert_vectorization(self, texts, batch_size=32):
        """Embed texts with multilingual BERT using masked mean pooling.

        :param texts: A single string or an iterable of texts (list, pandas
            Series, ...). It is converted to a list of strings because the
            tokenizer does not accept a pandas Series.
        :param batch_size: Number of texts sent through the model at once, so
            the whole corpus is never held in a single forward pass.
        :return: Array with one embedding row per text.
        """
        if isinstance(texts, str):
            texts = [texts]
        texts = [str(text) for text in texts]

        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        model = BertModel.from_pretrained('bert-base-multilingual-cased')
        if not texts:
            return np.empty((0, model.config.hidden_size), dtype=np.float32)

        chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(batch, return_tensors='pt', padding=True,
                                   truncation=True, max_length=512)
                hidden = model(**inputs).last_hidden_state
                # Average over real tokens only; otherwise padding would make
                # an embedding depend on which texts share its batch.
                mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
                token_counts = mask.sum(dim=1).clamp(min=1)
                chunks.append(((hidden * mask).sum(dim=1) / token_counts).numpy())
        return np.concatenate(chunks, axis=0)

    # Балансировка данных
    def balance_data(self, X, y):
        """Oversample ``X``/``y`` with SMOTE (``random_state=42``).

        :param X: Feature matrix.
        :param y: Labels.
        :return: Tuple ``(X_resampled, y_resampled)``.
        """
        sampler = SMOTE(random_state=42)
        resampled_features, resampled_labels = sampler.fit_resample(X, y)
        return resampled_features, resampled_labels

    # Гиперпараметрическая настройка
    def hyperparameter_tuning(self, model, param_grid, X, y):
        """Grid-search ``param_grid`` for ``model`` and return the best estimator.

        Uses ``GridSearchCV`` with ``cv=5``, accuracy scoring and ``n_jobs=-1``;
        the best parameters and best cross-validation score are logged.

        :param model: Estimator to tune.
        :param param_grid: Parameter grid passed to ``GridSearchCV``.
        :param X: Training features.
        :param y: Training labels.
        :return: ``best_estimator_`` of the fitted search.
        """
        search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
        )
        search.fit(X, y)

        estimator_name = model.__class__.__name__
        logging.info(f"Best parameters for {estimator_name}: {search.best_params_}")
        logging.info(f"Best cross-validation accuracy: {search.best_score_:.4f}")
        return search.best_estimator_

    # Обучение моделей
    def train_models(self, X_train, y_train):
        for name, model in self.models:
            if name == 'Logistic Regression':
                param_grid = {'C': [0.1, 1, 10], 'penalty': ['l2']}
                best_model = self.hyperparameter_tuning(model, param_grid, X_train, y_train)
            elif name == 'Random Forest':
                param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
                best_model = self.hyperparameter_tuning(model, param_grid, X_train, y_train)
            else:
                best_model = model
            self.best_models[name] = best_model

    # Ансамблирование моделей
    def ensemble_models(self, X_train, y_train):
        """Build a hard-voting ensemble from ``self.best_models`` and fit it.

        :param X_train: Training features.
        :param y_train: Training labels.
        """
        named_estimators = list(self.best_models.items())
        self.ensemble_model = VotingClassifier(estimators=named_estimators, voting='hard')
        self.ensemble_model.fit(X_train, y_train)

    # Оценка модели
    def evaluate_model(self, model, name, X_test, y_test):
        """Predict on the test split, log the metrics and return the accuracy.

        :param model: Fitted estimator exposing ``predict``.
        :param name: Display name used in the log messages.
        :param X_test: Test features.
        :param y_test: True test labels.
        :return: Accuracy score of the predictions.
        """
        predictions = model.predict(X_test)
        score = accuracy_score(y_test, predictions)
        logging.info(f"{name} Accuracy: {score:.4f}")

        # Each report is logged as a header line followed by its content.
        reports = (
            ("Confusion Matrix:", confusion_matrix),
            ("Classification Report:", classification_report),
        )
        for header, build_report in reports:
            logging.info(header)
            logging.info(build_report(y_test, predictions))
        return score

    # Сохранение модели
    def save_model(self, model, filename):
        """Serialize ``model`` to ``filename`` with joblib and log the destination.

        :param model: Object to persist.
        :param filename: Target file path.
        """
        joblib.dump(value=model, filename=filename)
        logging.info(f"Model saved to {filename}")

    # Тематическое моделирование (LDA)
    def topic_modeling(self, train_data):
        """Fit a 5-topic LDA model on TF-IDF features of ``clean_text``.

        :param train_data: Frame with a ``clean_text`` column.
        :return: List of strings ``"Topic #<i>: <top 10 words>"``, one per topic.
        """
        vectorizer = TfidfVectorizer(max_features=3000)
        doc_term = vectorizer.fit_transform(train_data['clean_text'])
        lda = LatentDirichletAllocation(n_components=5, random_state=42)
        lda.fit(doc_term)

        vocabulary = vectorizer.get_feature_names_out()
        n_top_words = 10
        topics = []
        for topic_idx, weights in enumerate(lda.components_):
            # Indices of the highest-weighted words, strongest first.
            strongest = weights.argsort()[::-1][:n_top_words]
            top_words = " ".join(vocabulary[i] for i in strongest)
            topics.append(f"Topic #{topic_idx}: {top_words}")
        return topics

    # Анализ распределения категорий
    def category_distribution(self, train_data):
        """Draw a bar chart of category counts and return it as a base64 PNG.

        :param train_data: Frame with a ``category`` column.
        :return: Base64-encoded PNG image as a string.
        """
        counts = train_data['category'].value_counts()

        plt.figure(figsize=(10, 6))
        sns.barplot(x=counts.index, y=counts.values, palette='viridis')
        plt.title('Category Distribution')
        plt.xlabel('Category')
        plt.ylabel('Count')
        plt.xticks(rotation=45)

        # Render the current figure into memory and encode it.
        with BytesIO() as png_buffer:
            plt.savefig(png_buffer, format='png')
            encoded = base64.b64encode(png_buffer.getvalue()).decode('utf-8')
        plt.close()
        return encoded

    # Интерактивная визуализация распределения категорий
    def interactive_category_distribution(self, train_data):
        """Build an interactive bar chart of category counts.

        :param train_data: Frame with a ``category`` column.
        :return: HTML fragment produced by ``fig.to_html(full_html=False)``.
        """
        counts_table = (
            train_data['category']
            .value_counts()
            .rename_axis('Category')
            .reset_index(name='Count')
        )
        chart = px.bar(
            counts_table,
            x='Category',
            y='Count',
            title='Category Distribution',
            color='Category',
        )
        return chart.to_html(full_html=False)

    # Тестирование на новых данных
    def predict_new_texts(self, new_texts):
        new_texts_preprocessed = [self.preprocess_text(text) for text in new_texts]
        new_texts_vectorized = self.vectorizer.transform(new_texts_preprocessed).toarray()
        predictions = self.ensemble_model.predict(new_texts_vectorized)
        return predictions

    # Графики обучения
    def plot_learning_curve(self, model, X, y, name):
        """Plot training and validation accuracy against training set size.

        Scores come from ``learning_curve`` with ``cv=5`` on ten sizes between
        10% and 100% of the data; shaded bands show one standard deviation.

        :param model: Estimator to evaluate.
        :param X: Features.
        :param y: Labels.
        :param name: Model name shown in the plot title.
        :return: Base64-encoded PNG image as a string.
        """
        sizes, train_scores, validation_scores = learning_curve(
            model, X, y, cv=5, scoring='accuracy', train_sizes=np.linspace(0.1, 1.0, 10)
        )

        plt.figure(figsize=(10, 6))
        curves = (
            (train_scores, 'Training Accuracy', 'blue', 'o'),
            (validation_scores, 'Validation Accuracy', 'green', 's'),
        )
        for scores, label, color, marker in curves:
            mean = np.mean(scores, axis=1)
            std = np.std(scores, axis=1)
            plt.plot(sizes, mean, label=label, color=color, marker=marker)
            plt.fill_between(sizes, mean - std, mean + std, alpha=0.15, color=color)
        plt.title(f'Learning Curve ({name})')
        plt.xlabel('Training Set Size')
        plt.ylabel('Accuracy')
        plt.legend(loc='best')
        plt.grid()

        # Render the current figure into memory and encode it.
        with BytesIO() as png_buffer:
            plt.savefig(png_buffer, format='png')
            encoded = base64.b64encode(png_buffer.getvalue()).decode('utf-8')
        plt.close()
        return encoded

    # Облако точек (t-SNE)
    def plot_tsne(self, X, y):
        """Project ``X`` to 2-D with t-SNE and return the scatter plot as base64 PNG.

        Points are coloured by the integer-encoded category.

        :param X: Feature matrix.
        :param y: Category labels, one per row of ``X``.
        :return: Base64-encoded PNG image as a string.
        """
        # Imported here because the cell's import block does not import
        # LabelEncoder, which would otherwise raise NameError.
        from sklearn.preprocessing import LabelEncoder

        tsne = TSNE(n_components=2, random_state=42)
        X_tsne = tsne.fit_transform(X)

        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        plt.figure(figsize=(10, 8))
        scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_encoded, cmap='viridis', alpha=0.7)
        plt.title('t-SNE Visualization of Data')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.colorbar(scatter, ticks=range(len(le.classes_)), label='Category')

        # The context manager releases the buffer even if savefig fails.
        with BytesIO() as buffer:
            plt.savefig(buffer, format='png')
            image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
        plt.close()
        return image_base64

    # Кластеризация текстов
    def cluster_texts(self, X, method='kmeans', n_clusters=5):
        if method == 'kmeans':
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            clusters = kmeans.fit_predict(X)
        elif method == 'dbscan':
            dbscan = DBSCAN(eps=0.5, min_samples=5)
            clusters = dbscan.fit_predict(X)
        else:
            raise ValueError("Unsupported clustering method. Choose 'kmeans' or 'dbscan'.")
        return clusters

    # Временной анализ
    def temporal_analysis(self, data, date_col='date', freq='M'):
        """Plot the number of rows per time period and return it as base64 PNG.

        The input frame is left untouched: the dates are parsed into a
        separate index instead of overwriting ``date_col`` and re-indexing
        ``data`` in place, which would remove the column for the caller and
        make a second call fail with ``KeyError``.

        :param data: Frame with a date column.
        :param date_col: Name of the date column.
        :param freq: Resampling frequency passed to ``resample``.
        :return: Base64-encoded PNG image as a string.
        """
        dates = pd.DatetimeIndex(pd.to_datetime(data[date_col]), name=date_col)
        # One placeholder value per row; size() then counts rows per period.
        trends = pd.Series(1, index=dates).resample(freq).size()

        plt.figure(figsize=(10, 6))
        trends.plot(title=f"Temporal Trends ({freq})", figsize=(10, 6))
        plt.xlabel('Date')
        plt.ylabel('Count')

        # The context manager releases the buffer even if savefig fails.
        with BytesIO() as buffer:
            plt.savefig(buffer, format='png')
            image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
        plt.close()
        return image_base64

    # Анализ настроений с использованием BERT
    def sentiment_analysis_with_bert(self, texts, batch_size=32):
        """Label each text as 'positive', 'neutral' or 'negative'.

        The checkpoint is loaded WITH its sequence-classification head and the
        highest-scoring class is used. Averaging the hidden states of the bare
        encoder and thresholding them at +/-0.5 carries no sentiment
        information. The checkpoint predicts a 1-5 star rating (see its model
        card): 1-2 stars map to 'negative', 3 to 'neutral', 4-5 to 'positive'.

        :param texts: A single string or an iterable of texts.
        :param batch_size: Number of texts sent through the model at once.
        :return: List with one sentiment label per text.
        """
        # Local import: only BertTokenizer and BertModel are imported at file level.
        from transformers import BertForSequenceClassification

        if isinstance(texts, str):
            texts = [texts]
        texts = [str(text) for text in texts]
        if not texts:
            return []

        model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForSequenceClassification.from_pretrained(model_name)

        sentiments = []
        with torch.no_grad():
            # Batches keep memory bounded for large corpora.
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(batch, return_tensors='pt', padding=True,
                                   truncation=True, max_length=512)
                logits = model(**inputs).logits
                # Class index 0..4 corresponds to 1..5 stars.
                for stars in (logits.argmax(dim=1) + 1).tolist():
                    if stars >= 4:
                        sentiments.append('positive')
                    elif stars <= 2:
                        sentiments.append('negative')
                    else:
                        sentiments.append('neutral')
        return sentiments

    # Добавление дополнительных признаков для анализа настроений
    def add_sentiment_features(self, data):
        data['exclamation_count'] = data['text'].apply(lambda x: x.count('!'))
        data['question_count'] = data['text'].apply(lambda x: x.count('?'))
        data['uppercase_ratio'] = data['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
        return data

    # Многомерный анализ (PCA)
    def plot_pca(self, X, y):
        """Project ``X`` to 2-D with PCA and return the scatter plot as base64 PNG.

        Points are coloured by the integer-encoded category.

        :param X: Feature matrix.
        :param y: Category labels, one per row of ``X``.
        :return: Base64-encoded PNG image as a string.
        """
        # Imported here because the cell's import block does not import
        # LabelEncoder, which would otherwise raise NameError.
        from sklearn.preprocessing import LabelEncoder

        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)

        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        plt.figure(figsize=(10, 8))
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_encoded, cmap='viridis', alpha=0.7)
        plt.title('PCA Visualization of Data')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.colorbar(scatter, ticks=range(len(le.classes_)), label='Category')

        # The context manager releases the buffer even if savefig fails.
        with BytesIO() as buffer:
            plt.savefig(buffer, format='png')
            image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
        plt.close()
        return image_base64

    # Основной метод для запуска классификации
    def run(self, train_data, test_data):
        """Run the full pipeline and collect every result in one dictionary.

        Steps: preprocess and vectorize both splits, optionally balance the
        training split, train the models and the ensemble, evaluate and save
        the ensemble, then run the additional analyses. Both input frames get
        a ``clean_text`` column as a side effect.

        :param train_data: Frame with ``text`` and ``category`` columns and,
            optionally, a ``date`` column for the temporal analysis.
        :param test_data: Frame with ``text`` and ``category`` columns.
        :return: The results dictionary, which is also stored in ``self.results``.
        """
        results = {}

        # Text preprocessing.
        train_data['clean_text'] = train_data['text'].apply(self.preprocess_text)
        test_data['clean_text'] = test_data['text'].apply(self.preprocess_text)

        # Vectorization (train split first).
        X_train = self.vectorize_text(train_data['clean_text'], method=self.vectorization_method)
        X_test = self.vectorize_text(test_data['clean_text'], method=self.vectorization_method)
        y_train = train_data['category']
        y_test = test_data['category']

        # Class balancing of the training split only.
        if self.balance_data_flag:
            X_train, y_train = self.balance_data(X_train, y_train)

        # Model training and tuning.
        self.train_models(X_train, y_train)

        # Ensemble of the trained models.
        self.ensemble_models(X_train, y_train)

        # Ensemble evaluation on the test split.
        results['ensemble_accuracy'] = self.evaluate_model(
            self.ensemble_model, "Ensemble Model", X_test, y_test
        )

        # Persist the ensemble.
        self.save_model(self.ensemble_model, 'best_ensemble_model.pkl')

        # Topic modelling.
        results['topics'] = self.topic_modeling(train_data)

        # Category distribution (static and interactive).
        results['category_distribution_image'] = self.category_distribution(train_data)
        results['interactive_category_distribution_html'] = (
            self.interactive_category_distribution(train_data)
        )

        # Learning curve of the ensemble.
        results['learning_curve_image'] = self.plot_learning_curve(
            self.ensemble_model, X_train, y_train, "Ensemble Model"
        )

        # t-SNE visualization.
        results['tsne_image'] = self.plot_tsne(X_train, y_train)

        # Text clustering.
        clusters = self.cluster_texts(X_train, method='kmeans', n_clusters=5)
        results['clusters'] = clusters.tolist()

        # Temporal analysis, only when a date column exists.
        if 'date' in train_data.columns:
            results['temporal_analysis_image'] = self.temporal_analysis(
                train_data, date_col='date', freq='M'
            )

        # Sentiment analysis runs on the ORIGINAL text. ``clean_text`` is
        # stemmed and stripped of stop words (NLTK's Russian list includes the
        # negation "не"), which removes what a pretrained sentiment model
        # relies on.
        raw_texts = train_data['text'].fillna('').astype(str).tolist()
        results['sentiments'] = self.sentiment_analysis_with_bert(raw_texts)

        # PCA visualization.
        results['pca_image'] = self.plot_pca(X_train, y_train)

        # Keep the results on the instance and hand them to the caller.
        self.results = results
        return results

    def generate_html_report(self, output_path="report.html"):
        """
        Write an HTML report built from the results stored on this instance.

        :param output_path: Path of the HTML file to write.
        """
        # Hand the stored results to a report generator and render in one step.
        HTMLReportGenerator(self.results).create_html_report(output_path)

In [None]:
if __name__ == "__main__":
    # Load the train and test splits.
    train_data, test_data = (
        pd.read_csv(csv_path)
        for csv_path in (
            '/kaggle/input/russian-social-media-text-classification/train.csv',
            '/kaggle/input/russian-social-media-text-classification/test.csv',
        )
    )

    # Build the classifier and run the full pipeline.
    classifier = TextClassifier(vectorization_method='tfidf', balance_data_flag=True)
    classifier.run(train_data, test_data)

    # Write the HTML report.
    classifier.generate_html_report(output_path="report.html")