<a href="https://colab.research.google.com/github/CodeHunterOfficial/ABC_DataMining/blob/main/NM/%D0%9B%D0%B5%D0%BC%D0%B0%D1%82%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D1%8F_%D0%92%D0%B5%D0%BA%D1%82%D0%BE%D1%80%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D1%8F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
!pip install pymorphy2 langdetect natasha



In [52]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, regexp_tokenize
from pymorphy2 import MorphAnalyzer
from transformers import AutoTokenizer
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from langdetect import detect, LangDetectException
import spacy
import json
import os
from natasha import MorphVocab
from wordcloud import WordCloud

# Fetch the NLTK resources used below: tokenizer models, WordNet and the
# stop-word lists.
for _nltk_package in ('punkt', 'wordnet', 'stopwords', 'punkt_tab'):
    nltk.download(_nltk_package)

class TextProcessor:
    """Tokenise, normalise and analyse a single text in a single language.

    Public methods append an entry to ``self.results`` (see
    ``_save_result``) describing what they produced; ``get_results`` returns
    that list. Tokenisation performed internally as a step of another method
    is not recorded.

    :param text: The raw text to process.
    :param language: Language code such as ``'en'``, ``'ru'`` or ``'de'``.
        When omitted (or falsy) the language is detected with ``langdetect``.
    """

    # Language codes (as passed by callers / returned by langdetect) mapped to
    # the full language names that NLTK uses for its stop-word lists and
    # Snowball stemmers. Unknown codes are passed through unchanged.
    _NLTK_LANGUAGE_NAMES = {
        'ar': 'arabic',
        'da': 'danish',
        'de': 'german',
        'el': 'greek',
        'en': 'english',
        'es': 'spanish',
        'fi': 'finnish',
        'fr': 'french',
        'hu': 'hungarian',
        'id': 'indonesian',
        'it': 'italian',
        'nl': 'dutch',
        'no': 'norwegian',
        'pt': 'portuguese',
        'ro': 'romanian',
        'ru': 'russian',
        'sl': 'slovene',
        'sv': 'swedish',
        'tr': 'turkish',
    }

    # Patterns used by clean_text, compiled once.
    _URL_RE = re.compile(r"http\S+")
    _TAG_RE = re.compile(r"<.*?>")
    # Anything that is neither a Unicode word character nor whitespace, plus
    # the underscore (which \w would otherwise keep).
    _NON_WORD_RE = re.compile(r"[^\w\s]|_")

    def __init__(self, text, language=None):
        self.text = text
        self.language = language if language else self.detect_language()

        is_russian = self.language == 'ru'
        self.morph_vocab = MorphVocab() if is_russian else None
        self.morph_analyzer = MorphAnalyzer() if is_russian else None

        # NLTK indexes stop-word lists by full language name, not by code.
        nltk_name = self._nltk_language()
        if nltk_name in stopwords.fileids():
            self.stop_words = set(stopwords.words(nltk_name))
        else:
            self.stop_words = set()

        self.nlp = self._load_spacy_model() if self.language in {'en', 'de'} else None
        self.results = []  # Entries appended by _save_result

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _nltk_language(self):
        """Return the NLTK language name for ``self.language``.

        Codes without a known mapping (including full names such as
        ``'english'``) are returned unchanged.
        """
        return self._NLTK_LANGUAGE_NAMES.get(self.language, self.language)

    def _load_spacy_model(self):
        """Load the spaCy pipeline for ``self.language``.

        :return: The loaded pipeline, or ``None`` when no model is mapped to
            the language.
        :raises NotImplementedError: If the mapped model is not installed or
            cannot be loaded.
        """
        model_map = {
            'en': "en_core_web_sm",
            'de': "de_core_news_sm"
        }

        if self.language not in model_map:
            print(f"spaCy does not support the language '{self.language}'. Using alternative NLP tools.")
            return None

        model_name = model_map[self.language]
        if not spacy.util.is_package(model_name):
            raise NotImplementedError(f"spaCy model '{model_name}' is not installed. Please install it using 'python -m spacy download {model_name}'.")
        try:
            return spacy.load(model_name)
        except OSError as err:
            raise NotImplementedError(f"spaCy model for language '{self.language}' is not available.") from err

    def _tokenize(self):
        """Word-tokenise the cleaned text without recording a result."""
        return word_tokenize(self.clean_text())

    def _russian_lemma_pairs(self):
        """Return ``(token, lemma)`` pairs for the stop-word-free Russian tokens."""
        tokens = self.remove_stopwords(self._tokenize())
        return [(token, self.morph_analyzer.parse(token)[0].normal_form) for token in tokens]

    @staticmethod
    def _score_tokens(predicted_tokens, gold_standard_tokens):
        """Compute precision, recall and F1 of predicted against gold tokens.

        Matches are counted as a multiset intersection, so a token that occurs
        twice in both sequences counts as two correct predictions. This keeps
        the numerator consistent with the list lengths in the denominators.
        """
        overlap = Counter(predicted_tokens) & Counter(gold_standard_tokens)
        correct = sum(overlap.values())
        precision = correct / len(predicted_tokens) if predicted_tokens else 0
        recall = correct / len(gold_standard_tokens) if gold_standard_tokens else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        return {"precision": precision, "recall": recall, "f1_score": f1_score}

    def _save_result(self, method_name, result):
        """Append one result entry (method, source text, result, language)."""
        self.results.append({
            "method": method_name,
            "text": self.text,
            "result": result,
            "language": self.language
        })

    # ------------------------------------------------------------------
    # Cleaning and basic tokenisation
    # ------------------------------------------------------------------

    def clean_text(self):
        """Return the text without URLs, HTML tags and punctuation.

        Letters and digits of any script are kept (so German umlauts or
        accented characters survive); everything that is not a word character
        or whitespace is dropped, as are underscores.
        """
        cleaned_text = self._URL_RE.sub("", self.text)
        cleaned_text = self._TAG_RE.sub("", cleaned_text)
        cleaned_text = self._NON_WORD_RE.sub("", cleaned_text)
        return cleaned_text.strip()

    def remove_stopwords(self, tokens=None):
        """Drop tokens whose lower-cased form is a stop word.

        :param tokens: Tokens to filter; defaults to ``standard_tokenization()``.
        """
        if tokens is None:
            tokens = self.standard_tokenization()
        return [token for token in tokens if token.lower() not in self.stop_words]

    def simple_tokenization(self):
        """Split the cleaned text on whitespace."""
        tokens = self.clean_text().split()
        self._save_result("Simple Tokenization", tokens)
        return tokens

    def standard_tokenization(self):
        """Tokenise the cleaned text with NLTK's ``word_tokenize``."""
        tokens = self._tokenize()
        self._save_result("Standard Tokenization", tokens)
        return tokens

    def character_tokenization(self):
        """Split the cleaned text into individual characters."""
        tokens = list(self.clean_text())
        self._save_result("Character Tokenization", tokens)
        return tokens

    def byte_level_tokenization(self):
        """Return the UTF-8 byte values of the cleaned text as a list of ints."""
        tokens = list(self.clean_text().encode('utf-8'))
        self._save_result("Byte-Level Tokenization", tokens)
        return tokens

    def rule_based_tokenization(self, pattern=r'\w+'):
        """Tokenise the cleaned text with a regular expression.

        :param pattern: Regex describing a single token.
        """
        tokens = regexp_tokenize(self.clean_text(), pattern)
        self._save_result("Rule-Based Tokenization", tokens)
        return tokens

    # ------------------------------------------------------------------
    # Frequency analysis and plots
    # ------------------------------------------------------------------

    def zipf_law_analysis(self, save_path="zipf_law.png"):
        """Plot token frequency against rank on log-log axes and save it.

        :param save_path: Where to write the image.
        :return: ``save_path``.
        """
        tokens = self.remove_stopwords(self._tokenize())
        frequencies = sorted(Counter(tokens).values(), reverse=True)
        ranks = range(1, len(frequencies) + 1)

        fig = plt.figure(figsize=(10, 6))
        try:
            plt.plot(ranks, frequencies, marker='o')
            plt.xscale('log')
            plt.yscale('log')
            plt.xlabel('Rank')
            plt.ylabel('Frequency')
            plt.title("Zipf's Law Analysis")
            plt.savefig(save_path)
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

        self._save_result("Zipf's Law Analysis", {"image": save_path})
        return save_path

    def visualize_word_frequencies(self, save_path="word_frequencies.png"):
        """Render token frequencies as a word cloud and save it.

        :param save_path: Where to write the image.
        :return: ``save_path``.
        """
        tokens = self.remove_stopwords(self._tokenize())
        freq_dist = Counter(tokens)
        wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(freq_dist)

        fig = plt.figure(figsize=(10, 5))
        try:
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.title("Word Frequencies Visualization")
            plt.savefig(save_path)
        finally:
            plt.close(fig)

        self._save_result("Word Frequencies Visualization", {"image": save_path})
        return save_path

    # ------------------------------------------------------------------
    # Stemming, lemmatisation, morphology
    # ------------------------------------------------------------------

    def stemming(self):
        """Stem the stop-word-free tokens with NLTK's ``SnowballStemmer``.

        For Russian, pymorphy2 lemmatisation is used instead and the result
        is recorded under "Lemmatization".

        :return: List of stems (or lemmas for Russian); ``None`` when the
            language has no Snowball stemmer.
        """
        if self.language == 'ru':
            print("Stemming is not recommended for Russian. Using lemmatization instead.")
            tokens = self.remove_stopwords(self._tokenize())
            lemmas = [self.morph_analyzer.parse(token)[0].normal_form for token in tokens]
            self._save_result("Lemmatization", lemmas)
            return lemmas

        # SnowballStemmer is keyed by full language name ('english'), not code.
        nltk_name = self._nltk_language()
        if nltk_name in SnowballStemmer.languages:
            stemmer = SnowballStemmer(nltk_name)
            tokens = self.remove_stopwords(self._tokenize())
            stemmed_tokens = [stemmer.stem(token) for token in tokens]
            self._save_result("Stemming", stemmed_tokens)
            return stemmed_tokens

        # Unsupported language: deliberately no exception, nothing recorded.
        return None

    def lemmatize(self, cleaned_text):
        """Lemmatise ``cleaned_text`` (pymorphy2 for Russian, spaCy otherwise).

        :param cleaned_text: Text to lemmatise, typically ``clean_text()``.
        :raises NotImplementedError: If no lemmatiser is available for the
            language.
        """
        if self.language == 'ru':
            words = word_tokenize(cleaned_text)
            lemmas = [self.morph_analyzer.parse(word)[0].normal_form for word in words]
        elif self.nlp:
            lemmas = [token.lemma_ for token in self.nlp(cleaned_text)]
        else:
            raise NotImplementedError(f"Лемматизация для языка '{self.language}' не поддерживается.")

        self._save_result("Lemmatization", lemmas)
        return lemmas

    def universal_lemmatization(self):
        """Return ``(token, lemma)`` pairs for the stop-word-free tokens.

        :raises NotImplementedError: For languages other than ru/en/de.
        """
        if self.language == 'ru':
            lemmatized_tokens = self._russian_lemma_pairs()
        elif self.language in {'en', 'de'}:
            doc = self.nlp(self.clean_text())
            lemmatized_tokens = [(token.text, token.lemma_) for token in doc if token.text.lower() not in self.stop_words]
        else:
            raise NotImplementedError(f"Lemmatization is not implemented for language '{self.language}'.")

        self._save_result("Universal Lemmatization", lemmatized_tokens)
        return lemmatized_tokens

    def morphological_analysis(self):
        """Return per-token morphological information for stop-word-free tokens.

        For en/de the pairs are ``(text, spaCy morph)``.

        :raises NotImplementedError: For languages other than ru/en/de.
        """
        if self.language == 'ru':
            # NOTE(review): for Russian this yields (token, lemma) rather than
            # a morphological tag; kept as-is because callers may rely on it.
            morph_tokens = self._russian_lemma_pairs()
        elif self.language in {'en', 'de'}:
            doc = self.nlp(self.clean_text())
            morph_tokens = [(token.text, token.morph) for token in doc if token.text.lower() not in self.stop_words]
        else:
            raise NotImplementedError(f"Morphological analysis is not implemented for language '{self.language}'.")

        self._save_result("Morphological Analysis", morph_tokens)
        return morph_tokens

    # ------------------------------------------------------------------
    # Model-based and combined tokenisation
    # ------------------------------------------------------------------

    def subword_tokenization(self, model_name="bert-base-uncased"):
        """Tokenise the cleaned text into subwords with a pretrained tokenizer.

        :param model_name: Name passed to ``AutoTokenizer.from_pretrained``.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokens = tokenizer.tokenize(self.clean_text())
        self._save_result("Subword Tokenization", tokens)
        return tokens

    def neural_tokenization(self, model_name="bert-base-uncased"):
        """Encode the cleaned text to token ids, special tokens included.

        :param model_name: Name passed to ``AutoTokenizer.from_pretrained``.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        token_ids = tokenizer.encode(self.clean_text(), add_special_tokens=True)
        self._save_result("Neural Tokenization", token_ids)
        return token_ids

    def hybrid_tokenization(self):
        """Regex-tokenise (``\\w+``) and stem when a Snowball stemmer exists."""
        rule_tokens = regexp_tokenize(self.clean_text(), r'\w+')
        nltk_name = self._nltk_language()
        if nltk_name in SnowballStemmer.languages:
            stemmer = SnowballStemmer(nltk_name)
            tokens = [stemmer.stem(token) for token in rule_tokens]
        else:
            tokens = rule_tokens
        self._save_result("Hybrid Tokenization", tokens)
        return tokens

    def language_specific_tokenization(self):
        """Tokenise with the tool chosen for the language.

        English uses NLTK word tokenisation, Russian the ``(token, lemma)``
        pairs of the morphological analysis, German the spaCy tokenizer.

        :raises NotImplementedError: For any other language, or when the
            German spaCy pipeline is missing.
        """
        if self.language == 'en':
            tokens = self._tokenize()
        elif self.language == 'ru':
            tokens = self._russian_lemma_pairs()
        elif self.language == 'de':
            if not self.nlp:
                raise NotImplementedError(f"Language-specific tokenization is not implemented for language '{self.language}'.")
            tokens = [token.text for token in self.nlp(self.clean_text())]
        else:
            raise NotImplementedError(f"Language '{self.language}' not supported yet.")

        self._save_result("Language-Specific Tokenization", tokens)
        return tokens

    def detect_language(self):
        """Detect the language of the text; ``"Unknown"`` if detection fails."""
        try:
            return detect(self.text)
        except LangDetectException:
            return "Unknown"

    def generate_ngrams(self, n=2):
        """Return space-joined word n-grams of the stop-word-free tokens.

        :param n: Size of each n-gram.
        """
        tokens = self.remove_stopwords(self._tokenize())
        # zip over n shifted views of the token list yields sliding windows.
        ngrams = [" ".join(ngram) for ngram in zip(*[tokens[i:] for i in range(n)])]
        self._save_result(f"{n}-grams", ngrams)
        return ngrams

    # ------------------------------------------------------------------
    # Persistence and evaluation
    # ------------------------------------------------------------------

    def save_tokens_to_file(self, filename="tokens.json"):
        """Write the stop-word-free tokens to ``filename`` as JSON."""
        tokens = self.remove_stopwords(self._tokenize())
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(tokens, f, ensure_ascii=False, indent=4)

    def load_tokens_from_file(self, filename="tokens.json"):
        """Read and return the JSON content of ``filename``."""
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)

    def evaluate_tokenization(self, gold_standard_tokens):
        """Score the stop-word-free tokens against a gold standard.

        :param gold_standard_tokens: Expected tokens.
        :return: Dict with ``precision``, ``recall`` and ``f1_score``.
        """
        predicted_tokens = self.remove_stopwords(self._tokenize())
        result = self._score_tokens(predicted_tokens, gold_standard_tokens)
        self._save_result("Tokenization Evaluation", result)
        return result

    # ------------------------------------------------------------------
    # Batch execution
    # ------------------------------------------------------------------

    def execute_all_methods(self):
        """Run every processing method, recording one result per method.

        A method that fails is reported on stdout and skipped, so that one
        missing model does not abort the remaining methods.
        """
        methods_to_execute = [
            ("Simple Tokenization", self.simple_tokenization),
            ("Standard Tokenization", self.standard_tokenization),
            ("Character Tokenization", self.character_tokenization),
            ("Byte-Level Tokenization", self.byte_level_tokenization),
            ("Rule-Based Tokenization", lambda: self.rule_based_tokenization(pattern=r'\w+')),
            ("Stemming", self.stemming),
            ("Lemmatization", lambda: self.lemmatize(self.clean_text())),
            ("Universal Lemmatization", self.universal_lemmatization),
            ("Morphological Analysis", self.morphological_analysis),
            ("Subword Tokenization", lambda: self.subword_tokenization(model_name="bert-base-uncased")),
            ("Neural Tokenization", lambda: self.neural_tokenization(model_name="bert-base-uncased")),
            ("Hybrid Tokenization", self.hybrid_tokenization),
            ("Language-Specific Tokenization", self.language_specific_tokenization),
            ("2-grams", lambda: self.generate_ngrams(n=2)),
            ("3-grams", lambda: self.generate_ngrams(n=3)),
            ("Zipf's Law Analysis", lambda: self.zipf_law_analysis(save_path=f"{self.language}_zipf_law.png")),
            ("Word Frequencies Visualization", lambda: self.visualize_word_frequencies(save_path=f"{self.language}_word_frequencies.png"))
        ]

        for method_name, method in methods_to_execute:
            recorded_before = len(self.results)
            try:
                result = method()
            except Exception as e:
                print(f"Ошибка при выполнении метода {method_name}: {e}")
                continue

            # Most methods record themselves; only add an entry here when the
            # method did not (e.g. stemming on an unsupported language, or the
            # Russian fallback that records under "Lemmatization").
            self_recorded = any(
                entry["method"] == method_name
                for entry in self.results[recorded_before:]
            )
            if not self_recorded:
                self._save_result(method_name, result)

    def get_results(self):
        """Return the list of all recorded results."""
        return self.results

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [53]:
class HTMLReportGenerator:
    """Build one HTML table report from the results of several processors.

    Each result is a dict with the keys ``method``, ``text``, ``result`` and
    optionally ``language``. A result whose ``result`` value is a dict
    containing ``"image"`` is rendered as an ``<img>``; everything else is
    rendered as escaped text.
    """

    # Folder (relative to the report file) that receives the report's images.
    IMAGES_DIR = "images"

    def __init__(self, results_list):
        """
        Collect the results, keeping the first entry per
        (language, method, text) combination.

        :param results_list: List of result lists, one per TextProcessor.
        """
        self.results = []
        seen_results = set()  # Keys of the entries already kept

        for results in results_list:
            for result in results:
                entry = {
                    "language": result.get("language", "Unknown"),
                    "method": result["method"],
                    "text": result["text"],
                    "result": result["result"]
                }

                result_key = (entry["language"], entry["method"], entry["text"])
                if result_key in seen_results:
                    continue
                seen_results.add(result_key)
                self.results.append(entry)

    def _create_html_content(self):
        """
        Build the complete HTML document for the collected results.

        All dynamic values are HTML-escaped, and image sources point into the
        ``images`` folder that ``generate_report`` fills.

        :return: The HTML document as a string.
        """
        import html  # local import: keeps this block self-contained

        parts = ["""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Combined Text Processing Report</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
        }
        h1 {
            color: #333;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin-top: 20px;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        th {
            background-color: #f4f4f4;
        }
        img {
            max-width: 100%;
            height: auto;
        }
    </style>
</head>
<body>
    <h1>Combined Text Processing Report</h1>
    <table>
        <tr>
            <th>Language</th>
            <th>Method</th>
            <th>Text</th>
            <th>Result</th>
        </tr>
"""]

        for result in self.results:
            # Escape first, then turn line breaks into <br> so that the <br>
            # tags themselves are not escaped.
            language = html.escape(str(result["language"]))
            method = html.escape(str(result["method"]))
            text = html.escape(str(result["text"])).replace("\n", "<br>")
            res = result["result"]

            if isinstance(res, dict) and "image" in res:
                # generate_report moves the file into IMAGES_DIR, so the link
                # must point there rather than at the bare file name.
                src = html.escape(f"{self.IMAGES_DIR}/{os.path.basename(res['image'])}")
                res_html = f'<img src="{src}" alt="{method}">'
            else:
                res_html = html.escape(str(res)).replace("\n", "<br>")

            parts.append(f"""
        <tr>
            <td>{language}</td>
            <td>{method}</td>
            <td>{text}</td>
            <td>{res_html}</td>
        </tr>
""")

        parts.append("""
    </table>
</body>
</html>
""")
        return "".join(parts)

    def generate_report(self, output_filename="combined_report.html"):
        """
        Write the HTML report and move its images next to it.

        Images are moved into an ``images`` folder located in the same
        directory as the report. An image that was already moved there by an
        earlier call is left in place, so the report can be regenerated.

        :param output_filename: Path of the HTML file to write.
        :raises FileNotFoundError: If a referenced image exists neither at its
            original path nor in the images folder.
        """
        html_content = self._create_html_content()

        images_dir = os.path.join(os.path.dirname(output_filename), self.IMAGES_DIR)
        os.makedirs(images_dir, exist_ok=True)

        for result in self.results:
            payload = result["result"]
            if not (isinstance(payload, dict) and "image" in payload):
                continue

            image_path = payload["image"]
            new_image_path = os.path.join(images_dir, os.path.basename(image_path))
            if os.path.exists(image_path):
                os.replace(image_path, new_image_path)
            elif not os.path.exists(new_image_path):
                raise FileNotFoundError(f"Report image not found: {image_path}")

        with open(output_filename, "w", encoding="utf-8") as f:
            f.write(html_content)

        print(f"Общий отчет успешно сгенерирован и сохранен в файле {output_filename}.")

In [54]:
if __name__ == "__main__":
    # One sample sentence per demonstrated language.
    text_en = "Hello, world! This is an example of text processing in English."
    text_ru = "Привет, мир! Это пример обработки текста на русском языке."
    text_de = "Hallo Welt! Dies ist ein Beispiel für Textverarbeitung auf Deutsch."

    # Build every processor before running any of them.
    processor_en, processor_ru, processor_de = processors = [
        TextProcessor(sample, language=code)
        for code, sample in (('en', text_en), ('ru', text_ru), ('de', text_de))
    ]

    # Run the full method suite on each processor in turn.
    for processor in processors:
        processor.execute_all_methods()

    # Gather the per-language result lists, in en / ru / de order.
    results_en, results_ru, results_de = all_results = [
        processor.get_results() for processor in processors
    ]

    # Merge everything into a single HTML report.
    report_generator = HTMLReportGenerator(all_results)
    report_generator.generate_report("combined_text_processing_report.html")

Stemming is not recommended for Russian. Using lemmatization instead.
Общий отчет успешно сгенерирован и сохранен в файле combined_text_processing_report.html.
