# Проект по тематическому моделированию: выявление ключевых тем и их героев в романах Терри Пратчетта

In [None]:
!pip install bertopic sentence-transformers umap-learn hdbscan wordcloud plotly networkx spacy nltk seaborn tqdm

#### Импорт библиотек

In [45]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from google.colab import drive
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
from IPython.display import display, HTML

In [None]:
# Подключение к Google Drive
drive.mount('/content/drive')

##### Загрузка моделей

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

In [48]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [49]:
# Загрузка модели spaCy для NER
nlp = spacy.load('en_core_web_lg')

#### Настройка стилей для визуализации

In [50]:
plt.style.use('ggplot')
sns.set(style="whitegrid")

#### Путь к папке

In [51]:
books_dir = '/content/drive/MyDrive/final_project_pratchett_books'

####Список стоп-имен персонажей (простые обращения, мусор)

In [53]:

IGNORE_NAMES = set([
    'Mr', 'Mrs', 'Miss', 'Mister', 'Sir', 'Madam', 'Mistress', 'Lord', 'Lady',
    'Doctor', 'Dr', 'Captain', 'Policeman', 'Constable',
    'Father', 'Mother', 'Brother', 'Sister', 'Smith', 'Cook', 'said'
])

In [54]:
def clean_name(name):
    #Удаляем спецсимволы, кавычки, переносы, точки и лишние пробелы
    return re.sub(r'[^a-zA-Z\-\' ]+', '', name).strip().replace('\n','')

####Функции

In [55]:
# Функция для чтения и предварительной обработки книг
def read_books(directory):
    books = {}
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            book_name = os.path.splitext(filename)[0]
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
            books[book_name] = content
    return books

In [56]:
# Функция для разделения книги на главы
def split_into_chapters(text):
    # Шаблон для поиска заголовков глав
    chapter_pattern = re.compile(r'(Chapter \d+.*?|CHAPTER \d+.*?)\n', re.IGNORECASE)
    chapters = chapter_pattern.split(text)

    # Первый элемент может быть предисловием или вступлением
    chapters = chapters[1:] if len(chapters) > 1 else [text]

    # Группировка заголовков с соответствующим содержанием
    result = []
    for i in range(0, len(chapters), 2):
        if i+1 < len(chapters):
            chapter_title = chapters[i].strip()
            chapter_content = chapters[i+1].strip()
            result.append((chapter_title, chapter_content))

    return result

In [57]:
# Функция для разделения главы на сцены
def split_into_scenes(chapter_text):
    # Разделяем по пустым строкам или звездочкам/тире (часто используется для разделения сцен)
    scene_pattern = re.compile(r'\n\s*\n|\n\s*[*\-]{3,}\s*\n', re.MULTILINE)
    scenes = scene_pattern.split(chapter_text)
    return [scene.strip() for scene in scenes if scene.strip()]

In [58]:
# Функция для разделения текста на абзацы
def split_into_paragraphs(text):
    paragraphs = text.split('\n\n')
    return [p.strip() for p in paragraphs if p.strip()]

In [59]:
def extract_entities_batch(texts):
    persons = []
    docs = nlp.pipe(texts, batch_size=32, disable=["tagger", "parser"])
    for doc in docs:
        doc_persons = []
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                name = clean_name(ent.text)
                if len(name.split()) <= 3 and name and name not in IGNORE_NAMES:
                    doc_persons.append(name)
        persons.append(doc_persons)
    return persons

In [60]:
# Функция для извлечения именованных сущностей (NER)
def extract_entities(text):
    persons_list = extract_entities_batch([text])
    return persons_list[0]

In [61]:
# Функция для создания словаря персонажей из всех книг
def create_character_dictionary(books):
    all_characters = []
    CHUNK_SIZE = 150_000
    # Вместо всего текста обрабатываем по частям
    for book_name, content in books.items():
        for i in range(0, len(content), CHUNK_SIZE):
            chunk = content[i:i+CHUNK_SIZE]
            persons = extract_entities(chunk)
            all_characters.extend(persons)

    character_counts = Counter(all_characters)
    # Фильтрация для удаления редких имен (которые могут быть ошибками NER)
    main_characters = {char: count for char, count in character_counts.items() if count > 5}
    return main_characters

In [62]:
# Функция для создания корпуса абзацев
def create_paragraph_corpus(books):
    corpus = []
    metadata = []
    for book_name, content in tqdm(books.items()):
        chapters = split_into_chapters(content)
        for chapter_idx, (chapter_title, chapter_content) in enumerate(chapters):
            paragraphs = split_into_paragraphs(chapter_content)
            batch = []
            para_idxs = []
            for para_idx, paragraph in enumerate(paragraphs):
                if len(paragraph.split()) > 20:  # Минимальная длина абзаца
                    batch.append(paragraph)
                    para_idxs.append(para_idx)

                if len(batch) == 32:
                    persons_batch = extract_entities_batch(batch)
                    for b, person in zip(range(len(batch)), persons_batch):
                        corpus.append(batch[b])
                        metadata.append({
                            'book': book_name,
                            'chapter': chapter_idx,
                            'chapter_title': chapter_title,
                            'paragraph': para_idxs[b],
                            'characters': person
                        })
                    batch.clear()
                    para_idxs.clear()
            # Остаток в batch
            if batch:
                persons_batch = extract_entities_batch(batch)
                for b, person in zip(range(len(batch)), persons_batch):
                    corpus.append(batch[b])
                    metadata.append({
                        'book': book_name,
                        'chapter': chapter_idx,
                        'chapter_title': chapter_title,
                        'paragraph': para_idxs[b],
                        'characters': person
                    })
    return corpus, metadata

####Загрузка книг

In [63]:
# Загрузка книг
print("Загрузка книг...")
books = read_books(books_dir)
print(f"Загружено {len(books)} книг")

# Создание словаря персонажей
print("Создание словаря персонажей...")
character_dict = create_character_dictionary(books)
print(f"Найдено {len(character_dict)} основных персонажей")

# Создание корпуса абзацев
print("Создание корпуса абзацев...")
corpus, metadata = create_paragraph_corpus(books)
print(f"Создано {len(corpus)} абзацев для анализа")

# Сохранение корпуса в DataFrame
df = pd.DataFrame({
    'text': corpus,
    'book': [m['book'] for m in metadata],
    'chapter': [m['chapter'] for m in metadata],
    'chapter_title': [m['chapter_title'] for m in metadata],
    'paragraph': [m['paragraph'] for m in metadata],
    'characters': [m['characters'] for m in metadata]
})


Загрузка книг...
Загружено 19 книг
Создание словаря персонажей...
Найдено 584 основных персонажей
Создание корпуса абзацев...


  0%|          | 0/19 [00:00<?, ?it/s]

Создано 1787 абзацев для анализа


####Сохранение промежуточных результатов

In [64]:
df.to_csv('/content/drive/MyDrive/pratchett_analysis_corpus.csv', index=False)

####Создание и обучение модели BERTopic

In [65]:
# Настройка параметров модели
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=20, min_samples=10, metric='euclidean', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

In [66]:
# Создание модели BERTopic с настроенными параметрами
topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    nr_topics="auto",
    top_n_words=20,
    calculate_probabilities=True,
    verbose=True
)

In [67]:
# Обучение модели на корпусе
topics, probs = topic_model.fit_transform(corpus)

2025-06-27 19:26:42,936 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/56 [00:00<?, ?it/s]

2025-06-27 19:27:42,407 - BERTopic - Embedding - Completed ✓
2025-06-27 19:27:42,409 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-27 19:27:50,236 - BERTopic - Dimensionality - Completed ✓
2025-06-27 19:27:50,239 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-27 19:27:50,542 - BERTopic - Cluster - Completed ✓
2025-06-27 19:27:50,543 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-06-27 19:27:50,914 - BERTopic - Representation - Completed ✓
2025-06-27 19:27:50,916 - BERTopic - Topic reduction - Reducing number of topics
2025-06-27 19:27:50,933 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-27 19:27:51,305 - BERTopic - Representation - Completed ✓
2025-06-27 19:27:51,309 - BERTopic - Topic reduction - Reduced number of topics from 21 to 21


In [68]:
# Получение нформации о темах
topic_info = topic_model.get_topic_info()
print(f"Найдено {len(topic_info[topic_info['Topic'] != -1])} тем")

# Сохранение темы для каждого абзаца
df['topic'] = topics
df['topic_prob'] = [max(prob) if len(prob) > 0 else 0 for prob in probs]

# Сохранение обновленного DataFrame
df.to_csv('/content/drive/MyDrive/pratchett_analysis_with_topics.csv', index=False)

Найдено 20 тем


# 1. Визуализация результатов

In [78]:
if len(topic_info[topic_info['Topic'] != -1]) > 0:
    topic_model.visualize_topics().write_html('/content/drive/MyDrive/topic_visualization.html')
    topic_model.visualize_barchart().write_html('/content/drive/MyDrive/topics_barchart.html')
    topic_model.visualize_hierarchy().write_html('/content/drive/MyDrive/topics_hierarchy.html')
else:
    print("Темы для визуализации не найдены.")

# 2. Создание матрицы связи тем и персонажей

In [79]:
character_topic_matrix = defaultdict(lambda: defaultdict(int))

for _, row in df.iterrows():
    topic_id = row['topic']
    if topic_id != -1:  # Пропускаем выбросы
        for character in row['characters']:
            if character in character_dict:
                character_topic_matrix[character][topic_id] += 1

In [80]:
# Преобразование в DataFrame для визуализации
character_topic_df = pd.DataFrame(character_topic_matrix).fillna(0).T

# Нормализация матрицы для лучшей визуализации
character_topic_norm = normalize(character_topic_df, axis=1, norm='l1')
character_topic_df_norm = pd.DataFrame(
    character_topic_norm,
    index=character_topic_df.index,
    columns=character_topic_df.columns
)

# Отбираем только основных персонажей (с наибольшим количеством связей)
main_characters = character_topic_df.sum(axis=1).sort_values(ascending=False).head(10).index
main_character_topic_df = character_topic_df_norm.loc[main_characters]

# 3. Тепловая карта связи персонажей и тем

In [81]:
if main_character_topic_df.shape[0] > 0 and main_character_topic_df.shape[1] > 0:
    plt.figure(figsize=(20, 16))
    sns.heatmap(main_character_topic_df, cmap="YlGnBu", annot=False)
    plt.title('Связь между персонажами и темами (нормализованная)', fontsize=18)
    plt.ylabel('Персонажи', fontsize=14)
    plt.xlabel('Темы', fontsize=14)
    plt.tight_layout()
    plt.savefig('/content/drive/MyDrive/character_topic_heatmap.png', dpi=300)
    plt.close()
else:
    print("Невозможно построить heatmap: мало данных!")

# 4. Сетевая визуализация связей персонажей и тем

In [82]:
G = nx.Graph()

# Добавляем узлы для персонажей и тем
for character in main_characters:
    G.add_node(character, type='character')

topic_labels = {}
for topic_id in character_topic_df.columns:
    if topic_id != -1:
        topic_words = ", ".join([word for word, _ in topic_model.get_topic(topic_id)[:5]])
        topic_label = f"Topic {topic_id}: {topic_words}"
        topic_labels[topic_id] = topic_label
        G.add_node(topic_label, type='topic')

# Добавляем ребра между персонажами и темами
for character in main_characters:
    for topic_id in character_topic_df.columns:
        if topic_id != -1 and character_topic_df.loc[character, topic_id] > 0:
            weight = character_topic_df.loc[character, topic_id]
            if weight > 5:  # Порог для визуализации
                G.add_edge(character, topic_labels[topic_id], weight=weight)

In [83]:
# Визуализация графа
if G.number_of_edges() > 0 and G.number_of_nodes() > 0:
    plt.figure(figsize=(20, 20))
    pos = nx.spring_layout(G, k=0.3, iterations=50)

    character_nodes = [node for node, attr in G.nodes(data=True) if attr.get('type') == 'character']
    topic_nodes = [node for node, attr in G.nodes(data=True) if attr.get('type') == 'topic']

    nx.draw_networkx_nodes(G, pos, nodelist=character_nodes, node_color='skyblue', node_size=800, alpha=0.8)
    nx.draw_networkx_nodes(G, pos, nodelist=topic_nodes, node_color='lightgreen', node_size=1000, alpha=0.8)
    edge_weights = [G[u][v]['weight'] / 10 for u, v in G.edges()]
    nx.draw_networkx_edges(G, pos, width=edge_weights, alpha=0.5, edge_color='gray')
    nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')

    plt.title('Сеть связей персонажей и тем', fontsize=20)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('/content/drive/MyDrive/character_topic_network.png', dpi=300)
    plt.close()
else:
    print("Нет связей для графовой визуализации.")

# 5. Визуализация динамики тем по книгам

In [84]:
topic_book_matrix = defaultdict(lambda: defaultdict(int))

for _, row in df.iterrows():
    book = row['book']
    topic_id = row['topic']
    if topic_id != -1:
        topic_book_matrix[book][topic_id] += 1

topic_book_df = pd.DataFrame(topic_book_matrix).fillna(0)
topic_book_norm = normalize(topic_book_df, axis=0, norm='l1')
topic_book_df_norm = pd.DataFrame(
    topic_book_norm,
    index=topic_book_df.index,
    columns=topic_book_df.columns
)

# Выбираем топ-15 тем для визуализации
top_topics = topic_book_df.sum(axis=1).sort_values(ascending=False).head(15).index

# Преобразуем данные для визуализации
plot_data = []
for topic_id in top_topics:
    for book in topic_book_df.columns:
        if topic_id in topic_labels:
            plot_data.append({
                'Book': book,
                'Topic': topic_labels[topic_id],
                'Frequency': topic_book_df_norm.loc[topic_id, book]
            })

plot_df = pd.DataFrame(plot_data)