In [1]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from torch.utils.data import Dataset as TorchDataset
import random
from tqdm import tqdm





In [2]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
db_name = 'CASML - Generative AI Hackathon'
LLM_model_id = "Qwen/Qwen1.5-1.8B-Chat"
cross_encoder_id = "BAAI/bge-reranker-v2-m3"  # scores sentence pairs during semantic chunking
embedding_model_id = "intfloat/multilingual-e5-large"  # encoder and tokenizer used for triplet fine-tuning

# Limits passed to RecursiveCharacterTextSplitter; chunk_size is also the
# len() threshold above which a semantic chunk gets re-split.
chunk_overlap = 300
chunk_size = 1000

In [3]:
# Load the page texts. Later cells index this mapping with str(i) for
# i in range(len(texts)), i.e. with the keys "0", "1", ...
with open("data/texts.json", "r", encoding="utf-8") as f:
    texts = json.load(f)

In [4]:
import re

def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
    """Blank out every line that matches a header or footer pattern.

    Args:
        text: Input string; patterns are applied per line (``re.MULTILINE``).
        header_patterns: List of regexes for header lines. Defaults to any
            line containing ``Header``.
        footer_patterns: List of regexes for footer lines. Defaults to any
            line containing ``Footer``.

    Returns:
        The text with matching spans replaced by the empty string (the line
        break of a removed line stays in place) and outer whitespace stripped.
    """
    headers = [r'^.*Header.*$'] if header_patterns is None else header_patterns
    footers = [r'^.*Footer.*$'] if footer_patterns is None else footer_patterns

    cleaned = text
    for line_pattern in headers + footers:
        cleaned = re.sub(line_pattern, '', cleaned, flags=re.MULTILINE)
    return cleaned.strip()

def remove_special_characters(text, special_chars=None):
    """Delete every character matched by ``special_chars``.

    Args:
        text: Input string.
        special_chars: Regex of characters to delete. The default keeps only
            ASCII letters, digits, whitespace and ``. , ; : ' " ? ! -``;
            everything else, including non-ASCII letters, is removed.

    Returns:
        The filtered text with leading/trailing whitespace stripped.
    """
    pattern = special_chars
    if pattern is None:
        pattern = r'[^A-Za-z0-9\s\.,;:\'\"\?\!\-]'
    return re.sub(pattern, '', text).strip()

def remove_repeated_substrings(text, pattern=r'\.{2,}'):
    """Replace each match of ``pattern`` with a single ``.``.

    With the default pattern, runs of two or more dots collapse to one dot.
    The replacement is always ``.``, whatever pattern is supplied.

    Returns:
        The rewritten text with leading/trailing whitespace stripped.
    """
    return re.compile(pattern).sub('.', text).strip()

def remove_extra_spaces(text):
    """Collapse every run of whitespace into a single space.

    Line breaks count as whitespace, so paragraph breaks are NOT preserved.
    The earlier ``\\n\\s*\\n`` -> ``\\n\\n`` pass was dropped: the ``\\s+``
    substitution below overwrote its result, so it had no effect on the output.

    Args:
        text: Input string.

    Returns:
        The text with whitespace runs replaced by one space and
        leading/trailing whitespace stripped.
    """
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_text(text, metadata=None, language=None):
    """Clean one raw page of text for chunking and embedding.

    Steps, in order: drop header/footer lines, replace HTML tags with a space,
    remove URLs and e-mail addresses, strip characters outside the allowed
    set, normalise whitespace, lowercase.

    Whitespace is normalised AFTER special characters are stripped. Doing it
    before (as this function used to) left double spaces wherever a removed
    character had been surrounded by spaces, e.g. "a & b" -> "a  b".

    Args:
        text: Raw page text. Keep its line breaks if header/footer removal
            should work per line.
        metadata: Unused; kept for interface compatibility.
        language: Unused; kept for interface compatibility.

    Returns:
        The cleaned, lowercased, single-spaced text.
    """
    # Drop header/footer lines while line structure is still present.
    text = remove_headers_footers(text)
    # Replace HTML tags with a space so neighbouring words do not fuse.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove URLs and e-mail addresses.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    # Strip characters outside the allowed set (whitespace is kept).
    text = remove_special_characters(text)
    # Normalise whitespace last so gaps left by the removals above collapse too.
    text = remove_extra_spaces(text)
    # Lowercasing cannot introduce whitespace, so no further strip is needed.
    return text.lower()

In [5]:
# Clean every page in place.
# Line breaks are deliberately NOT flattened before preprocessing: the
# header/footer patterns are line-anchored (`^.*Header.*$` with re.MULTILINE),
# so on a page already joined into one line they would delete the whole page
# whenever "Header" or "Footer" occurs anywhere in it. preprocess_text
# collapses line breaks itself during whitespace normalisation, so pages
# without such lines come out the same as before.
for page_num in texts:
    texts[page_num] = preprocess_text(texts[page_num])

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured with the notebook-wide chunk_size / chunk_overlap settings.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [7]:
from nltk.tokenize import sent_tokenize
from statistics import mean, median


# Semantic chunking: walk through each page sentence by sentence and keep
# appending the next sentence to the current chunk while the cross-encoder
# scores it as related to the chunk's last 50 words.

cross_encoder = CrossEncoder(cross_encoder_id, device=device)

semantic_chunks = {}

# A sentence joins the current chunk when its score is above this value.
semantic_similarity_threshold = 0.15


buffer_lengths = []  # sentence count of every merged chunk (before size-based re-splitting)
scores = []  # every cross-encoder score, kept for inspection

for i in tqdm(range(len(texts))):
    text = texts[str(i)]
    sentences = sent_tokenize(text)

    # A page that is empty after preprocessing has no sentences; without this
    # guard `sentences[0]` below raises IndexError and aborts the whole run.
    if not sentences:
        semantic_chunks[i] = []
        continue

    joined_sentences = []
    buffer = sentences[0]
    max_buffer = 1

    for j in range(1, len(sentences)):
        # Compare only the tail of the chunk with the candidate sentence.
        s1 = buffer.split()[-50:]
        s2 = sentences[j]
        score = cross_encoder.predict([(" ".join(s1), s2)])[0]
        scores.append(score)

        if score > semantic_similarity_threshold:
            buffer += " " + s2
            max_buffer += 1
        else:
            # Close the current chunk and start a new one from this sentence.
            joined_sentences.append(buffer.strip())
            buffer_lengths.append(max_buffer)
            buffer = s2
            max_buffer = 1

    if buffer:
        joined_sentences.append(buffer.strip())
        buffer_lengths.append(max_buffer)

    # Chunks longer than chunk_size characters are re-split by the character splitter.
    refined_chunks = []
    for chunk in joined_sentences:
        if len(chunk) > chunk_size:
            refined_chunks.extend(text_splitter.split_text(chunk))
        else:
            refined_chunks.append(chunk)

    semantic_chunks[i] = refined_chunks

# Flatten into one list of (page index, chunk) pairs.
text_chunks_numbered = [(i, chunk) for i, chunks in semantic_chunks.items() for chunk in chunks]

# Assign a global running id to every chunk.
global_indexed = [(idx, key, chunk) for idx, (key, chunk) in enumerate(text_chunks_numbered)]
global_chunk_ids, page_numbers, text_chunks = zip(*global_indexed)


# Statistics over merged-chunk lengths, measured in sentences. They are taken
# before re-splitting, so "Total chunks" can differ from len(text_chunks).
print("\n--- Chunk length statistics ---")
print(f"Total chunks: {len(buffer_lengths)}")
print(f"Min length: {min(buffer_lengths)}")
print(f"Max length: {max(buffer_lengths)}")
print(f"Mean length: {mean(buffer_lengths):.2f}")
print(f"Median length: {median(buffer_lengths):.2f}")

100%|██████████| 625/625 [02:10<00:00,  4.77it/s]


--- Chunk length statistics ---
Total chunks: 7078
Min length: 1
Max length: 14
Mean length: 1.55
Median length: 1.00





In [8]:
# Free memory: drop the cross-encoder, run the garbage collector and release
# torch's cached CUDA memory.

del cross_encoder

import gc
gc.collect()


torch.cuda.empty_cache()

In [9]:
from toc import toc

def find_section_for_page(toc, page):
    """Locate the TOC section that a page belongs to.

    Args:
        toc: Mapping whose values are dicts with a ``"title"`` entry and a
            ``"sections"`` entry mapping section title -> start page.
        page: Page number to look up.

    Returns:
        ``("<chapter>/<section>", chapter, section)`` for the section with the
        greatest start page that is ``<= page``; when several sections share
        that start, the one listed last in ``toc`` wins. ``None`` if every
        section starts after ``page``.
    """
    best = None  # (chapter title, section title, start page) of the best match so far

    for chapter in toc.values():
        chapter_title = chapter["title"]
        for section_title, start in chapter["sections"].items():
            if start > page:
                continue
            # ">=" lets a later entry with the same start replace an earlier one.
            if best is None or start >= best[2]:
                best = (chapter_title, section_title, start)

    if best is None:
        return None

    chapter_title, section_title, _ = best
    return f"{chapter_title}/{section_title}", chapter_title, section_title

In [10]:
# One row per chunk: the chunk text and the index of the page it came from.
train_df = pd.DataFrame({
    "text": text_chunks,
    "page_num": page_numbers,
})

In [11]:
# Shift page indices by a fixed offset before the TOC lookup.
# NOTE(review): presumably 7 is the number of front-matter pages between the
# extracted page index and the TOC's page numbering; confirm.
PAGE_OFFSET = 7
# Recomputed from page_numbers rather than `+=` in place, so re-running this
# cell does not shift the pages a second time.
train_df["page_num"] = pd.Series(page_numbers, index=train_df.index) + PAGE_OFFSET

In [12]:
# Look up the TOC location of every chunk's page.
section_info = [find_section_for_page(toc, p) for p in train_df["page_num"]]

# find_section_for_page returns None for a page that precedes every section.
# Fail with the offending pages instead of an opaque TypeError from zip(*...).
unmatched_pages = sorted({p for p, info in zip(train_df["page_num"], section_info) if info is None})
if unmatched_pages:
    raise ValueError(f"No TOC section starts at or before page(s): {unmatched_pages}")

cats, chaps, secs = zip(*section_info)
train_df["category"] = cats
train_df["chapter"] = chaps
train_df["section"] = secs

In [13]:
# Tokenizer belonging to the embedding model that is fine-tuned below.
# NOTE(review): the multilingual-e5 model card asks for a "query: " or
# "passage: " prefix on every input; no prefix is added anywhere in this
# notebook. Confirm that this is intended.
tokenizer = AutoTokenizer.from_pretrained(embedding_model_id)

class TripletTextDataset(TorchDataset):
    """Builds (anchor, positive, negative) text triplets on the fly.

    For the row at ``idx``:
      * anchor   - that row's text;
      * positive - a different row from the same ``category`` (the anchor
                   itself only when its category has a single row);
      * negative - a text from a random category of a different ``chapter``.

    Sampling uses the global ``random`` module, so results are reproducible
    once that module is seeded.

    Args:
        df: DataFrame with ``text``, ``category`` and ``chapter`` columns.
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors of shape
            ``(1, max_length)``.
        max_length: Length every text is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Grouping by category and by chapter.
        self.groups_by_cat = self.df.groupby("category")["text"].apply(list).to_dict()
        self.groups_by_chapter = self.df.groupby("chapter")["category"].apply(set).to_dict()
        self.categories = list(self.groups_by_cat.keys())
        self.chapters = list(self.groups_by_chapter.keys())

        # category -> chapter
        self.cat2chapter = dict(zip(self.df["category"], self.df["chapter"]))

        # Row positions per category, so the anchor row can be excluded when
        # a positive is drawn.
        self._rows_by_cat = {}
        # Categories per chapter in first-seen order. Sampling from
        # list(set(...)) depended on string hashing, which changes between
        # interpreter runs even when `random` is seeded.
        ordered_cats = {}
        for row, (cat, chapter) in enumerate(zip(self.df["category"], self.df["chapter"])):
            self._rows_by_cat.setdefault(cat, []).append(row)
            ordered_cats.setdefault(chapter, {})[cat] = None
        self._cats_by_chapter = {chapter: list(cats) for chapter, cats in ordered_cats.items()}

    def __len__(self):
        return len(self.df)

    def _tokenize(self, text):
        """Tokenize one text to fixed length; returns tensors of shape (1, max_length)."""
        return self.tokenizer(
            text, truncation=True, padding="max_length",
            max_length=self.max_length, return_tensors="pt"
        )

    def __getitem__(self, idx):
        anchor_text = self.df.loc[idx, "text"]
        anchor_cat = self.df.loc[idx, "category"]
        anchor_chapter = self.df.loc[idx, "chapter"]

        # Positive: another row of the same category (same section). Fall
        # back to the anchor row only when the category has no other row.
        same_cat_rows = self._rows_by_cat[anchor_cat]
        candidate_rows = [row for row in same_cat_rows if row != idx] or same_cat_rows
        pos_text = self.df.loc[random.choice(candidate_rows), "text"]

        # Negative: a text from a category that belongs to a different chapter.
        other_chapters = [ch for ch in self.chapters if ch != anchor_chapter]
        neg_chapter = random.choice(other_chapters)
        neg_cat = random.choice(self._cats_by_chapter[neg_chapter])
        neg_text = random.choice(self.groups_by_cat[neg_cat])

        anchor = self._tokenize(anchor_text)
        positive = self._tokenize(pos_text)
        negative = self._tokenize(neg_text)

        # squeeze(0) removes only the batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also drop the sequence
        # dimension when max_length == 1.
        return {
            "anchor_input_ids": anchor["input_ids"].squeeze(0),
            "anchor_attention_mask": anchor["attention_mask"].squeeze(0),
            "positive_input_ids": positive["input_ids"].squeeze(0),
            "positive_attention_mask": positive["attention_mask"].squeeze(0),
            "negative_input_ids": negative["input_ids"].squeeze(0),
            "negative_attention_mask": negative["attention_mask"].squeeze(0),
        }

In [14]:
class TripletModel(torch.nn.Module):
    """Transformer encoder producing L2-normalised, mean-pooled sentence embeddings.

    Args:
        model_name: Identifier or path passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)

    @staticmethod
    def pooling(x, mask):
        """Mean of the token vectors in ``x`` selected by ``mask``.

        A static method rather than a lambda stored on the instance: a lambda
        attribute prevents the module object from being pickled.

        Args:
            x: Token embeddings, indexed (batch, token, feature).
            mask: Attention mask, indexed (batch, token); 1 keeps a token.

        Returns:
            One pooled vector per batch row. A row whose mask is all zeros
            yields a zero vector instead of NaN from a division by zero.
        """
        # Counts are whole numbers, so clamping to 1 only affects empty rows.
        token_counts = mask.sum(1, keepdim=True).clamp(min=1)
        return (x * mask.unsqueeze(-1)).sum(1) / token_counts

    def encode(self, input_ids, attention_mask):
        """Encode one batch of token ids into unit-length embeddings."""
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.pooling(outputs.last_hidden_state, attention_mask)
        return torch.nn.functional.normalize(pooled, p=2, dim=1)

    def forward(self, anchor_input_ids, anchor_attention_mask,
                positive_input_ids, positive_attention_mask,
                negative_input_ids, negative_attention_mask, **kwargs):
        """Embed the three members of a triplet batch.

        Extra keyword arguments are accepted and ignored.

        Returns:
            Tuple ``(anchor_emb, pos_emb, neg_emb)``.
        """
        anchor_emb = self.encode(anchor_input_ids, anchor_attention_mask)
        pos_emb = self.encode(positive_input_ids, positive_attention_mask)
        neg_emb = self.encode(negative_input_ids, negative_attention_mask)
        return anchor_emb, pos_emb, neg_emb


class TripletLoss(torch.nn.Module):
    """Thin module wrapper around ``torch.nn.TripletMarginLoss``.

    Args:
        margin: Margin handed to ``TripletMarginLoss`` (default 0.3).
    """

    def __init__(self, margin=0.3):
        super(TripletLoss, self).__init__()
        criterion = torch.nn.TripletMarginLoss(margin=margin)
        self.loss_fn = criterion

    def forward(self, anchor_emb, pos_emb, neg_emb, **kwargs):
        """Return the triplet margin loss; extra keyword arguments are ignored."""
        triplet = (anchor_emb, pos_emb, neg_emb)
        return self.loss_fn(*triplet)

In [15]:
class TripletTrainer(Trainer):
    """``Trainer`` whose loss is a triplet margin loss over the model's three embeddings.

    Args:
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
        margin: Margin of the triplet loss (default 0.3, the ``TripletLoss``
            default that was used implicitly before).
    """

    def __init__(self, *args, margin=0.3, **kwargs):
        super().__init__(*args, **kwargs)
        # Build the loss module once instead of on every training step.
        self.triplet_loss = TripletLoss(margin=margin)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on a triplet batch and return the loss.

        Returns:
            ``loss``, or ``(loss, (anchor_emb, pos_emb, neg_emb))`` when
            ``return_outputs`` is true.
        """
        anchor_emb, pos_emb, neg_emb = model(**inputs)
        loss = self.triplet_loss(anchor_emb, pos_emb, neg_emb)
        return (loss, (anchor_emb, pos_emb, neg_emb)) if return_outputs else loss

In [16]:
# Triplet dataset over all chunks, tokenized with the embedding model's tokenizer.
train_dataset = TripletTextDataset(train_df, tokenizer)

# Load the pretrained encoder that is fine-tuned below.
model = TripletModel(embedding_model_id)

In [17]:
training_args = TrainingArguments(
    output_dir="./triplet_model",
    per_device_train_batch_size=1,
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',

    # 1 sample per step x 8 accumulated steps per optimizer update.
    gradient_accumulation_steps=8,

    # Mixed precision only when a GPU is present. The notebook explicitly
    # supports a CPU fallback for `device`, and a hard-coded fp16=True makes
    # this cell fail there.
    fp16=torch.cuda.is_available(),
    logging_steps=50,

    save_strategy="epoch",
    # Do not let the Trainer prune input columns.
    remove_unused_columns=False,
    report_to="none",
)


trainer = TripletTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [18]:
# Run the fine-tuning loop; the recorded output below reports train_runtime of about 6051 s.
trainer.train()

Step,Training Loss
50,0.2634
100,0.2344
150,0.1762
200,0.1406
250,0.1468
300,0.1386
350,0.1408
400,0.1311
450,0.1303
500,0.1262


TrainOutput(global_step=2796, training_loss=0.08668293875345004, metrics={'train_runtime': 6051.3482, 'train_samples_per_second': 3.696, 'train_steps_per_second': 0.462, 'total_flos': 0.0, 'train_loss': 0.08668293875345004, 'epoch': 3.0})

In [20]:
# === 5. Save the embedding model ===
# Writes the fine-tuned encoder (model.encoder) and the tokenizer to the same directory.

model.encoder.save_pretrained("./triplet_finetuned_encoder")
tokenizer.save_pretrained("./triplet_finetuned_encoder")

('./triplet_finetuned_encoder\\tokenizer_config.json',
 './triplet_finetuned_encoder\\special_tokens_map.json',
 './triplet_finetuned_encoder\\sentencepiece.bpe.model',
 './triplet_finetuned_encoder\\added_tokens.json',
 './triplet_finetuned_encoder\\tokenizer.json')