In [1]:
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
import uuid
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import re
import json




In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
db_name = 'CASML - Generative AI Hackathon'
LLM_model_id = "Qwen/Qwen1.5-1.8B-Chat"
cross_encoder_id = "BAAI/bge-reranker-v2-m3"
embedding_model_id = "intfloat/multilingual-e5-large"

chunk_overlap=300

In [3]:
# import pdfplumber
# texts = {}

# with pdfplumber.open("data/book.pdf") as pdf:
#     text = ""
#     for i, page in enumerate(pdf.pages):
#         if i < 18 or i > 642:
#             continue
#         text = page.extract_text()
#         texts[i - 18] = text

In [4]:
# with open("data/texts.json", "w", encoding="utf-8") as f:
#     json.dump(texts, f, ensure_ascii=False, indent=2)

In [5]:
with open("data/texts.json", "r", encoding="utf-8") as f:
    texts = json.load(f)

In [6]:
for page_num in texts:
    texts[page_num] = texts[page_num].replace("\n", " ")

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=chunk_overlap)

In [9]:
text_chunks = [{i: text_splitter.split_text(texts[str(i)])} for i in tqdm(range(len(texts)))]

text_chunks_numbered = []

for chunk_dict in text_chunks:
    key, values = next(iter(chunk_dict.items()))
    for chunk in values:
        text_chunks_numbered.append((key, chunk))

# Добавляем глобальный индекс
global_indexed = [(idx, key, chunk) for idx, (key, chunk) in enumerate(text_chunks_numbered)]

global_chunk_ids, page_numbers, text_chunks = zip(*global_indexed)

100%|██████████| 625/625 [00:00<00:00, 2648.33it/s]


In [10]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(embedding_model_id, model_kwargs={'dtype': torch.float16})

In [11]:
vectors = embedding_model.encode(text_chunks, batch_size=32, device=device, normalize_embeddings=True, show_progress_bar=True).tolist()

Batches:   0%|          | 0/85 [00:00<?, ?it/s]

In [12]:
from qdrant_client import QdrantClient, models

client = QdrantClient(":memory:")

client.create_collection(
    collection_name=db_name,
    on_disk_payload=True,
    vectors_config=models.VectorParams(
        size=1024,
        distance=models.Distance.COSINE,
        on_disk=True
    ),
)

True

In [13]:
for i in tqdm(range(len(vectors))):
    client.upsert(
        collection_name=db_name,
        points=[
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector=vectors[i],
                payload={
                    'text': text_chunks[i],
                    'page': page_numbers[i] + 7,
                    'chunk_index': global_chunk_ids[i]
                }
            )
        ]
    )

100%|██████████| 2715/2715 [00:02<00:00, 1222.71it/s]


In [14]:
import json


with open('data/queries.json') as files:
    queries = json.load(files)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(LLM_model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    LLM_model_id,
    dtype=torch.float16,
    device_map="auto"
)

In [80]:
def generation_pipeline(messages, max_new_tokens=512, do_sample=True, temperature=0.5, top_p=0.9) -> str:
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
    return answer

In [46]:
rephrase_prompt = "You are a query rewriter specialized in mental health and psychology topics. Your task is to rewrite the user’s question into short, clear search queries for vector retrieval.\n\nSTRICT RULES:\n- Output EXACTLY THREE rewritten query variants.\n- Separate them each on a NEW LINE.\n- DO NOT explain, comment, justify, or describe anything.\n- DO NOT answer the question.\n- Output ONLY the rewritten queries."

In [47]:
def rephrase_query(query_text) -> list[str]:
    messages = [
        {"role": "system", "content": rephrase_prompt},
        {"role": "user", "content": f"Rephrase this question into exactly three short search queries separated by '\n'. Do not explain anything. Question: {query_text}"},
    ]
    output = generation_pipeline(messages, max_new_tokens=512, do_sample=True, temperature=0.3, top_p=0.9)
    parts = output.split("\n")
    result = [p.strip() for p in parts][:3]
    return result

In [None]:
# Генерируем перефразированные запросы

# for i in tqdm(range(len(queries))):
#     print(queries[i]['question'])
#     new_queries = rephrase_query(queries[i]['question'])
#     queries[i]['new_queries'] = new_queries



What is the scientific method in psychology?




1. "What is the scientific method in psychology?"
2. "How does the scientific method apply to psychological research?"
3. "What principles guide the process of conducting a scientific study in psychology?"
What are the basic parts of a neuron?




1. Basic Neuron Parts Query: "What are the fundamental components of a neuron?"
2. Simplified Neuron Parts Query: "Identify the essential elements of a neuron."
3. Detailed Neuron Parts Query: "Provide a comprehensive list of the primary components found within a neuron."
What are the stages of sleep?




1. Stage 1 (NREM): This is the first stage of sleep, characterized by rapid eye movements, increased muscle tone, and reduced brain activity. It lasts about 30 minutes and is followed by deeper stages of sleep, such as NREM II and III, which last around 90 minutes.

2. Stage 2 (REM): The second stage of sleep, also known as Rapid Eye Movement (REM), occurs when the brain is more active and focused, producing vivid dreams and dreaming experiences. REM sleep typically lasts for about an hour and a half, followed by another stage called Stage 3 (Non-Rapid Eye Movement) that lasts for about 45 minutes.

3. Stage 3 (Non-Rapid Eye Movement): This stage of sleep, also known as Non-Rapid Eye Movement (NREM), is characterized by slow-wave sleep, during which the body relaxes and the brain releases hormones like melatonin. NREM sleep can last for up to two hours and is essential for memory consolidation, mood regulation, and overall physical well-being.
What is operant conditioning?




1. "What is operant conditioning and how does it work?"
2. "How does operant conditioning impact behavior through reinforcement learning?"
3. "In operant conditioning, what is the concept of 'negative reinforcement' and its role in shaping behavior?"
What is problem-solving in psychology?




1. "Problem-solving in psychology: Definition, techniques, and applications"
2. "Exploring the role of problem-solving in psychological research and practice"
3. "The impact of problem-solving skills on mental health outcomes"
What are the three stages of memory?


 10%|█         | 5/50 [00:13<01:59,  2.65s/it]


KeyboardInterrupt: 

In [None]:
# with open("data/new_queries.json", "w", encoding="utf-8") as f:
#     json.dump(queries, f, ensure_ascii=False, indent=2)

In [None]:
# del model

# import gc
# gc.collect()

# torch.cuda.empty_cache()

In [49]:
with open('data/new_queries.json') as files:
    queries = json.load(files)

In [51]:
def get_candidates(queries, top_k_db=5):
    results = {}

    for query in tqdm(queries):
        qid = query['query_id']
        query_texts = [query['question']] + query['new_queries']

        # собираем все попадания отсюда
        all_hits = []

        for qt in query_texts:
            vec = embedding_model.encode(
                qt,
                normalize_embeddings=True,
                device=device
            ).tolist()

            hits = client.search(
                collection_name=db_name,
                query_vector=vec,
                limit=top_k_db
            )

            for hit in hits:
                all_hits.append({
                    "text": hit.payload['text'],
                    "page": hit.payload['page'],
                    "score": hit.score,
                    "chunk_index": hit.payload['chunk_index']
                })

        dedup = {}
        for item in all_hits:
            ci = item["chunk_index"]
            if ci not in dedup or item["score"] > dedup[ci]["score"]:
                dedup[ci] = item

        # Берем top_k_db
        final_items = sorted(dedup.values(), key=lambda x: x["score"], reverse=True)[:top_k_db]

        # логгирование
        if final_items:
            scores = [i["score"] for i in final_items]
            print(f"[{qid}] score min: {min(scores):.4f}, max: {max(scores):.4f}")

        results[qid] = final_items

    return results


relevant = get_candidates(queries, top_k_db=40)


  hits = client.search(


[1] score min: 0.8150, max: 0.8551
[2] score min: 0.8090, max: 0.8496
[3] score min: 0.8137, max: 0.8602





[4] score min: 0.8222, max: 0.8636
[5] score min: 0.8065, max: 0.8348
[6] score min: 0.8155, max: 0.8431


 12%|█▏        | 6/50 [00:00<00:04,  8.86it/s][A


[7] score min: 0.8104, max: 0.8425
[8] score min: 0.8046, max: 0.8744
[9] score min: 0.8163, max: 0.8656


 18%|█▊        | 9/50 [00:01<00:04,  9.47it/s][A

[10] score min: 0.8081, max: 0.8656
[11] score min: 0.8107, max: 0.8377
[12] score min: 0.7686, max: 0.8689





[13] score min: 0.8011, max: 0.8445
[14] score min: 0.8096, max: 0.8466
[15] score min: 0.8077, max: 0.8586


 30%|███       | 15/50 [00:01<00:03, 10.18it/s][A

[16] score min: 0.8284, max: 0.8846
[17] score min: 0.7945, max: 0.8766





[18] score min: 0.8003, max: 0.8684
[19] score min: 0.7989, max: 0.8505
[20] score min: 0.8122, max: 0.8748


 40%|████      | 20/50 [00:02<00:03,  9.89it/s][A

[21] score min: 0.8098, max: 0.8494
[22] score min: 0.8105, max: 0.8576
[23] score min: 0.7998, max: 0.8759





[24] score min: 0.8092, max: 0.8531
[25] score min: 0.8150, max: 0.8641
[26] score min: 0.8135, max: 0.8610


 52%|█████▏    | 26/50 [00:02<00:02,  9.93it/s][A


[27] score min: 0.8124, max: 0.8487
[28] score min: 0.8208, max: 0.8538
[29] score min: 0.8062, max: 0.8768


 58%|█████▊    | 29/50 [00:03<00:02,  9.85it/s][A

[30] score min: 0.8223, max: 0.8650
[31] score min: 0.8182, max: 0.8584
[32] score min: 0.8204, max: 0.8542





[33] score min: 0.8059, max: 0.8603
[34] score min: 0.7960, max: 0.8498
[35] score min: 0.7981, max: 0.8415


 70%|███████   | 35/50 [00:03<00:01, 10.22it/s][A


[36] score min: 0.8166, max: 0.8420
[37] score min: 0.8235, max: 0.8461
[38] score min: 0.8271, max: 0.8768


 76%|███████▌  | 38/50 [00:03<00:01,  9.75it/s][A


[39] score min: 0.8183, max: 0.8701
[40] score min: 0.8147, max: 0.8505
[41] score min: 0.8235, max: 0.8537


 82%|████████▏ | 41/50 [00:04<00:00,  9.91it/s][A

[42] score min: 0.8234, max: 0.8682
[43] score min: 0.8062, max: 0.8449
[44] score min: 0.8095, max: 0.8446




[45] score min: 0.8102, max: 0.8338
[46] score min: 0.8080, max: 0.8441





[47] score min: 0.8148, max: 0.8370
[48] score min: 0.8187, max: 0.8436
[49] score min: 0.8151, max: 0.8465


100%|██████████| 50/50 [00:05<00:00,  9.65it/s][A

[50] score min: 0.8126, max: 0.8590





In [52]:
def filter_by_db_score(relevant, score_threshold=0.0):
    filtered = {}

    for qid, items in relevant.items():
        selected = [item for item in items if item["score"] >= score_threshold]
        filtered[qid] = selected

    return filtered

# relevant = filter_by_db_score(relevant, score_threshold=0.8)


In [53]:
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder(cross_encoder_id, device=device)

In [54]:
def rerank_results(queries, relevant, top_k_rerank=3, rerank_threshold=None):
    reranked = {}

    for query in tqdm(queries):
        qid = query['query_id']
        q_text = query['question']

        candidates = relevant[qid]
        texts = [c["text"] for c in candidates]

        if len(texts) == 0:
            reranked[qid] = []
            continue

        pairs = [(q_text, t) for t in texts]
        scores = cross_encoder.predict(pairs)  # по аналогии с твоим примером

        scored = [
            {**c, "rerank_score": s}
            for c, s in zip(candidates, scores)
        ]

        scored = sorted(scored, key=lambda x: x["rerank_score"], reverse=True)
        scored = scored[:top_k_rerank]

        if rerank_threshold is not None:
            scored = [s for s in scored if s["rerank_score"] >= rerank_threshold]

        reranked[qid] = scored

    return reranked


In [55]:
relevant_reranked = rerank_results(
    queries,
    relevant,
    top_k_rerank=6,         # регулируй
    rerank_threshold=None   # или поставь, например, 0.1
)

100%|██████████| 50/50 [00:19<00:00,  2.58it/s]


In [59]:
# Очищаем память, чтобы LLM было вкусно

# del cross_encoder
# del embedding_model

import gc
gc.collect()


torch.cuda.empty_cache()

In [60]:
tokenizer = AutoTokenizer.from_pretrained(LLM_model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    LLM_model_id,
    dtype=torch.float16,
    device_map="auto"
)


In [89]:
def llm_answer(query, context):
    system_msg = (
        "You are an expert in psychology. Using only the provided retrieved documents, answer the following question. Do not add any external knowledge."
    )

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "system", "content": f"Context documents:\n{context}"},
        {"role": "user", "content": query}
    ]

    answer = generation_pipeline(messages, temperature=0.7, max_new_tokens=1024)
    return answer


In [None]:
# import requests

# def llm_answer(query, context):
#     system_msg = (
#         "You are an expert in psychology. Using only the provided retrieved documents, answer the following question. Do not add any external knowledge."
#     )

#     messages = [
#         {"role": "system", "content": system_msg},
#         {"role": "system", "content": f"Context documents:\n{context}"},
#         {"role": "user", "content": query}
#     ]

#     prompt = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True
#     )
#     prompt = "Привет, как дела?"
#     payload = {
#         "prompt": prompt,
#         "max_new_tokens": 512,
#         "do_sample": True,
#         "temperature": 0.9,
#         "top_p": 0.9
#     }

#     r = requests.post("https://shortly-pleasant-democrats-discussion.trycloudflare.com/generate", json=payload)
#     r.raise_for_status()

#     # Модель возвращает полный текст, нужно вырезать только продолжение
#     response_text = r.json()["response"]

#     print(response_text)
#     # Если нужно вырезать ввод с контекстом, делаем так:
#     if response_text.startswith(prompt):
#         response_text = response_text[len(prompt):].strip()

#     return response_text


In [None]:
# def reorder_chunks(chunks):
#     n = len(chunks)
#     if n <= 3:
#         return chunks  # просто как есть

#     first_three = chunks[:3]
#     # далее позиции считаем относительно chunks[3:]
#     odds = chunks[3::2]   # 4-й, 6-й, 8-й...
#     evens = chunks[4::2]  # 5-й, 7-й, 9-й...
#     evens = evens[::-1]   # развернуть

#     return first_three + odds + evens

# # пример формирования вывода
# def format_chunks(chunks):
#     return "\n".join(f"{i}. {c}" for i, c in enumerate(reorder_chunks(chunks), start=1))


In [90]:
from toc import toc

def find_section_for_page(toc, page):
    all_sections = []

    for chapter_data in toc.values():
        chapter_title = chapter_data["title"]
        for section_title, start in chapter_data["sections"].items():
            all_sections.append((chapter_title, section_title, start))

    # сортируем по старту секции
    all_sections.sort(key=lambda x: x[2])

    # ищем последнюю секцию, начавшуюся не позже страницы
    candidates = [(ch, sec, start) for ch, sec, start in all_sections if start <= page]
    if not candidates:
        return None

    ch, sec, _ = candidates[-1]
    return f"{ch}/{sec}"

In [91]:
def reorder_pairs(pairs):
    # pairs: список кортежей (chunk_text, page)
    n = len(pairs)
    if n <= 3:
        return pairs

    first_three = pairs[:3]
    odds = pairs[3::2]   # 4-й, 6-й, 8-й...
    evens = pairs[4::2]  # 5-й, 7-й, 9-й...
    evens = evens[::-1]

    return first_three + odds + evens

def format_pairs(pairs):
    lines = []
    for i, (chunk, page) in enumerate(pairs, start=1):
        sec = find_section_for_page(toc, page)
        if sec is None:
            sec = "Unknown"
        lines.append(f"{i}. {sec} - {chunk}")
    return "\n".join(lines)

In [92]:
outputs = []
result_pages = []

for query in tqdm(queries):
    qid = query['query_id']
    q_text = query['question']
    data = relevant_reranked[qid]

    # собираем пары (текст, страница)
    pairs = [(item["text"], item["page"]) for item in data]

    # переставляем
    reordered = reorder_pairs(pairs)

    # делаем контекст с секциями
    context = format_pairs(reordered)

    answer = llm_answer(q_text, context)

    outputs.append(answer)
    result_pages.append([p for _, p in reordered])


100%|██████████| 50/50 [10:26<00:00, 12.53s/it]


In [93]:
def merge_sequential(items, overlap: int) -> str:
    if not items:
        return ""
    items = sorted(items, key=lambda x: x["chunk_index"])

    merged_groups = []
    cur_text = items[0]["text"]
    prev_idx = items[0]["chunk_index"]

    for it in items[1:]:
        idx, t = it["chunk_index"], it["text"]
        if idx == prev_idx + 1:
            # учёт overlap: убираем дублирующийся префикс t
            max_k = min(overlap, len(cur_text), len(t))
            cut = 0
            for k in range(max_k, 0, -1):
                if cur_text[-k:] == t[:k]:
                    cut = k
                    break
            cur_text += t[cut:]
        else:
            merged_groups.append(cur_text)
            cur_text = t
        prev_idx = idx

    merged_groups.append(cur_text)
    return "\n".join(merged_groups)

In [94]:
ids = [q['query_id'] for q in queries]
references = [{"sections": [], "pages": list(set(pages))} for pages in result_pages]

for ref in references:
    for page in ref["pages"]:
        section = find_section_for_page(toc, page)
        if section:
            section = section.lower().replace(" ", "_")
            ref["sections"].append(section)

context_text = [
    merge_sequential(relevant_reranked[qid], chunk_overlap)
    for qid in ids
]

In [95]:
submission_df = pd.DataFrame({
    "ID": ids,
    "context": context_text,
    "answer": outputs,
    "references": references
})

In [96]:
submission_df.to_csv("data/submission8.csv", index=False)