In [1]:
import os
import re
import json
import torch
from uuid import uuid4
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
from qdrant_client.models import PointStruct
from transformers import AutoTokenizer, AutoModel
import uuid

In [2]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct

In [3]:
MODEL_NAME = "intfloat/multilingual-e5-large"
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

In [4]:
#Парсинг .md файлов в json
def parse_md_to_json(md_path: str, program_tag: str, output_path: str):
    with open(md_path, "r", encoding="utf-8") as f:
        text = f.read()

    pattern = r"\|\s*(\d{1,2}|[1,2,3,4](?:,\s*\d)?|)\s*\|\s*(.+?)\s*\|\s*(\d+)\s*\|\s*(\d+)\s*\|"
    matches = re.findall(pattern, text)

    data = []
    for match in matches:
        semester_raw, name, credits, hours = match
        semesters = [int(s.strip()) for s in semester_raw.split(",") if s.strip().isdigit()]
        if not semesters:
            semesters = [None]
        for sem in semesters:
            data.append({
                "program": program_tag,
                "semester": sem,
                "name": name.strip(),
                "credits": int(credits),
                "hours": int(hours)
            })

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

parse_md_to_json("ai.md", "Искусственный интеллект", "ai.json")
parse_md_to_json("ai_product.md", "Управление ИИ-продуктами", "ai_product.json")

In [5]:
#Загрузка json с учебными планами и json с описанием программы (парсинг с помощью LLM)
with open('ai.json', 'r', encoding='utf-8') as file:
    ai_disciplines = json.load(file)
with open('ai_product.json', 'r', encoding='utf-8') as file:
    ai_product_disciplines = json.load(file)
with open('itmo_ai.json', 'r', encoding='utf-8') as file:
    ai_about= json.load(file)
with open('itmo_ai_product.json', 'r', encoding='utf-8') as file:
    ai_product_about = json.load(file)

In [6]:
def map_section_name(section: str) -> str:
    section_lower = section.lower()
    if "бюджетные места" in section_lower:
        return "Количество мест. бюджетные места"
    elif "контрактые места" in section_lower or "платн" in section_lower:
        return "Количество мест. контрактные (платные) места"
    elif "целевые места" in section_lower:
        return "Количество мест. целевые места"
    return section

In [12]:
#Создание чанков из описаний учебных планов
def chunk_disciplines(disciplines):
    chunks = []
    for entry in disciplines:
        if entry["semester"] is None or "|" in entry["name"]:
            continue
        chunks.append({
            "раздел": entry.get("name"),
            "текст": f"Название дисциплины: {entry['name']}. Семестр: {entry['semester']}. "
                     f"Зачётных единиц: {entry['credits']}. Часов: {entry['hours']}.",
            "program": entry["program"]
        })
    return chunks

In [9]:
#Создание чанков из описаний образовательных программ
def chunk_text(text, max_sentences=3):
    text = text.replace("\t", " ")
    text = re.sub(r'\s+', ' ', text).strip()
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    chunks = []
    for i in range(0, len(sentences), max_sentences):
        chunk = " ".join(sentences[i:i+max_sentences])
        if len(chunk) > 0:
            chunks.append(chunk)
    return chunks

In [14]:
def chunk_program_info(obj, program_name, path=""):
    chunks = []
    if isinstance(obj, str):
        if re.match(r'^https?://', obj):
            return []
        for chunk in chunk_text(obj):
            section_name = map_section_name(path)
            full_text = f"{path}: {chunk}" if path else chunk
            chunks.append({
                "раздел": section_name,
                "текст": full_text,
                "program": program_name})
    elif isinstance(obj, list):
        for i, item in enumerate(obj):
            chunks += chunk_program_info(item, program_name, f"{path}[{i}]")
    elif isinstance(obj, dict):
        for key, value in obj.items():
            new_path = f"{path}.{key}" if path else key
            chunks += chunk_program_info(value, program_name, new_path)
    return chunks

In [17]:
# Функция для создания эмбеддингов
def embed_chunk(text):
    return model.encode(f"passage: {text}", normalize_embeddings=True)

In [15]:
# Подготовка всех чанков
all_chunks = chunk_disciplines(ai_disciplines) + \
             chunk_disciplines(ai_product_disciplines) + \
             chunk_program_info(ai_about, "ai") + \
             chunk_program_info(ai_product_about, "ai_product")

In [21]:
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE) 
)

  client.recreate_collection(


True

In [18]:
for chunk in all_chunks:
    chunk["embedding"] = embed_chunk(chunk["текст"]).tolist()

In [22]:
# Подготовка всех PointStruct
points = [
    PointStruct(
        id=uuid.uuid4().hex,
        vector=chunk["embedding"],
        payload={
            "program": chunk["program"].lower(),  # название программы вынесено в качестве тега фильтрации
            "раздел": chunk["раздел"],
            "текст": chunk["текст"]
        }
    )
    for chunk in all_chunks
]

# Функция для разбиения на батчи
def batch(iterable, batch_size):
    for i in range(0, len(iterable), batch_size):
        yield iterable[i:i + batch_size]

# Загрузка по 200 штук
for i, batch_points in enumerate(batch(points, 200), 1):
    client.upsert(collection_name=COLLECTION_NAME, points=batch_points)
    print(f"Загружен батч {i} ({len(batch_points)} точек)")

Загружен батч 1 (200 точек)
Загружен батч 2 (194 точек)
