In [3]:
import json
import os
from datetime import datetime

import requests
from bs4 import BeautifulSoup

# Download the article page and extract its text content
url = "https://blog.dzencode.com/ru/illyuziya-kachestva-vash-sayt-idealen-pozdravlyaem-vy-tolko-chto-sozhgli-byudzhet/"
# A timeout keeps the cell from hanging forever on an unresponsive server
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors (4xx/5xx) instead of saving an error page as the article
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
article_text = soup.get_text(separator='\n')

# Make sure the output directory exists before anything is written into it
os.makedirs('artifacts', exist_ok=True)

# Save the raw extracted text
with open('artifacts/article.txt', 'w', encoding='utf-8') as f:
    f.write(article_text)

# Article metadata stored next to the text
metadata = {
    "url": url,
    "language": "ru",
    "date": "unknown",  # Publication date is not specified; can be filled in later
    "topic": "Иллюзия качества в веб-дизайне",
    "project": "RAG_Pipeline_Test",
    # NOTE(review): "lang" duplicates "language"; kept so any existing reader
    # of metadata.json keeps working — confirm which key consumers rely on
    "lang": "ru"
}
with open('artifacts/metadata.json', 'w', encoding='utf-8') as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2)

In [8]:
import re

# Read the raw article text saved by the download step
with open('artifacts/article.txt', 'r', encoding='utf-8') as f:
    article_text = f.read()

# Preprocessing: strip leftover HTML tags and normalise whitespace
cleaned_text = re.sub(r'<.*?>', '', article_text)  # Remove HTML tags
# Collapse runs of horizontal whitespace only. `[^\S\r\n]` means "whitespace
# except CR/LF", so line breaks survive this step; a plain `\s+` would also
# swallow newlines and flatten the whole article into a single line, turning
# the line split/filter below into a no-op.
cleaned_text = re.sub(r'[^\S\r\n]+', ' ', cleaned_text)
cleaned_lines = cleaned_text.splitlines()  # Split on any kind of line break
# Trim each line and drop the ones that are empty after trimming
cleaned_text = '\n'.join(line.strip() for line in cleaned_lines if line.strip())

# Save the cleaned text
with open('artifacts/cleaned_article.txt', 'w', encoding='utf-8') as f:
    f.write(cleaned_text)

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the cleaned article text written by the preprocessing step
with open('artifacts/cleaned_article.txt', 'r', encoding='utf-8') as src:
    cleaned_text = src.read()

# Split the text into overlapping chunks; sizes are measured with len()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50, length_function=len)
chunks = text_splitter.split_text(cleaned_text)

# Metadata attached to every chunk record
chunk_metadata = {
    "url": "https://blog.dzencode.com/ru/illyuziya-kachestva-vash-sayt-idealen-pozdravlyaem-vy-tolko-chto-sozhgli-byudzhet/",
    "language": "ru",
    "date": "unknown",
    "topic": "Иллюзия качества в веб-дизайне",
    "project": "RAG_Pipeline_Test",
}

# One record per chunk: sequential id, the chunk text, and its own copy of the
# metadata (copied so records do not share a single dict object)
data = [
    {"id": idx, "text": piece, "metadata": dict(chunk_metadata)}
    for idx, piece in enumerate(chunks)
]

# Write the records as JSONL: one JSON object per line
with open('artifacts/rag_article.jsonl', 'w', encoding='utf-8') as out:
    out.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in data)

In [17]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
import json
import pickle

# Load the sentence-embedding model
# NOTE(review): the article metadata marks the text as Russian ("ru"); this
# checkpoint is presumably English-focused — confirm it is the intended model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Read the chunk records back from the JSONL file, one JSON object per line.
# `chunks` is rebound here to the parsed records rather than plain strings.
with open('artifacts/rag_article.jsonl', 'r', encoding='utf-8') as f:
    chunks = [json.loads(line) for line in f]