<a href="https://colab.research.google.com/github/21092004Goda/data_anal/blob/main/RAG_system_lab_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
%%capture
!pip install --quiet langchain_huggingface
!pip install --quiet sentence-transformers
!pip install --quiet faiss-gpu
!pip install --quiet rapidfuzz
!pip install --quiet arxiv

In [6]:
import arxiv
from typing import List, Dict, Any

class ArxivTopicFetcher:

    def __init__(self, max_results_per_request: int = 100, delay_seconds: float = 3.0):
        self.client = arxiv.Client(
            page_size=max_results_per_request,
            delay_seconds=delay_seconds,
            num_retries=3
        )
        self.common_categories = {
            'machine_learning': 'cs.LG',
            'artificial_intelligence': 'cs.AI',
            'computer_vision': 'cs.CV',
            'nlp': 'cs.CL',
            'robotics': 'cs.RO',
            'databases': 'cs.DB',
            'security': 'cs.CR',
            'networks': 'cs.NI',
            'algorithms': 'cs.DS',
            'hci': 'cs.HC'
        }

    def build_query(self, category: str) -> str:
        if category in self.common_categories:
            category = self.common_categories[category]
        return f"cat:{category}"

    def fetch_articles(self, query: str, max_results: int = 50,
                       sort_by: str = 'submittedDate',
                       sort_order: str = 'descending') -> List[Dict[str, Any]]:

        sort_criterion = {
            'relevance': arxiv.SortCriterion.Relevance,
            'lastUpdatedDate': arxiv.SortCriterion.LastUpdatedDate,
            'submittedDate': arxiv.SortCriterion.SubmittedDate
        }.get(sort_by, arxiv.SortCriterion.SubmittedDate)

        search = arxiv.Search(
            query=query,
            max_results=min(max_results, 2000),
            sort_by=sort_criterion,
            sort_order=arxiv.SortOrder.Descending if sort_order == 'descending'
                     else arxiv.SortOrder.Ascending
        )

        print(f"Запрос: {query}")
        articles = []

        try:
            for result in self.client.results(search):
                articles.append({
                    'arxiv_id': result.entry_id.split('/')[-1],
                    'title': result.title,
                    'authors': [a.name for a in result.authors],
                    'abstract': result.summary.replace('\n', ' '),
                    'published': result.published.date() if result.published else None,
                    'categories': result.categories,
                    'pdf_url': result.pdf_url
                })

            print(f"Получено статей: {len(articles)}")

        except Exception as e:
            print("Ошибка:", e)

        return articles

    def fetch_by_category(self, category: str, max_results: int = 50, **kwargs):
        query = self.build_query(category)
        return self.fetch_articles(query, max_results, **kwargs)

    def print_summary(self, articles: List[Dict[str, Any]], n: int = 5):
        if not articles:
            print("Пусто.")
            return

        print("\n=== Короткий обзор ===")
        for idx, a in enumerate(articles[:n]):
            print(f"\n{idx+1}. {a['title']}")
            print("   Авторы:", ", ".join(a['authors'][:3]) + (" и др." if len(a['authors']) > 3 else ""))
            print("   Дата:", a['published'])
            print("   ID:", a['arxiv_id'])
            print("   Категории:", ", ".join(a['categories']))
            print("   Абстракт:", a['abstract'][:200], "...")


In [7]:
import pandas as pd
import matplotlib.pyplot as plt

class TextChunker:

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str):
        if not text or not isinstance(text, str):
            return []

        words = text.split()
        chunks = []
        start = 0
        while start < len(words):
            end = start + self.chunk_size
            chunk_words = words[start:end]
            if not chunk_words:
                break
            chunks.append(" ".join(chunk_words))
            start = end - self.overlap

        return chunks

    def chunk_many(self, texts):
        return [self.chunk_text(t) for t in texts]

    def to_dataframe(self, articles):
        rows = []
        for a in articles:
            chunks = self.chunk_text(a["abstract"])
            rows.append({
                "id": a["arxiv_id"],
                "title": a["title"],
                "authors": ", ".join(a["authors"]),
                "published": a["published"],
                "categories": ", ".join(a["categories"]),
                "pdf_url": a["pdf_url"],
                "abstract": a["abstract"],
                "chunks": chunks
            })
        return pd.DataFrame(rows)

    def chunk_statistics(self, df: pd.DataFrame, plot: bool = False):
        chunk_counts = df["chunks"].apply(len)
        stats = {
            "Total articles": len(df),
            "Total chunks": int(chunk_counts.sum()),
            "Min chunks per article": int(chunk_counts.min()),
            "Max chunks per article": int(chunk_counts.max()),
            "Mean chunks per article": float(chunk_counts.mean()),
            "Median chunks per article": float(chunk_counts.median())
        }

        print("\n=== Chunking Statistics ===")
        for k, v in stats.items():
            print(f"{k}: {v}")

        if plot:
            plt.figure(figsize=(8,4))
            plt.hist(chunk_counts, bins=range(1, chunk_counts.max()+2), alpha=0.7, color='skyblue', edgecolor='black')
            plt.title("Distribution of Chunks per Article")
            plt.xlabel("Number of Chunks")
            plt.ylabel("Number of Articles")
            plt.xticks(range(1, chunk_counts.max()+2))
            plt.show()

        return stats


In [8]:
import numpy as np
import pandas as pd
import faiss
from langchain_huggingface import HuggingFaceEmbeddings

class ArxivVectorPipeline:

    def __init__(
        self,
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        device="cpu",
        normalize=False
    ):
        self.embeddings_model = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={"device": device},
            encode_kwargs={"normalize_embeddings": normalize}
        )

        self.index = None
        self.embedding_dim = None
        self.chunks_df = None
        self.embeddings = None

    def _flatten_chunks(self, df: pd.DataFrame):
        rows = []
        for _, row in df.iterrows():
            base = {
                "id": row["id"],
                "title": row["title"],
                "authors": row["authors"],
                "published": row["published"],
                "categories": row["categories"],
                "pdf_url": row["pdf_url"],
                "abstract": row["abstract"]
            }

            for idx, ch in enumerate(row["chunks"]):
                rows.append({
                    **base,
                    "chunk_id": idx,
                    "text_chunk": ch
                })

        return pd.DataFrame(rows)

    def _embed(self, texts):
        if not texts:
            return np.array([])

        emb = self.embeddings_model.embed_documents(texts)
        return np.array(emb, dtype=np.float32)

    def build(self, df: pd.DataFrame):
        self.chunks_df = self._flatten_chunks(df)
        texts = self.chunks_df["text_chunk"].tolist()
        self.embeddings = self._embed(texts)
        self.embedding_dim = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(self.embedding_dim)
        self.index.add(self.embeddings)
        return self

    def search(self, query: str, top_k: int = 5):
        if self.index is None:
            raise ValueError("Индекс пуст. Сначала вызови build().")

        q_emb = self._embed([query])
        distances, indices = self.index.search(q_emb, top_k)

        results = []
        for dist, idx in zip(distances[0], indices[0]):
            row = self.chunks_df.iloc[int(idx)]
            results.append({
                "distance": float(dist),
                "chunk_id": int(row["chunk_id"]),
                "text_chunk": row["text_chunk"],
                "article": {
                    "id": row["id"],
                    "title": row["title"],
                    "authors": row["authors"],
                    "categories": row["categories"],
                    "published": row["published"],
                    "abstract": row["abstract"],
                    "pdf_url": row["pdf_url"]
                }
            })

        return results


In [9]:
from google import genai


class LLMClient:

    def __init__(self, api_key: str, model: str = "gemini-2.5-flash"):
        self.client = genai.Client(api_key=api_key)
        self.model = model

    def ask(self, prompt: str) -> str:

        response = self.client.models.generate_content(
            model=self.model,
            contents=f'"role": "user", "content": "{prompt}"'
        )
        return response.text


In [10]:
class ArxivQA:

    def __init__(self, vector_pipeline, llm_client, top_k=5):
        self.vec = vector_pipeline
        self.llm = llm_client
        self.top_k = top_k

    def answer(self, query: str) -> str:
        hits = self.vec.search(query, top_k=self.top_k)
        context = "\n\n".join(chunk["text_chunk"] for chunk in hits)

        prompt = f"""
You are a smart assistant. Here is the context from scientific articles:

{context}

Now answer the user's question:
{query}

Answer clearly and concisely.
        """

        return self.llm.ask(prompt)


In [11]:
from rapidfuzz import process, fuzz
import numpy as np


class SearchEngine:

    def __init__(self, vector_pipeline, llm_client):
        self.vec = vector_pipeline          # ArxivVectorPipeline
        self.llm = llm_client               # LLMClient
        self._build_spell_corpus()

    def _build_spell_corpus(self):
        words = set()
        for txt in self.vec.chunks_df["text_chunk"]:
            for w in txt.lower().split():
                if w.isalpha():
                    words.add(w)
        self.corpus_words = list(words)

    def correct_query(self, query: str) -> str:
        corrected = []
        for w in query.split():
            best = process.extractOne(w, self.corpus_words, scorer=fuzz.ratio)
            if best and best[1] > 70:
                corrected.append(best[0])
            else:
                corrected.append(w)
        return " ".join(corrected)

    def vector_search(self, query: str, top_k: int = 5):
        return self.vec.search(query, top_k)

    def build_prompt(self, query: str, context: str, template=None):
        if template is None:
            template = """
User query: "{query}"

Use ONLY this context:
-----------------
{context}
-----------------

Answer clearly and factually.
"""
        return template.format(query=query, context=context)

    def ask_rag(self, query: str, template=None, top_k=5):
        corrected = self.correct_query(query)
        hits = self.vector_search(corrected, top_k)
        context = "\n\n".join(h["text_chunk"] for h in hits)
        prompt = self.build_prompt(corrected, context, template)
        return self.llm.ask(prompt)

    def ask_vanilla(self, query: str):
        return self.llm.ask(query)


In [12]:
fetcher = ArxivTopicFetcher()
articles = fetcher.fetch_by_category("machine_learning", max_results=2000)
fetcher.print_summary(articles, n=3)

Запрос: cat:cs.LG
Получено статей: 2000

=== Короткий обзор ===

1. ThetaEvolve: Test-time Learning on Open Problems
   Авторы: Yiping Wang, Shao-Rong Su, Zhiyuan Zeng и др.
   Дата: 2025-11-28
   ID: 2511.23473v1
   Категории: cs.LG, cs.CL
   Абстракт: Recent advances in large language models (LLMs) have enabled breakthroughs in mathematical discovery, exemplified by AlphaEvolve, a closed-source system that evolves programs to improve bounds on open ...

2. SmallWorlds: Assessing Dynamics Understanding of World Models in Isolated Environments
   Авторы: Xinyi Li, Zaishuo Xia, Weyl Lu и др.
   Дата: 2025-11-28
   ID: 2511.23465v1
   Категории: cs.LG
   Абстракт: Current world models lack a unified and controlled setting for systematic evaluation, making it difficult to assess whether they truly capture the underlying rules that govern environment dynamics. In ...

3. The Price of Progress: Algorithmic Efficiency and the Falling Cost of AI Inference
   Авторы: Hans Gundlach, Jayson Lync

In [13]:
chunker = TextChunker(chunk_size=150, overlap=15)

df = chunker.to_dataframe(articles)

print(df.head())
print(df["chunks"].iloc[0][:2])

             id                                              title  \
0  2511.23473v1   ThetaEvolve: Test-time Learning on Open Problems   
1  2511.23465v1  SmallWorlds: Assessing Dynamics Understanding ...   
2  2511.23455v1  The Price of Progress: Algorithmic Efficiency ...   
3  2511.23449v1  Physics-Informed Neural Networks for Thermophy...   
4  2511.23443v1  Provable Benefits of Sinusoidal Activation for...   

                                             authors   published  \
0  Yiping Wang, Shao-Rong Su, Zhiyuan Zeng, Eva X...  2025-11-28   
1  Xinyi Li, Zaishuo Xia, Weyl Lu, Chenjie Hao, Y...  2025-11-28   
2  Hans Gundlach, Jayson Lynch, Matthias Mertens,...  2025-11-28   
3                         Ali Waseem, Malcolm Mielle  2025-11-28   
4                         Tianlong Huang, Zhiyuan Li  2025-11-28   

                   categories                             pdf_url  \
0                cs.LG, cs.CL  https://arxiv.org/pdf/2511.23473v1   
1                       cs.LG  h

In [14]:
stats = chunker.chunk_statistics(df)
print(stats)


=== Chunking Statistics ===
Total articles: 2000
Total chunks: 3688
Min chunks per article: 1
Max chunks per article: 3
Mean chunks per article: 1.844
Median chunks per article: 2.0
{'Total articles': 2000, 'Total chunks': 3688, 'Min chunks per article': 1, 'Max chunks per article': 3, 'Mean chunks per article': 1.844, 'Median chunks per article': 2.0}


In [18]:
# Full pipeline
pipeline = ArxivVectorPipeline(device="cpu")
pipeline.build(df)

results = pipeline.search("reinforcement learning for robots", top_k=5)

for r in results:
    print("\n---")
    print("Distance:", r["distance"])
    print("Chunk:", r["text_chunk"][:200], "…")
    print("Article title:", r["article"]["title"])
    print("ID:", r["article"]["id"])


---
Distance: 0.8626267910003662
Chunk: approaches may hold promise in solving offline reinforcement learning using continuous-time optimal control. …
Article title: Operator Models for Continuous-Time Offline Reinforcement Learning
ID: 2511.10383v1

---
Distance: 0.8880864381790161
Chunk: We study how vision-language-action (VLA) models can improve through real-world deployments via reinforcement learning (RL). We present a general-purpose method, RL with Experience and Corrections via …
Article title: $π^{*}_{0.6}$: a VLA That Learns From Experience
ID: 2511.14759v2

---
Distance: 0.9049038290977478
Chunk: We propose a refinement of temporal-difference learning that enforces first-order Bellman consistency: the learned value function is trained to match not only the Bellman targets in value but also the …
Article title: First-order Sobolev Reinforcement Learning
ID: 2511.19165v1

---
Distance: 0.9419611096382141
Chunk: The field of Offline Reinforcement Learning (RL) aims to derive 

In [19]:
llm = LLMClient(api_key="AIzaSyAQDkTa1BshAi4NMnDTasTZgbmijWdBA8w")

summary = llm.ask(
    f"Сделай короткий хардкорный конспект статьи: {articles[0]['abstract']}"
)

print(summary)


**ThetaEvolve: Хардкорный конспект**

ThetaEvolve — открытый фреймворк для математических открытий, расширяющий и упрощающий закрытую систему AlphaEvolve.

**Ключевые инновации и отличия:**
*   **RL во время инференса (RL at test-time):** Позволяет одной LLM непрерывно обучаться и интернализировать стратегии улучшения открытых оптимизационных задач, в отличие от чисто инференсного AlphaEvolve.
*   **Архитектура:** Использует **одну LLM** (вместо ансамблей), что значительно упрощает систему.
*   **Масштабирование и эффективность:**
    *   Большая база программ для расширенной эксплорации.
    *   Пакетная выборка для высокой пропускной способности.
    *   "Ленивые" штрафы для предотвращения стагнации.
    *   Формирование вознаграждения для стабильного обучения RL.

**Результаты:**
*   **Прорыв:** Первая открытая система, позволившая **небольшой открытой LLM** (DeepSeek-R1-0528-Qwen3-8B) установить **новые, лучшие известные границы** для открытых математических проблем (упаковка круго

In [21]:
# 4. LLM
llm_model = LLMClient(api_key="AIzaSyAQDkTa1BshAi4NMnDTasTZgbmijWdBA8w")

# 5. QA система
qa = ArxivQA(pipeline, llm_model)

# 6. Вопрос
ans = qa.answer("How do data augmentation techniques improve the generalization of machine learning models?")
print(ans)


Data augmentation techniques improve the generalization of machine learning models by:

1.  **Increasing the effective size and diversity of the training data:** This exposes the model to a wider range of variations (e.g., jittering, scaling, warping), helping it learn more robust features.
2.  **Fostering noise invariance:** By exposing the model to perturbed data, it learns to be less sensitive to minor variations and noise, making it more robust.
3.  **Preserving invariant relationships and governing processes:** Guided augmentation ensures that the model learns fundamental patterns rather than spurious local ones, allowing it to recover original processes from augmented data.
4.  Ultimately, this leads to improved predictions on completely unseen regions and makes the model's evaluated performance more reflective of its true generalization capabilities.


In [22]:
search = SearchEngine(pipeline, llm_model)

resp = search.ask_rag("reinforment learnig for robtos")
print(resp)

Reinforcement learning (RL) is used to improve Vision-Language-Action (VLA) models through real-world deployments. A method named RL with Experience and Corrections via Advantage-conditioned Policies (RECAP) provides for RL training of VLAs via advantage conditioning.

RECAP incorporates heterogeneous data, including demonstrations, data from on-policy collection, and expert teleoperated interventions, into the self-improvement process. It begins by pre-training a generalist VLA with offline RL, referred to as $π^{*}_{0.6}$, which can then be specialized for downstream tasks through on-robot data collection.

The $π^{*}_{0.6}$ model, when trained with the full RECAP method, has demonstrated the ability to:
*   Fold laundry in real homes.
*   Reliably assemble boxes.
*   Make espresso drinks using a professional espresso machine.

On some of the most challenging tasks, RECAP has been shown to more than double task throughput and roughly halve the task failure rate.


In [23]:
q = "What are advancements in model-based RL?"

print("=== Vanilla LLM ===")
print(search.ask_vanilla(q))

print("\n=== RAG ===")
print(search.ask_rag(q))


=== Vanilla LLM ===
Model-based Reinforcement Learning (MBRL) has seen significant advancements in recent years, addressing many of its historical challenges and making it competitive with, and in many cases superior to, model-free methods, especially in terms of sample efficiency.

The core idea of MBRL is to learn a model of the environment's dynamics (how states change given actions) and/or rewards, and then use this model for planning, policy improvement, or generating synthetic experience.

Here are the key advancements:

1.  **Learning More Powerful and Accurate Models (Deep Dynamics Models):**
    *   **Neural Network-based Dynamics:** Replacing traditional tabular or simple linear models with deep neural networks (DNNs) has allowed MBRL to tackle high-dimensional, complex environments (e.g., from raw pixels). Recurrent Neural Networks (RNNs), Transformers, and other sequence models are now commonly used to predict future states from sequences of past observations and actions.
 

In [24]:
prompt_1 = """
Provide a scientific explanation grounded strictly in the supplied corpus.

Query: "{query}"

Base your answer ONLY on the information in:
{context}

If details are absent in the corpus, respond that no relevant evidence is present.
"""
print(search.ask_rag("how transformer-based models compress high-dimensional embeddings", template=prompt_1))


The provided corpus suggests that "meaningful computations reside in compact subspaces." This implies that within transformer models, computations might effectively operate on representations that are reduced in dimensionality or are more concise.

However, the corpus does not explicitly detail the specific mechanisms of how transformer models compress dimensional embeddings.


In [25]:
prompt_2 = """
Your task is to extract factual statements from the given context
and use only those facts to answer the user’s question.

Question: "{query}"

Relevant extracted facts must come solely from:
{context}

Do not infer or extend beyond what is explicitly stated.
"""
print(search.ask_rag("why normalization of embeddings affects similarity search accuracy", template=prompt_2))


Based on the provided text, the specific reason *why* normalization of embeddings affects similarity search accuracy is not explicitly stated.

However, the text does state the following related facts:
*   Current text embedding models produce outputs with a consistent bias, $μ$, across all sentences.
*   Renormalization is a solution that involves subtracting this bias ($μ$) from the embedding vector $e$.
*   Renormalization consistently and statistically significantly improves the performance of existing models on retrieval tasks (a type of similarity search).


In [26]:
prompt_3 = """
Explain the answer with simple analogies suitable for a newcomer,
but rely strictly on data from the context.

Question: "{query}"

Context to use:
{context}

If the context lacks information needed for the answer,
state that the corpus does not cover this topic.
"""
print(search.ask_rag("trade-offs between large pretrained models and lightweight fine-tuned models", template=prompt_3))


Imagine you have two types of smart helpers:

**1. The "Encyclopedic Generalist" (Large Pretrained Models):**

*   **What it is:** Think of this as a vast, highly knowledgeable professor who has studied almost everything (it has "strong reasoning and tool-use skills"). It knows a little bit about every topic.
*   **Tradeoffs:**
    *   **Pro:** It's very general and can understand many different types of problems.
    *   **Con (Size & Cost):** This professor is so big and takes up so much space in your office (high "computational demands") that it's "impractical for edge or cost-sensitive deployments." You can't just put it on a small phone or device.
    *   **Con (Specialized Performance):** Even though it's huge, on very specific tasks (like medical operations for hospitals), smaller, specialized helpers can "outperform" it, sometimes by a lot (up to 70x larger generalist models were outperformed by Lang1-1B).
    *   **Con (Misalignment Risk):** If you try to teach this big profes

In [27]:
prompt_4 = """
Compose a brief analytical report (3–4 sentences)
based exclusively on the provided material.

Question: "{query}"

Use ONLY the content below:
{context}

Avoid adding external knowledge or assumptions.
"""
print(search.ask_rag("how synthetic data generation impacts model generalization", template=prompt_4))


Synthetic Data Generation (SDG) positively impacts model generalization, particularly in scenarios with scarce or poor-quality real-world data, which otherwise lead to poor generalization. Fine-tuning models on synthetic data can enable even smaller models to become universal structured generation models that rival the performance of larger counterparts. Advanced methods like Causal Generative Models (CGMs) enhance generalization by creating synthetic datasets that preserve underlying causal relationships, leading to more reliable models. This directly addresses the issue of models being prone to poor generalization and failing in real-world scenarios due to a lack of representative data.
