In [1]:
import io
import zipfile
import requests
import frontmatter

doc_extensions = {'md', 'mdx'}
code_extensions = {'py', 'sql', 'java', 'ipynb'}
extensions = doc_extensions | code_extensions

def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown and code files from a GitHub repository.
    
    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
    
    Returns:
        List of dictionaries containing file content and metadata
    """ 
    url = f'https://github.com/{repo_owner}/{repo_name}/archive/refs/heads/main.zip'
    resp = requests.get(url)
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    for file_info in zf.infolist():
        filepath = file_info.filename
        filepath_lower = filepath.lower()

        if filepath_lower.endswith('/'):
            continue

        filename = filepath_lower.split('/')[-1]

        if filename.startswith('.'):
            continue

        ext = filename.split('.')[-1]

        if ext not in extensions:
            continue

        filepath_edited = filepath.split('/', maxsplit=1)[1]

        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                if ext in doc_extensions:
                    post = frontmatter.loads(content)
                    data = post.to_dict()
                    data['filename'] = filepath_edited
                elif ext in code_extensions:
                    data = {
                        'code': True,
                        'content': content,
                        'filename': filepath_edited
                    }

                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue

    zf.close()
    return repository_data

In [12]:
autogen_data = read_repo_data('microsoft', 'autogen')

# This will print the number of files you successfully processed
print(f'Processed {len(autogen_data)} files from microsoft/autogen.')

Processed 749 files from microsoft/autogen.


In [3]:
index = {}

for record in autogen_data:
    index[record['filename']] = record

In [4]:
import nbformat
from nbconvert import MarkdownExporter
from nbconvert.preprocessors import ClearOutputPreprocessor

exporter = MarkdownExporter()
exporter.register_preprocessor(ClearOutputPreprocessor(), enabled=True)

def format_notebook_as_md(raw_notebook: str) -> str:
    nb_parsed = nbformat.reads(
        raw_notebook,
        as_version=nbformat.NO_CONVERT,
    )
    md_body, _ = exporter.from_notebook_node(nb_parsed)
    return md_body

In [5]:
def strip_code_fence(text: str) -> str:
    text = text.strip()

    if not text.startswith("```"):
        return text

    lines = text.splitlines()
    lines = lines[1:]

    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]

    return "\n".join(lines)

In [10]:
from dotenv import load_dotenv
from openai import OpenAI

# This line loads the variables from your .env file
load_dotenv()

# Now the OpenAI client will be able to find the key in your environment
openai_client = OpenAI()

# You can also explicitly pass the key if you prefer
# import os
# openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [11]:
def llm(instructions, content, model='gpt-4o-mini'):
    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": content}
    ]

    response = openai_client.responses.create(
        model='gpt-4o-mini',
        input=messages,
    )

    return response.output_text

In [12]:
notebook_editing_instructions = """
You're a professional coding editor.

You are given a Markdown file that was converted from a Jupyter notebook.  
The file already contains code blocks and inline comments.  

Your task:

- Turn it into clear, well-structured documentation.  
- Add section headers (##) where appropriate. Keep sections relatively large (8-10 paragraphs and code blocks)
- Add concise, high-level explanations for each code block.  
- Summarize what the code is doing without being overly verbose.  
- Keep the formatting in Markdown.
- Aim for a balance: clear enough to guide someone new, but not overloaded with detail. 

Output the improved Markdown file with the new documentation.
""".strip()

code_doc_instructions = """
You are given a piece of source code.  

Your task:  
- Analyze the code and produce a clear, high-level description of what it does.  
- If the code defines functions, methods, or classes, describe their purpose and role.  
- If it’s just a script without explicit functions/classes, summarize what the script does step by step at a high level.  
- Add logical sections or headings (##) if needed. Sections must be relatively large (8-10 paragraphs and code blocks)
- Keep explanations concise and clear — avoid unnecessary verbosity.  
- Output the result in Markdown, structured like documentation.  
- Do not rewrite or modify the code itself, only provide descriptive documentation.
""".strip()

In [13]:
# # First, open the file and read its content
# with open('data-processing-code.ipynb', 'r', encoding='utf-8') as f:
#     raw_notebook_content = f.read()

# result = llm(notebook_editing_instructions, md_body)
# print(result)
# #result = llm(system_prompt, md_body)

In [14]:
from tqdm.auto import tqdm

In [15]:
ipynb_data = []

for record in autogen_data:
    if record.get('code') == True and record['filename'].endswith('.ipynb'):
        ipynb_data.append(record)


print(f'processing {len(ipynb_data)} jupyter notebooks...')

for record in tqdm(ipynb_data):
    md_body = format_notebook_as_md(record['content'])
    new_content = llm(notebook_editing_instructions, md_body)
    new_content = strip_code_fence(new_content)
    record['content'] = new_content
    record['code'] = False

processing 49 jupyter notebooks...


  0%|          | 0/49 [00:00<?, ?it/s]

In [16]:
code_data = []

for record in autogen_data:
    if record.get('code') != True:
        continue

    path = record['filename']
    ext = path.split('.')[-1]

    if ext not in code_extensions:
        continue

    if ext == 'ipynb':
        continue

    # print(path)
    code_data.append(record)

print(f'processing {len(code_data)} code files...')

processing 539 code files...


In [17]:
for record in tqdm(code_data):
    code = record['content']

    new_content = llm(code_doc_instructions, code)
    new_content = strip_code_fence(new_content)

    record['content'] = new_content
    record['code'] = False

  0%|          | 0/539 [00:00<?, ?it/s]

In [18]:
import json

In [19]:
!mkdir data

In [21]:
output_file = 'data/autogen_data_processed.json'

with open(output_file, 'w', encoding='utf-8') as f_out:
    json.dump(autogen_data, f_out, indent=2)

In [22]:
!head data/autogen_data_processed.json

[
  {
    "content": "<!-- Thank you for your contribution! Please review https://microsoft.github.io/autogen/docs/Contribute before opening a pull request. -->\n\n<!-- Please add a reviewer to the assignee section when you create a PR. If you don't have the access to it, we will shortly find a reviewer and assign them to your PR. -->\n\n## Why are these changes needed?\n\n<!-- Please give a short summary of the change and the problem this solves. -->\n\n## Related issue number\n\n<!-- For example: \"Closes #1234\" -->\n\n## Checks\n\n- [ ] I've included any doc changes needed for <https://microsoft.github.io/autogen/>. See <https://github.com/microsoft/autogen/blob/main/CONTRIBUTING.md> to build and test documentation locally.\n- [ ] I've added tests (if relevant) corresponding to the changes introduced in this PR.\n- [ ] I've made sure all auto checks have passed.",
    "filename": ".github/PULL_REQUEST_TEMPLATE.md"
  },
  {
    "filename": ".github/copilot-instructions.md"
  },
  {


In [23]:
def sliding_window(seq, size, step):
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk = seq[i:i+size]
        result.append({'start': i, 'chunk': chunk})
        if i + size >= n:
            break

    return result

In [24]:
autogen_data_chunks = []

for doc in autogen_data:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    autogen_data_chunks.extend(chunks)

In [25]:
len(autogen_data_chunks)

3046

In [26]:
autogen_data_chunks[100]

{'start': 0,
 'chunk': "## Prerequisites\n\n- Access to gpt3.5-turbo or preferably gpt4 - [Get access here](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview#how-do-i-get-access-to-azure-openai)\n- [Setup a Github app](#how-do-i-setup-the-github-app)\n- [Install the Github app](https://docs.github.com/en/apps/using-github-apps/installing-your-own-github-app)\n- [Provision the azure resources](#how-do-I-deploy-the-azure-bits)\n- [Create labels for the dev team skills](#which-labels-should-i-create)\n\n### How do I setup the Github app?\n\n- [Register a Github app](https://docs.github.com/en/apps/creating-github-apps/registering-a-github-app/registering-a-github-app), with the options listed below:\n    - Give your App a name and add a description\n    - Homepage URL: Can be anything (Example: repository URL)\n    - Add a dummy value for the webhook url, we'll come back to this setting\n    - Enter a webhook secret, which you'll need later on when filling in the `Webhoo

### Day 3

In [3]:
import json

input_file = 'data/autogen_data_processed.json'
autogen_data_chunk = None # Initialize the variable

try:
    # Open the file in 'r' (read) mode
    with open(input_file, 'r', encoding='utf-8') as f_in:
        # Use json.load() to read the JSON data from the file object (f_in)
        autogen_data_chunk = json.load(f_in)
        
    print(f"Successfully loaded data from {input_file}")
    # You can now work with autogen_data_chunk
    # print(type(autogen_data_chunk)) 
    # print(autogen_data_chunk)
    
except FileNotFoundError:
    print(f"Error: The file {input_file} was not found.")
except json.JSONDecodeError:
    print(f"Error: Failed to decode JSON from {input_file}. The file might be malformed.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully loaded data from data/autogen_data_processed.json


#### TEXT SEARCH

In [4]:
from minsearch import Index

index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)

index.fit(autogen_data_chunk)


<minsearch.minsearch.Index at 0x7fbfbf5e4fd0>

In [6]:
query = 'what does interface definitions and reference implementations of agent runtime, model, tool, workbench, memory, tracing'
query2 = 'tell me about packages/autogen-core'
results = index.search(query2)

In [7]:
# View the first 5 results
top_results = results[:5]

# Print the subset (if the results are already string/dict)
print(top_results)

[{'content': '# AutoGen Core\n\n- [Documentation](https://microsoft.github.io/autogen/stable/user-guide/core-user-guide/index.html)\n\nAutoGen core offers an easy way to quickly build event-driven, distributed, scalable, resilient AI agent systems. Agents are developed by using the [Actor model](https://en.wikipedia.org/wiki/Actor_model). You can build and run your agent system locally and easily move to a distributed system in the cloud when you are ready.', 'filename': 'python/packages/autogen-core/README.md'}, {'code': False, 'content': '# Overview of `autogen_core`\n\nThe `autogen_core` module serves as the foundation for a system centered around agents, interoperability, and component management. This module facilitates the creation and management of agents, provides tools for data serialization, defines message handling protocols, and manages logging functionalities. By importing various submodules and defining global constants and classes, it offers an organized layout for the i

#### VECTOR SEARCH
uv add sentence-transformers

In [9]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')


model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

The multi-qa-distilbert-cos-v1 model is trained explicitly for question-answering tasks. It creates embeddings optimized for finding answers to questions.
Other popular models include:
* all-MiniLM-L6-v2 - General-purpose, fast, and efficient
* all-mpnet-base-v2 - Higher quality, slower
* Check Sentence Transformers documentation for more options.

In [None]:
autogen_data = read_repo_data('microsoft', 'autogen')

In [18]:
# Check the keys of the failing record
print(autogen_data[2].keys()) 

dict_keys(['content', 'filename'])


In [15]:
autogen_data = read_repo_data('microsoft', 'autogen')

#autogen_data = [d for d in dtc_faq if 'data-engineering' in d['filename']]

aut_index = Index(
    text_fields=["filename", "content"],
    keyword_fields=[]
)

aut_index.fit(autogen_data)


<minsearch.minsearch.Index at 0x7fbeb3439ff0>

In [19]:
record = autogen_data[2]
text = record['filename'] + ' ' + record['content'] 
v_doc = embedding_model.encode(text)


In [20]:
query = 'what checks must be met to run a PR?'
v_query = embedding_model.encode(query)


In [21]:
similarity = v_query.dot(v_doc)

In [24]:
from tqdm.auto import tqdm
import numpy as np

aut_embeddings = []

for d in tqdm(autogen_data):
    text = d['filename'] + ' ' + d['content']
    v = embedding_model.encode(text)
    aut_embeddings.append(v)

aut_embeddings = np.array(aut_embeddings)


  0%|          | 0/749 [00:00<?, ?it/s]

###### vector search usage

In [26]:
from minsearch import VectorSearch

aut_vindex = VectorSearch()
aut_vindex.fit(aut_embeddings, autogen_data)


<minsearch.vector.VectorSearch at 0x7fbe944a0700>

In [27]:
query = 'what checks must be met to run a PR?'
q = embedding_model.encode(query)
results = aut_vindex.search(q)


In [29]:
# Run this line to see what keys are actually available
if autogen_data_chunk:
    print("Available keys in the first document:")
    print(autogen_data_chunk[0].keys())
else:
    print("autogen_data_chunk is empty.")

Available keys in the first document:
dict_keys(['content', 'filename'])


In [30]:
autogen_embeddings = []

for d in tqdm(autogen_data_chunk):
    v = embedding_model.encode(d['content'])
    autogen_embeddings.append(v)

autogen_embeddings = np.array(autogen_embeddings)

autogen_vindex = VectorSearch()
autogen_vindex.fit(autogen_embeddings, autogen_data_chunk)


  0%|          | 0/749 [00:00<?, ?it/s]

<minsearch.vector.VectorSearch at 0x7fbe303dfdf0>

#### HYBRID SEARCH

In [33]:
# 1. Text Search (Keyword Matching) - Uses the text-based index
query = 'what checks must be met to run a PR?'
text_results = aut_index.search(query, num_results=5) 

# 2. Vector Search (Semantic Matching) - Uses the vector-based index
q = embedding_model.encode(query)
# --- FIX IS HERE ---
vector_results = autogen_vindex.search(q, num_results=5) 

# 3. Combine results
final_results = text_results + vector_results

print(f"Hybrid search returned {len(final_results)} results.")

Hybrid search returned 10 results.


In [34]:
final_results = final_results[:5]
print(final_results)

[{'content': '<!-- Thank you for your contribution! Please review https://microsoft.github.io/autogen/docs/Contribute before opening a pull request. -->\n\n<!-- Please add a reviewer to the assignee section when you create a PR. If you don\'t have the access to it, we will shortly find a reviewer and assign them to your PR. -->\n\n## Why are these changes needed?\n\n<!-- Please give a short summary of the change and the problem this solves. -->\n\n## Related issue number\n\n<!-- For example: "Closes #1234" -->\n\n## Checks\n\n- [ ] I\'ve included any doc changes needed for <https://microsoft.github.io/autogen/>. See <https://github.com/microsoft/autogen/blob/main/CONTRIBUTING.md> to build and test documentation locally.\n- [ ] I\'ve added tests (if relevant) corresponding to the changes introduced in this PR.\n- [ ] I\'ve made sure all auto checks have passed.', 'filename': '.github/PULL_REQUEST_TEMPLATE.md'}, {'content': '# Contributing\n\nThe project welcomes contributions from devel

#### PUTTING IT ALL TOGETHER

In [35]:
def text_search(query):
    return aut_index.search(query, num_results=5)

def vector_search(query):
    q = embedding_model.encode(query)
    return aut_vindex.search(q, num_results=5)

def hybrid_search(query):
    text_results = text_search(query)
    vector_results = vector_search(query)
    
    # Combine and deduplicate results
    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        if result['filename'] not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)
    
    return combined_results


### Day 4

#### No Tool

In [3]:
from dotenv import load_dotenv
from openai import OpenAI

# This line loads the variables from your .env file
load_dotenv()

# Now the OpenAI client will be able to find the key in your environment
openai_client = OpenAI()

#user_prompt = "I just discovered the course, can I join now?"
user_prompt = 'what checks must be met to run a PR?'

chat_messages = [
    {"role": "user", "content": user_prompt}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
)

print(response.output_text)

To run a pull request (PR), several checks and criteria typically need to be met. These can vary by organization, workflow, or version control system, but common checks include:

1. **Code Review**: At least one (or more) reviewers must approve the changes in the PR.

2. **Continuous Integration (CI) Checks**: Automated tests must pass, including:
   - Unit tests
   - Integration tests
   - End-to-end tests

3. **Static Code Analysis**: Code quality checks must be conducted, including:
   - Linter checks
   - Security scans
   - Code formatting

4. **Branch Policies**: The PR needs to comply with branch protection rules, such as:
   - No direct commits to the main branch
   - Mandatory PR for merging changes

5. **Conflict Resolution**: The PR must be free of merge conflicts with the target branch.

6. **Documentation Updates**: If changes require updates to documentation, those should be included or noted.

7. **Commit Standards**: Commits should follow a specific message format or sq

#### Function Calling with OpenAI

Cannot pass this function to openai llm hence we describe the capability of the function using json

from minsearch import Index

aut_index = Index(
    text_fields=["filename", "content"],
    keyword_fields=[]
)

def text_search(query):
    return aut_index.search(query, num_results=5)

In [7]:

text_search_tool = {
    "type": "function",
    "name": "text_search",
    "description": "Search the autogen github repository",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the autogen github repository."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }
}

system_prompt = """
You are a helpful assistant for a course. 
"""

#question = 'what checks must be met to run a PR?'
question = 'what checks must be met to run a PR in the **AutoGen** repository?'
# OR
# question3 = 'How do I check the build status for a PR in AutoGen?'

chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=[text_search_tool] # TOOL
)
print(response.output)

[ResponseFunctionToolCall(arguments='{"query":"PR checks AutoGen repository"}', call_id='call_ckhE8MhfepyMexexjHHVldQ0', name='text_search', type='function_call', id='fc_68d8699d5e10819f96df2ef3cadf8aa30e6f59c642a49773', status='completed')]


invoke the function

In [10]:
import json
from minsearch import Index


call = response.output[0]

aut_index = Index(
    text_fields=["filename", "content"],
    keyword_fields=[]
)

def text_search(query):
    return aut_index.search(query, num_results=5)
    
arguments = json.loads(call.arguments)
result = text_search(**arguments)

call_output = {
    "type": "function_call_output",
    "call_id": call.call_id,
    "output": json.dumps(result),
}


In [11]:
chat_messages.append(call)
chat_messages.append(call_output)

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=[text_search_tool]
)

print(response.output_text)

It seems I couldn't find specific information about the checks required to run a pull request (PR) in the AutoGen repository. However, common checks for PRs in many repositories typically include:

1. **Code Review**: Ensuring that other developers review the code changes.
2. **Automated Tests**: Running unit tests and integration tests to verify that the changes do not break existing functionality.
3. **Linting**: Code style checks to enforce style guidelines and best practices.
4. **CI/CD Pipeline**: Successful completion of continuous integration and deployment processes.
5. **Documentation**: Updates to any relevant documentation that may be affected by the changes.
6. **Issue Link**: Reference to a related issue in the repository to provide context for the changes.

For accurate and detailed information, it's best to check the AutoGen repository's CONTRIBUTING.md or relevant documentation. Would you like help finding or reviewing those specific files?


In [12]:
system_prompt = """
You are a helpful assistant for a course. 

Always search for relevant information before answering. 
If the first search doesn't give you enough information, try different search terms.

Make multiple searches if needed to provide comprehensive answers.
"""


In [14]:
from typing import List, Any

def text_search(query: str) -> List[Any]:
    """
    Perform a text-based search on the FAQ index.

    Args:
        query (str): The search query string.

    Returns:
        List[Any]: A list of up to 5 search results returned by the FAQ index.
    """
    return aut_index.search(query, num_results=5)

In [15]:
from pydantic_ai import Agent

agent = Agent(
    name="aut_agent",
    instructions=system_prompt,
    tools=[text_search],
    model='gpt-4o-mini'
)


In [16]:
#question = 'what checks must be met to run a PR?'
question = 'what checks must be met to run a PR in the **AutoGen** repository?'
# OR
# question3 = 'How do I check the build status for a PR in AutoGen?'

result = await agent.run(user_prompt=question)

In [17]:
print(result)

AgentRunResult(output="It seems I wasn't able to find specific information regarding the checks that must be met to run a pull request (PR) in the AutoGen repository through a search in the available resources. \n\nFor the most accurate and up-to-date information, I recommend checking the AutoGen repository directly on GitHub. Typically, PRs may have checks related to:\n\n1. **Code Review**: A certain number of approvals from maintainers or team members.\n2. **Continuous Integration**: Automated tests must pass (e.g., unit tests, integration tests).\n3. **Code Style**: Adherence to coding standards and formatting checks.\n4. **Documentation**: Any changes may require updates to documentation.\n5. **Issue Linking**: Linked issues or features that the PR addresses.\n\nIf specific checks are implemented in the AutoGen repository, they would usually be detailed in the repository's contribution guidelines or README file. Would you like assistance with something else?")


In [18]:
result.new_messages()

[ModelRequest(parts=[UserPromptPart(content='what checks must be met to run a PR in the **AutoGen** repository?', timestamp=datetime.datetime(2025, 9, 27, 22, 55, 40, 445625, tzinfo=datetime.timezone.utc))], instructions="You are a helpful assistant for a course. \n\nAlways search for relevant information before answering. \nIf the first search doesn't give you enough information, try different search terms.\n\nMake multiple searches if needed to provide comprehensive answers."),
 ModelResponse(parts=[ToolCallPart(tool_name='text_search', args='{"query":"AutoGen repository PR checks"}', tool_call_id='call_kHtsYNArGwL1xkIY0Z786BQ7')], usage=RequestUsage(input_tokens=152, output_tokens=18, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}), model_name='gpt-4o-mini-2024-07-18', timestamp=datetime.datetime(2025, 9, 27, 22, 55, 42, tzinfo=TzInfo(UTC)), provider_name='openai', provider_details={'finish_reason': 'tool_calls'},

#### HYBRID SEARCH

In [21]:
import json
from minsearch import Index, VectorSearch
from sentence_transformers import SentenceTransformer
from typing import List, Any
import numpy as np
from tqdm.auto import tqdm # Import tqdm for progress bar

# Assuming read_repo_data is defined elsewhere
# (Note: This is the data loading step)
autogen_data = read_repo_data('microsoft', 'autogen')

def sliding_window(seq, size, step):
    """Chunks text into overlapping sections."""
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")
    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk_content = seq[i:i+size]
        # Stores the chunked text under the 'content' key
        result.append({'start': i, 'content': chunk_content}) 
        if i + size >= n:
            break
    return result

# --- Data Preparation and Chunking ---
autogen_data_chunks = []

for doc in autogen_data:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        # Merge chunk content with document metadata (filename, etc.)
        chunk.update(doc_copy) 
    autogen_data_chunks.extend(chunks)

# --- 1. Text Search Index (minsearch Index) ---
# FIX: Define and fit the text index
aut_index = Index(
    text_fields=["filename", "content"],
    keyword_fields=[]
)
aut_index.fit(autogen_data_chunks)

# --- 2. Vector Search Index (minsearch VectorSearch) ---
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')
autogen_embeddings = []

# FIX: Use the correct list name: autogen_data_chunks
for d in tqdm(autogen_data_chunks):
    # d['content'] is the correct key holding the chunked text
    v = embedding_model.encode(d['content'])
    autogen_embeddings.append(v)

autogen_embeddings = np.array(autogen_embeddings)

autogen_vindex = VectorSearch()
# FIX: Use the correct list name: autogen_data_chunks
autogen_vindex.fit(autogen_embeddings, autogen_data_chunks)


# --- HYBRID SEARCH FUNCTION (The Agent's Tool) ---
def hybrid_search(query: str) -> List[Any]:
    """
    Performs a Hybrid Search combining Text (Keyword) and Vector (Semantic) search.
    This is the function the agent will call.
    """
    # 1. Text Search (Keyword matching)
    text_results = aut_index.search(query, num_results=5)
    
    # 2. Vector Search (Semantic matching)
    q_vector = embedding_model.encode(query)
    vector_results = autogen_vindex.search(q_vector, num_results=5)
    
    # 3. Combine and Deduplicate Results
    seen_filenames = set()
    combined_results = []

    for result in text_results + vector_results:
        # Deduplicate based on 'filename'
        if result['filename'] not in seen_filenames:
            seen_filenames.add(result['filename'])
            # Return only the essential information for the LLM
            combined_results.append({
                "filename": result['filename'],
                "content": result['content']
            })

    return combined_results


# --- AGENT SETUP AND EXECUTION ---

system_prompt = """
You are a helpful assistant for the AutoGen course.
Your sole purpose is to answer questions using the 'hybrid_search' tool, which queries the official AutoGen documentation.
Always use the tool before answering.
"""

# Assuming pydantic_ai and Agent are defined/imported correctly
from pydantic_ai import Agent

agent = Agent(
    name="autogen_agent",
    instructions=system_prompt,
    # FIX: Pass the HYBRID search function as the tool
    tools=[hybrid_search],
    model='gpt-4o-mini'
)

# Use the specific, enforced question to trigger the tool call
question = 'what checks must be met to run a PR in the **AutoGen** repository?'

result = await agent.run(user_prompt=question)
print(result)

  0%|          | 0/6951 [00:00<?, ?it/s]

AgentRunResult(output='To run a pull request (PR) in the AutoGen repository, the following checks must be met:\n\n1. **Documentation Changes**: You should include any necessary documentation changes for the project. Reference the [documentation](https://microsoft.github.io/autogen/) and ensure you can build and test it locally as described in the [CONTRIBUTING guide](https://github.com/microsoft/autogen/blob/main/CONTRIBUTING.md).\n\n2. **Tests**: If your changes introduce any new functionality or modify existing behavior, you should add corresponding tests.\n\n3. **All Auto Checks**: You must make sure that all automated checks have passed before the PR can be merged.\n\nThese checks ensure a certain level of quality and maintainability in the codebase before changes are integrated.')
