In [1]:
import io
import zipfile
import requests
import frontmatter

# Lowercase extensions (no leading dot) that read_repo_data parses as front-matter documents.
doc_extensions = {'md', 'mdx'}
# Lowercase extensions (no leading dot) that read_repo_data keeps as raw source text.
code_extensions = {'py', 'sql', 'java', 'ipynb'}
# Every extension read_repo_data accepts; files with any other extension are skipped.
extensions = doc_extensions | code_extensions

def read_repo_data(repo_owner, repo_name, branch='main', timeout=60):
    """
    Download and parse all markdown and code files from a GitHub repository.

    The branch archive is fetched as a zip file and read entirely in memory.
    Files whose lowercased name starts with a dot, or whose lowercased
    extension is not in ``extensions``, are skipped. A file that fails to
    parse is reported with ``print`` and skipped; it does not abort the run.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch whose archive is downloaded (defaults to 'main')
        timeout: Seconds passed to ``requests.get`` as its timeout

    Returns:
        List of dictionaries containing file content and metadata. Every
        record has a 'filename' key holding the path with the archive's
        top-level folder removed. Documentation files contribute the dict
        produced by ``frontmatter``; code files contribute
        ``{'code': True, 'content': <text>, 'filename': <path>}``.

    Raises:
        Exception: If the download does not return HTTP 200.
    """
    url = f'https://github.com/{repo_owner}/{repo_name}/archive/refs/heads/{branch}.zip'
    # An explicit timeout keeps a stalled connection from hanging the notebook.
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if iteration raises.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filepath = file_info.filename

            # Directory entries end with a slash and carry no content.
            if filepath.endswith('/'):
                continue

            # Filtering is done on the lowercased base name so 'README.MD' matches 'md'.
            filename = filepath.lower().split('/')[-1]

            if filename.startswith('.'):
                continue

            ext = filename.split('.')[-1]

            if ext not in extensions:
                continue

            # Drop the archive's top-level folder. An entry that has no folder
            # at all keeps its name instead of raising IndexError.
            _, separator, remainder = filepath.partition('/')
            filepath_edited = remainder if separator else filepath

            try:
                with zf.open(file_info) as f_in:
                    # Undecodable bytes are dropped rather than failing the file.
                    content = f_in.read().decode('utf-8', errors='ignore')

                if ext in doc_extensions:
                    data = frontmatter.loads(content).to_dict()
                    data['filename'] = filepath_edited
                else:
                    data = {
                        'code': True,
                        'content': content,
                        'filename': filepath_edited
                    }

                repository_data.append(data)
            except Exception as e:
                # Report the repository-relative path; base names such as
                # 'readme.md' are ambiguous across folders.
                print(f"Error processing {filepath_edited}: {e}")
                continue

    return repository_data

In [2]:
# Download the microsoft/autogen archive and parse its documentation and code files.
autogen_data = read_repo_data('microsoft', 'autogen')

# Report how many files were parsed into records.
print(f'Processed {len(autogen_data)} files from microsoft/autogen.')

Processed 749 files from microsoft/autogen.


In [3]:
# Map each record's 'filename' to the record itself; a repeated filename keeps the last record.
index = {record['filename']: record for record in autogen_data}

In [4]:
import nbformat
from nbconvert import MarkdownExporter
from nbconvert.preprocessors import ClearOutputPreprocessor

# Shared nbconvert exporter used to render notebooks as Markdown.
exporter = MarkdownExporter()
# Register the clear-output preprocessor so it runs on each notebook during export.
exporter.register_preprocessor(ClearOutputPreprocessor(), enabled=True)

def format_notebook_as_md(raw_notebook: str) -> str:
    """
    Convert the raw JSON text of a notebook into Markdown.

    Args:
        raw_notebook: Contents of an ``.ipynb`` file as a string

    Returns:
        The Markdown body produced by the module-level ``exporter``; the
        resources returned alongside it are discarded.
    """
    # NO_CONVERT keeps the notebook in whatever format version it was saved with.
    notebook_node = nbformat.reads(raw_notebook, as_version=nbformat.NO_CONVERT)
    rendered, _resources = exporter.from_notebook_node(notebook_node)
    return rendered

In [5]:
def strip_code_fence(text: str) -> str:
    """
    Remove a Markdown code fence that wraps ``text``.

    Surrounding whitespace is always trimmed. When the trimmed text begins
    with three backticks, its entire first line (the fence plus any language
    tag) is dropped, and a final line consisting only of three backticks is
    dropped as well. Text that does not begin with a fence is returned
    trimmed but otherwise unchanged.

    Args:
        text: Text that may be wrapped in a code fence

    Returns:
        The trimmed text without the wrapping fence lines
    """
    trimmed = text.strip()

    if trimmed[:3] != "```":
        return trimmed

    # Skip the opening fence line, whatever language tag follows the backticks.
    body = trimmed.splitlines()[1:]

    # Remove the closing fence only when it is the very last line.
    if body and body[-1].strip() == "```":
        body.pop()

    return "\n".join(body)

In [10]:
from dotenv import load_dotenv
from openai import OpenAI

# Load the variables defined in a local .env file so they are available to the code below.
load_dotenv()

# The client is created without arguments, so it has to find its credentials itself,
# presumably from the OPENAI_API_KEY environment variable (TODO confirm).
openai_client = OpenAI()

# Alternative: pass the key explicitly.
# import os
# openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [11]:
def llm(instructions, content, model='gpt-4o-mini'):
    """
    Send one system/user message pair to the OpenAI Responses API.

    Args:
        instructions: Text sent as the system message
        content: Text sent as the user message
        model: Name of the model that handles the request

    Returns:
        The ``output_text`` of the API response
    """
    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": content}
    ]

    # Forward the caller's model choice instead of a fixed model name.
    response = openai_client.responses.create(
        model=model,
        input=messages,
    )

    return response.output_text

In [12]:
# Prompt for rewriting notebook-derived Markdown as structured documentation.
# The trailing .strip() removes the newlines just inside the triple quotes.
notebook_editing_instructions = """
You're a professional coding editor.

You are given a Markdown file that was converted from a Jupyter notebook.  
The file already contains code blocks and inline comments.  

Your task:

- Turn it into clear, well-structured documentation.  
- Add section headers (##) where appropriate. Keep sections relatively large (8-10 paragraphs and code blocks)
- Add concise, high-level explanations for each code block.  
- Summarize what the code is doing without being overly verbose.  
- Keep the formatting in Markdown.
- Aim for a balance: clear enough to guide someone new, but not overloaded with detail. 

Output the improved Markdown file with the new documentation.
""".strip()

# Prompt for describing a source file in Markdown without modifying the code itself.
# The trailing .strip() removes the newlines just inside the triple quotes.
code_doc_instructions = """
You are given a piece of source code.  

Your task:  
- Analyze the code and produce a clear, high-level description of what it does.  
- If the code defines functions, methods, or classes, describe their purpose and role.  
- If it’s just a script without explicit functions/classes, summarize what the script does step by step at a high level.  
- Add logical sections or headings (##) if needed. Sections must be relatively large (8-10 paragraphs and code blocks)
- Keep explanations concise and clear — avoid unnecessary verbosity.  
- Output the result in Markdown, structured like documentation.  
- Do not rewrite or modify the code itself, only provide descriptive documentation.
""".strip()

In [13]:
# # First, open the file and read its content
# with open('data-processing-code.ipynb', 'r', encoding='utf-8') as f:
#     raw_notebook_content = f.read()

# result = llm(notebook_editing_instructions, md_body)
# print(result)
# #result = llm(system_prompt, md_body)

In [14]:
from tqdm.auto import tqdm

In [15]:
# Notebook records that still hold raw notebook JSON.
ipynb_data = [
    record
    for record in autogen_data
    # read_repo_data selects files by lowercased extension, so match the
    # suffix case-insensitively; otherwise 'Demo.IPYNB' would never be converted.
    if record.get('code') is True and record['filename'].lower().endswith('.ipynb')
]


print(f'processing {len(ipynb_data)} jupyter notebooks...')

for record in tqdm(ipynb_data):
    # Notebook JSON -> Markdown -> LLM-edited documentation.
    md_body = format_notebook_as_md(record['content'])
    new_content = llm(notebook_editing_instructions, md_body)
    new_content = strip_code_fence(new_content)
    # The record is updated in place. Clearing 'code' marks it as documentation,
    # so re-running this cell does not send it to the LLM again.
    record['content'] = new_content
    record['code'] = False

processing 49 jupyter notebooks...


  0%|          | 0/49 [00:00<?, ?it/s]

In [16]:
# Code records, other than notebooks, that still hold raw source text.
code_data = []

for record in autogen_data:
    if record.get('code') is not True:
        continue

    path = record['filename']
    # read_repo_data selects files by lowercased extension, so compare the
    # same way here; otherwise 'Foo.PY' would be loaded but never documented.
    ext = path.lower().split('.')[-1]

    if ext not in code_extensions:
        continue

    # Notebooks are converted by the separate notebook loop.
    if ext == 'ipynb':
        continue

    code_data.append(record)

print(f'processing {len(code_data)} code files...')

processing 539 code files...


In [17]:
# Replace the raw source of every code record with LLM-written documentation.
for record in tqdm(code_data):
    documentation = llm(code_doc_instructions, record['content'])
    record['content'] = strip_code_fence(documentation)
    # The record no longer holds raw code, so clear its flag.
    record['code'] = False

  0%|          | 0/539 [00:00<?, ?it/s]

In [18]:
import json

In [19]:
!mkdir data

In [21]:
output_file = 'data/autogen_data_processed.json'

with open(output_file, 'w', encoding='utf-8') as f_out:
    # Front-matter metadata may hold values json cannot encode natively
    # (for example dates parsed from YAML). default=str writes those as
    # strings instead of raising TypeError partway through the dump.
    json.dump(autogen_data, f_out, indent=2, default=str)

In [22]:
# Preview the first lines of the written file to sanity-check its structure.
!head data/autogen_data_processed.json

[
  {
    "content": "<!-- Thank you for your contribution! Please review https://microsoft.github.io/autogen/docs/Contribute before opening a pull request. -->\n\n<!-- Please add a reviewer to the assignee section when you create a PR. If you don't have the access to it, we will shortly find a reviewer and assign them to your PR. -->\n\n## Why are these changes needed?\n\n<!-- Please give a short summary of the change and the problem this solves. -->\n\n## Related issue number\n\n<!-- For example: \"Closes #1234\" -->\n\n## Checks\n\n- [ ] I've included any doc changes needed for <https://microsoft.github.io/autogen/>. See <https://github.com/microsoft/autogen/blob/main/CONTRIBUTING.md> to build and test documentation locally.\n- [ ] I've added tests (if relevant) corresponding to the changes introduced in this PR.\n- [ ] I've made sure all auto checks have passed.",
    "filename": ".github/PULL_REQUEST_TEMPLATE.md"
  },
  {
    "filename": ".github/copilot-instructions.md"
  },
  {


In [23]:
def sliding_window(seq, size, step):
    """
    Split a sequence into windows of up to ``size`` items taken every ``step`` items.

    Windows are produced from offset 0 onwards. The first window that reaches
    the end of the sequence is the last one, so the final window may be
    shorter than ``size``. An empty sequence produces no windows.

    Args:
        seq: Sliceable sequence such as a string or list
        size: Maximum number of items per window; must be positive
        step: Distance between consecutive window starts; must be positive

    Returns:
        List of ``{'start': offset, 'chunk': seq[offset:offset + size]}`` dicts

    Raises:
        ValueError: If ``size`` or ``step`` is not positive
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []

    for offset in range(0, total, step):
        end = offset + size
        windows.append({'start': offset, 'chunk': seq[offset:end]})
        # Stop once a window touches the end; later offsets would only repeat the tail.
        if end >= total:
            break

    return windows

In [24]:
autogen_data_chunks = []

for doc in autogen_data:
    # Work on a shallow copy so the source record keeps its 'content'.
    doc_copy = doc.copy()
    # A record may have no 'content' (or an empty one). Treat that as empty
    # text, which yields no chunks, instead of raising KeyError.
    doc_content = doc_copy.pop('content', None) or ''
    # Windows of 2000 characters starting every 1000 characters, so
    # consecutive chunks overlap by half.
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        # Attach the remaining record fields (such as 'filename') to every chunk.
        chunk.update(doc_copy)
    autogen_data_chunks.extend(chunks)

In [25]:
# Total number of chunks produced across all records.
len(autogen_data_chunks)

3046

In [26]:
# Spot-check a single chunk record.
autogen_data_chunks[100]

{'start': 0,
 'chunk': "## Prerequisites\n\n- Access to gpt3.5-turbo or preferably gpt4 - [Get access here](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview#how-do-i-get-access-to-azure-openai)\n- [Setup a Github app](#how-do-i-setup-the-github-app)\n- [Install the Github app](https://docs.github.com/en/apps/using-github-apps/installing-your-own-github-app)\n- [Provision the azure resources](#how-do-I-deploy-the-azure-bits)\n- [Create labels for the dev team skills](#which-labels-should-i-create)\n\n### How do I setup the Github app?\n\n- [Register a Github app](https://docs.github.com/en/apps/creating-github-apps/registering-a-github-app/registering-a-github-app), with the options listed below:\n    - Give your App a name and add a description\n    - Homepage URL: Can be anything (Example: repository URL)\n    - Add a dummy value for the webhook url, we'll come back to this setting\n    - Enter a webhook secret, which you'll need later on when filling in the `Webhoo