In [1]:
# Data preprocessing
import io
import zipfile
import requests
import frontmatter
# Data chunking
from openai import OpenAI
import re
from tqdm.auto import tqdm
# Search
from minsearch import Index
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import numpy as np
from minsearch import VectorSearch

In [2]:
def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.
    
    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
    
    Returns:
        List of dictionaries containing file content and metadata
    """
    prefix = 'https://codeload.github.com'
    postfix = 'zip/refs/heads/main'
    url = f'{prefix}/{repo_owner}/{repo_name}/{postfix}'
    resp = requests.get(url)
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    for file_info in zf.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        if not (filename_lower.endswith('.md') 
            or filename_lower.endswith('.mdx')):
            continue
    
        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue
    
    zf.close()
    
    return repository_data

In [3]:
pytorch_docs = read_repo_data('pytorch', 'pytorch')

In [4]:
print(f"PyTorch documents: {len(pytorch_docs)}")

PyTorch documents: 322


In [5]:
def sliding_window(seq, size, step):
    """
    The following method creates a sliding window chunk based on a given sequence, based on size + step.
        
    :param str: Markdown text as a string
    :param size: the size of a batch in that seq
    :param step: a step, where if step = size, there's 0 overlap, and if step < size, there will be an overlap of "step"
    
    :return: List of chunks as strings
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk = seq[i:i+size]
        temp = {'start': i, 'chunk': chunk}
        result.append(temp)
        if i + size >= n:
            break

    return result

In [6]:
pytorch_chunks = []

for doc in pytorch_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 1000, 800)
    for chunk in chunks:
        chunk.update(doc_copy)
    pytorch_chunks.extend(chunks)

In [7]:
def create_text_based_index_from_docs(chunks, text_fields, keyword_fields=[]): 
    text_based_index = Index(
        text_fields=text_fields,
        keyword_fields=keyword_fields
    )
    
    text_based_index.fit(chunks)

    return text_based_index

text_based_index = create_text_based_index_from_docs(pytorch_chunks, text_fields=["chunk", "filename"])

In [8]:
def create_vector_based_index_from_docs(chunks, chunk_field, keyword_fields=[], emb_model='multi-qa-distilbert-cos-v1'):
    """
    TODO
    """
    # get an embedding model
    embedding_model = SentenceTransformer(emb_model)
    
    embeddings = []

    for d in tqdm(chunks):
        v = embedding_model.encode(d[chunk_field])
        embeddings.append(v)

    # np array for computational efficancy
    embeddings = np.array(embeddings)
    
    vector_index = VectorSearch(keyword_fields=keyword_fields)
    print(embeddings[0])
    print(chunks[0])
    vector_index.fit(embeddings, chunks)

    return embedding_model, embeddings, vector_index

embedding_model, embeddings, vector_index = create_vector_based_index_from_docs(pytorch_chunks, 'chunk')

  0%|          | 0/2513 [00:00<?, ?it/s]

[ 1.28202299e-02  1.89559266e-03  1.24928607e-02  4.42143530e-02
 -5.81073016e-03  7.40154311e-02 -5.36486357e-02  2.40743216e-02
 -5.07658347e-03 -1.14630386e-02  4.50821780e-02  2.67908033e-02
  6.73636720e-02  6.58322424e-02  7.61403367e-02  1.60189718e-02
 -3.51745449e-02  1.51376110e-02 -6.62834719e-02 -1.67972837e-02
 -2.42675245e-02  6.45295251e-03  3.68586034e-02 -1.30323861e-02
  7.71475583e-03  1.47242576e-03  9.19455569e-03 -1.30863916e-02
  1.62333045e-02 -4.35524173e-02 -1.83547251e-02  2.38356423e-02
  2.76815221e-02 -2.09894162e-02  6.76169991e-02  1.45805255e-03
 -5.42983860e-02  2.40574013e-02 -2.88685914e-02 -1.97049696e-02
  3.73304896e-02  4.89544263e-03 -5.34528792e-02  8.42607301e-03
 -1.91388768e-03  8.44316483e-02  3.24483328e-02 -5.37673794e-02
  5.97448647e-02 -4.85001169e-02  4.08163248e-03 -1.01210619e-03
  1.57010406e-02 -2.20096074e-02  3.67065556e-02 -6.29529264e-03
  1.46376295e-02 -4.32393216e-02  6.26215413e-02  8.70219548e-04
 -6.15577810e-02  3.64422

In [9]:
def text_search(query):
    return text_based_index.search(query, num_results=5)

def vector_search(query):
    q = embedding_model.encode(query)
    return vector_index.search(q, num_results=5)

def hybrid_search(query):
    text_results = text_search(query)
    vector_results = vector_search(query)
    
    # Combine and deduplicate results
    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        if result['filename'] not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)
    
    return combined_results

In [13]:
hybrid_search("What is a Tensor?")

[{'start': 9600,
  'chunk': 'create a regular Tensor\n(using a factory function) and then perform a Tensor-DTensor operation,\nlike the following:\n\n```\ntensor = torch.arange(10)\nreturn tensor + dtensor\n```\n\nWe disallow mixed Tensor-DTensor operations: if the input to any operations\n(e.g. torch.add) is a DTensor, then all Tensor inputs must be DTensors.\nThis is because the semantics are ambiguous. We don\'t know if `tensor` is\nthe same across ranks or if it is different so we ask that the user\nfigure out how to construct a DTensor with accurate placements from `tensor`.\n\nIf each rank does have the same `tensor`, then please construct a replicated\nDTensor:\n\n```\ntensor = torch.arange(10)\ntensor = DTensor.from_local(tensor, placements=(Replicate(),))\nreturn tensor + dtensor\n```\n\nIf you wanted to create a DTensor with shards, below is how to do it.\nSemantically this means that your Tensor data is split between the shards\nand that operations act on the "full stacked d