In [6]:
# Data preprocessing
import io
import zipfile
import requests
import frontmatter
# Data chunking
from openai import OpenAI
import re
from tqdm.auto import tqdm
# Search
from minsearch import Index
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import numpy as np
from minsearch import VectorSearch

### Data Pre-processing

In [7]:
def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.
    
    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
    
    Returns:
        List of dictionaries containing file content and metadata
    """
    prefix = 'https://codeload.github.com'
    postfix = 'zip/refs/heads/main'
    url = f'{prefix}/{repo_owner}/{repo_name}/{postfix}'
    resp = requests.get(url)
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    for file_info in zf.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        if not (filename_lower.endswith('.md') 
            or filename_lower.endswith('.mdx')):
            continue
    
        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue
    
    zf.close()
    
    return repository_data    

In [8]:
dtc_faq = read_repo_data('DataTalksClub', 'faq')
evidently_docs = read_repo_data('evidentlyai', 'docs')
#### Project
pytorch_docs = read_repo_data('pytorch', 'pytorch')
####

print(f"FAQ documents: {len(dtc_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")
print(f"PyTorch documents: {len(pytorch_docs)}")

FAQ documents: 1217
Evidently documents: 95
PyTorch documents: 322


### Data Chunking

#### Simple chunking

In [9]:
def sliding_window(seq, size, step):
    """
    The following method creates a sliding window chunk based on a given sequence, based on size + step.
        
    :param str: Markdown text as a string
    :param size: the size of a batch in that seq
    :param step: a step, where if step = size, there's 0 overlap, and if step < size, there will be an overlap of "step"
    
    :return: List of chunks as strings
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk = seq[i:i+size]
        result.append({'start': i, 'chunk': chunk})
        if i + size >= n:
            break

    return result

In [21]:
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    evidently_chunks.extend(chunks)

#### Sections based chunking

In [22]:
# def split_markdown_by_level(text, level=2):
#     """
#     Split markdown text by a specific header level.
    
#     :param text: Markdown text as a string
#     :param level: Header level to split on
#     :return: List of sections as strings
#     """
#     # This regex matches markdown headers
#     # For level 2, it matches lines starting with "## "
#     header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
#     pattern = re.compile(header_pattern, re.MULTILINE)

#     # Split and keep the headers
#     parts = pattern.split(text)
    
#     sections = []
#     for i in range(1, len(parts), 3):
#         # We step by 3 because regex.split() with
#         # capturing groups returns:
#         # [before_match, group1, group2, after_match, ...]
#         # here group1 is "## ", group2 is the header text
#         header = parts[i] + parts[i+1]  # "## " + "Title"
#         header = header.strip()

#         # Get the content after this header
#         content = ""
#         if i+2 < len(parts):
#             content = parts[i+2].strip()

#         if content:
#             section = f'{header}\n\n{content}'
#         else:
#             section = header
#         sections.append(section)

In [23]:
# evidently_chunks = []

# for doc in evidently_docs:
#     doc_copy = doc.copy()
#     doc_content = doc_copy.pop('content')
#     print(doc_content)
#     if doc_content:
#         sections = split_markdown_by_level(doc_content, level=2)
#         print(sections)
#         for section in sections:
#             section_doc = doc_copy.copy()
#             section_doc['section'] = section
#             evidently_chunks.append(section_doc)

#### LLM based chunking

In [24]:
# def llm(prompt, model=model):
    
#     openai_client = OpenAI()
#     model = 'gpt-5-nano'

#     messages = [
#         {"role": "user", "content": prompt}
#     ]

#     response = openai_client.responses.create(
#         model=model,
#         input=messages
#     )

#     return response.output_text

In [25]:
# prompt_template = """
# Split the provided document into logical sections
# that make sense for a Q&A system.

# Each section should be self-contained and cover
# a specific topic or concept.

# <DOCUMENT>
# {document}
# </DOCUMENT>

# Use this format:

# ## Section Name

# Section content with all relevant details

# ---

# ## Another Section Name

# Another section content

# ---
# """.strip()

In [26]:
# def intelligent_chunking(text):
#     prompt = prompt_template.format(document=text)
#     response = llm(prompt)
#     sections = response.split('---')
#     sections = [s.strip() for s in sections if s.strip()]
#     return sections

In [27]:
# evidently_chunks = []

# for doc in tqdm(evidently_docs):
#     doc_copy = doc.copy()
#     doc_content = doc_copy.pop('content')

#     sections = intelligent_chunking(doc_content)
#     for section in sections:
#         section_doc = doc_copy.copy()
#         section_doc['section'] = section
#         evidently_chunks.append(section_doc)

### Search Capabilities

#### Text based search

In [28]:
index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)

index.fit(evidently_chunks)

<minsearch.minsearch.Index at 0x1565639d0>

In [29]:
query = 'What should be in a test dataset for AI evaluation?'
results = index.search(query)

In [30]:
dtc_faq = read_repo_data('DataTalksClub', 'faq')

de_dtc_faq = [d for d in dtc_faq if 'data-engineering' in d['filename']]

faq_index = Index(
    text_fields=["question", "content"],
    keyword_fields=[]
)

faq_index.fit(de_dtc_faq)

<minsearch.minsearch.Index at 0x155e63610>

#### vector based search

In [31]:
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [32]:
# Take from the 2nd Dataset (DE DTC FAQ) records which has a filename
de_dtc_faq = [d for d in dtc_faq if 'data-engineering' in d['filename']]
record = de_dtc_faq[2]
text = record['question'] + ' ' + record['content']
v_doc = embedding_model.encode(text)

query = 'I just found out about the course. Can I enroll now?'
v_query = embedding_model.encode(query)
similarity = v_query.dot(v_doc)

In [33]:
faq_embeddings = []

for d in tqdm(de_dtc_faq):
    text = d['question'] + ' ' + d['content']
    v = embedding_model.encode(text)
    faq_embeddings.append(v)

faq_embeddings = np.array(faq_embeddings)

  0%|          | 0/449 [00:00<?, ?it/s]

In [34]:
faq_vindex = VectorSearch(keyword_fields=[])

faq_vindex.fit(faq_embeddings, de_dtc_faq)

<minsearch.vector.VectorSearch at 0x107e9fb60>

In [35]:
query = 'Can I join the course now?'
q = embedding_model.encode(query)
results = faq_vindex.search(q)

In [36]:
print(results[1])

{'id': '068529125b', 'question': 'Course - Can I follow the course after it finishes?', 'sort_order': 8, 'content': 'Yes, we will keep all the materials available, so you can follow the course at your own pace after it finishes.\n\nYou can also continue reviewing the homeworks and prepare for the next cohort. You can also start working on your final capstone project.', 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/008_068529125b_course-can-i-follow-the-course-after-it-finishes.md'}


In [39]:
evidently_chunks[0].keys()

dict_keys(['start', 'chunk', 'title', 'description', 'filename'])

In [40]:
# Evidently
evidently_embeddings = []

for d in tqdm(evidently_chunks):
    v = embedding_model.encode(d['chunk'])
    evidently_embeddings.append(v)

evidently_embeddings = np.array(evidently_embeddings)

evidently_vindex = VectorSearch(keyword_fields=[])
evidently_vindex.fit(evidently_embeddings, evidently_chunks)

  0%|          | 0/575 [00:00<?, ?it/s]

<minsearch.vector.VectorSearch at 0x1576c4550>

In [41]:
evidently_vindex.search(query, num_results=1)

AttributeError: 'str' object has no attribute 'reshape'

#### Hybrid search

In [None]:
faq_index.search(query, num_results=5)

In [None]:
query = 'What are the course pre-requesites?'

text_results = faq_index.search(query, num_results=5)

q = embedding_model.encode(query)
vector_results = faq_vindex.search(q, num_results=5)

final_results = text_results + vector_results

In [None]:
from time import sleep
for result in final_results:
    print(result)
    sleep(2)

In [None]:
def text_search(query):
    return faq_index.search(query, num_results=5)

def vector_search(query):
    q = embedding_model.encode(query)
    return faq_vindex.search(q, num_results=5)

def hybrid_search(query):
    text_results = text_search(query)
    vector_results = vector_search(query)
    
    # Combine and deduplicate results
    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        if result['filename'] not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)
    
    return combined_results