In [1]:
import os
from pathlib import Path

import re

from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def crawl_markdown_files(root_dir, annoy_index, model):
    """
    Recursively embeds every markdown file under ``root_dir`` into ``annoy_index``.

    Each ``*.md`` file is read as UTF-8, split with ``split_text``, filtered
    with ``usefulness_filter`` and passed to ``encode_and_store``, which adds
    the surviving chunks to the index.

    Args:
        root_dir: Directory searched recursively for ``*.md`` files.
        annoy_index: Index the embeddings are added to. It is mutated in
            place; it is neither built nor saved here.
        model: Embedding model forwarded to ``encode_and_store``.

    Returns:
        dict mapping each integer item id to the path (as a string) of the
        markdown file the chunk came from. Ids start at 0.

    A file that cannot be read or processed is reported with ``print`` and
    skipped, so one bad file does not abort the crawl.
    """
    id_mapping = {}
    id_counter = 0
    # Sort the paths so item ids are assigned in the same order on every run,
    # independent of the order the filesystem happens to list files in.
    for path in sorted(Path(root_dir).rglob('*.md')):
        try:
            with open(path, 'r', encoding='utf-8') as f:
                txt = f.read()
            # The file is closed before the (slow) embedding step starts.
            splits = split_text(txt)  # -> list of stripped text segments
            splits = usefulness_filter(splits)
            id_counter, id_mapping = encode_and_store(
                splits, path, model, annoy_index, id_counter, id_mapping
            )
        except Exception as e:
            # Deliberately broad: best-effort crawl, report and move on.
            print(f"Error reading {path}: {e}")

    return id_mapping

def usefulness_filter(split_text, min_length=5):
    """
    Drops chunks that are not worth embedding.

    A chunk is kept only if, after surrounding whitespace is ignored, it has
    at least ``min_length`` characters and contains at least one alphanumeric
    character.

    Args:
        split_text: Iterable of text chunks (strings).
        min_length: Minimum number of non-padding characters a chunk must
            have to be kept. Defaults to 5.

    Returns:
        list of the chunks that passed every rule, in their original order
        and unmodified.
    """
    # This function will probably become its own module.
    # There are lots of potential ways we could filter before embedding
    # and it might not be easy to determine what is best.
    filtered = []

    for chunk in split_text:
        # Measure the chunk without its padding so that whitespace cannot
        # push a tiny chunk over the length threshold.
        stripped = chunk.strip()

        # Rule 1: Remove empty or whitespace-only chunks
        if not stripped:
            continue

        # Rule 2: Exclude very short chunks (fewer than min_length characters)
        if len(stripped) < min_length:
            continue

        # Rule 3: Remove non-alphanumeric chunks
        if not any(char.isalnum() for char in stripped):
            continue

        # Add to the filtered list if it passes all checks
        filtered.append(chunk)

    return filtered


def split_text(text):
    """
    Breaks ``text`` into trimmed, non-empty segments.

    A segment boundary is any run of one or more of these characters:
    ``.``, ``?``, ``!`` or a newline. The delimiters themselves are discarded.

    Returns a list of segments with surrounding whitespace removed; pieces
    that are empty after trimming are left out.
    """
    pieces = []
    for raw_piece in re.split(r'[.?!\n]+', text):
        trimmed = raw_piece.strip()
        # Skip pieces that were only whitespace (or nothing at all)
        if trimmed:
            pieces.append(trimmed)
    return pieces

def encode_and_store(split_text, path, model, annoy_index, id_counter, id_mapping):
    """
    Embeds the chunks of one file and adds them to the Annoy index.

    All chunks are encoded in a single ``model.encode`` call so the model can
    batch them, instead of paying the per-call overhead once per chunk.
    Encoding finishes before anything is stored, so a failure while encoding
    leaves ``annoy_index`` and ``id_mapping`` untouched for this file.

    Args:
        split_text: Iterable of text chunks belonging to ``path``.
        path: Source file of the chunks; stored as ``str(path)``.
        model: Object whose ``encode`` accepts a list of strings and returns
            one embedding per string, in order (as
            ``SentenceTransformer.encode`` does).
        annoy_index: Index receiving the embeddings via ``add_item``.
        id_counter: First item id to assign.
        id_mapping: dict of item id -> source path; updated in place.

    Returns:
        Tuple ``(id_counter, id_mapping)`` where ``id_counter`` is the next
        unused item id and ``id_mapping`` is the same dict that was passed in.
    """
    chunks = list(split_text)
    if not chunks:
        # Nothing to embed; avoid calling the model with an empty batch.
        return id_counter, id_mapping

    embeddings = model.encode(chunks)
    source = str(path)
    for embedding in embeddings:
        annoy_index.add_item(id_counter, embedding)
        id_mapping[id_counter] = source
        id_counter += 1
    return id_counter, id_mapping




In [7]:
# Builds and saves the Annoy index for the markdown files under TestData/work.
# AnnoyIndex, SentenceTransformer and Path come from the import cell at the
# top of the notebook.

# Parameters
embedding_dim = 384  # Dimensionality of your embeddings; must match the model loaded below
metric = 'angular'  # Metric for similarity ('angular', 'euclidean', 'manhattan', etc.)
annoy_index = AnnoyIndex(embedding_dim, metric)
model = SentenceTransformer('all-MiniLM-L6-v2')
# Join the path from its components so it resolves on every OS; a literal
# backslash is only a directory separator on Windows.
root_dir = Path("TestData") / "work"
id_mapping = crawl_markdown_files(root_dir, annoy_index, model)

# Build the index (number of trees affects query speed/accuracy tradeoff)
num_trees = 10
annoy_index.build(num_trees)

# Save the index to a file
annoy_index.save('embeddings.ann')




True

In [9]:
def query_annoy(query, model, id_mapping, index=None):
    """
    Returns the source document of the chunk most similar to ``query``.

    Args:
        query: Text to search for.
        model: Embedding model; ``model.encode(query)`` must produce a vector
            compatible with the index.
        id_mapping: dict of index item id -> source document path.
        index: Index to search. When omitted, the notebook-level
            ``annoy_index`` is used, which keeps existing three-argument
            calls working.

    Returns:
        The ``id_mapping`` entry for the nearest item, or ``None`` when the
        index returns no neighbours (for example, an empty index).
    """
    if index is None:
        index = annoy_index
    query_embedding = model.encode(query)
    # Ask for 5 neighbours as before; only the closest one is reported.
    neighbor_ids, _distances = index.get_nns_by_vector(
        query_embedding, 5, include_distances=True
    )
    if not neighbor_ids:
        return None
    return id_mapping[neighbor_ids[0]]

In [12]:
# Sample query: print the source document that query_annoy returns for it.
print(query_annoy("Trickiest part of hedge effectiveness report", model, id_mapping))

TestData\work\Now.md


In [None]:
# To load the index later:
loaded_index = AnnoyIndex(embedding_dim, metric)
loaded_index.load('embeddings.ann')

# Perform a similarity search.
# The query vector must have embedding_dim entries, so it is produced by
# encoding a real query with the model rather than written out by hand.
query_vector = model.encode("Trickiest part of hedge effectiveness report")
nearest_ids = loaded_index.get_nns_by_vector(query_vector, 5, include_distances=True)
print(nearest_ids)

In [None]:
# Embed each markdown file as a whole (one vector per file).
# crawl_markdown_files needs an index and a model and returns id -> path, so
# the path -> content dict this cell works with is read directly here.
crawled_result = {}
for md_path in sorted((Path('TestData') / 'work').rglob('*.md')):
    try:
        crawled_result[str(md_path)] = md_path.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading {md_path}: {e}")
list_crawled_result_values = list(crawled_result.values())

model = SentenceTransformer('all-MiniLM-L6-v2')
# model = SentenceTransformer("msmarco-distilbert-dot-v5")
# Length of item vector that will be indexed: 384 is the size this notebook
# uses with all-MiniLM-L6-v2 (see embedding_dim).
# NOTE(review): the previous value, 768, presumably belonged to the
# commented-out msmarco model - confirm before switching models.
f = 384

crawled_result_embeddings = model.encode(
    list_crawled_result_values,
    convert_to_tensor=True
)
crawled_result_embeddings