In [None]:
import torch
from torch import nn
import re
from transformers import pipeline

Hi guys,

This is a super simple semantic search example I threw together.

Here is the idea:
 - We have some text we want to search for the answer to some query.
 - We split the text into subsections (see notes below regarding how we ought to do that).
 - We generate embeddings for the query and each subset of the text.
 - We then calculate the cosine similarity between the
 query embedding and each subsection embedding. The subsection with the highest cosine similarity is (hopefully) the one we are looking for.

 For this dummy example I chose a text generated by GPT and a query "What challeneges are facing machine learning?". Luckily it works out - we do return the answer we are looking for: "Despite its success, machine learning faces challenges such as data privacy, bias, and explainability."

 Here are some of the challenges I am seeing:
 - It might be difficult to decide how to best divide the text into subsections. I have implemented three methods so far:
  - Sentence by sentence.
  - Rolling multi-sentence window.
  - Using another model to divide the text based on continuation of logical ideas.
Each of these methods assumes that the author of the document is using punctuation. If the document was a bulleted list or something, or the author used periods in a weird way, these methods will struggle. Using another model to find useful subsections for us seems like a nice idea, but it will be computationally expensive to implement and we would have to do the work of training a second model or tracking down a good one.

- I don't know how well this will scale, or what it will take to scale it. I think it would be nice if I could query my whole vault at once, but this method would involve calculating cosine similarity between my query and every subset of every file in my whole vault. If we could find a way to rule out whole documents without processing them every time in the way we are describing here, that would be nice. Or maybe the plugin just works on one document at a time.

There is some good news: I looked at the existing plugin for semantic search in Obsidian, and it requires an OpenAI API key -> it's pay to use. A non pay-to-use plugin would be great.

In [None]:
# Sample Markdown document that every splitting/search experiment in this notebook runs against.
text = """# The History of Machine Learning

Machine learning is a field of artificial intelligence that focuses on building systems that learn from data. The concept has its roots in statistics, computer science, and mathematics. Early applications included tasks like optical character recognition and simple game-playing algorithms.

## Key Milestones

- **1950s**: Alan Turing proposes the Turing Test as a way to measure machine intelligence.
- **1980s**: Neural networks gain popularity with the backpropagation algorithm.
- **2010s**: Deep learning revolutionizes the field with breakthroughs in computer vision and natural language processing.

## Real-World Applications

Machine learning is used in:
- **Healthcare**: Diagnosing diseases from medical images.
- **Finance**: Fraud detection and algorithmic trading.
- **Entertainment**: Personalized recommendations on streaming platforms.

## Challenges and Future Prospects

Despite its success, machine learning faces challenges such as data privacy, bias, and explainability. Researchers are working toward more transparent, fair, and secure models.

---

Machine learning continues to evolve, shaping the future of technology and society.
"""

In [None]:
# Natural-language question whose embedding is compared against every text section.
# NOTE(review): "challeneges" looks like a typo for "challenges" — confirm whether it is
# intentional before changing it, because the spelling feeds directly into the query embedding.
query = "What challeneges are facing machine learning?"
# Sentence from `text` that the search is expected to surface; the experiment cells use it
# to find which sections contain the expected answer.
answer = "Despite its success, machine learning faces challenges such as data privacy, bias, and explainability."

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same instance encodes the query here and the sections in later cells.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Embed the query once (as a tensor) so each experiment can score its sections against it.
query_embedding = model.encode(query, convert_to_tensor=True)

In [None]:
def split_sentences(text):
    """Break ``text`` into sentence-like pieces.

    A boundary is either a run of whitespace that directly follows ``.``,
    ``?`` or ``!`` (the punctuation stays attached to the preceding piece)
    or a run of one or more newlines. Each piece is stripped of surrounding
    whitespace and empty pieces are dropped.

    Returns:
        list[str]: the non-empty pieces in document order.
    """
    pieces = []
    for fragment in re.split(r'(?<=[.?!])\s+|\n+', text.strip()):
        trimmed = fragment.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces

In [None]:
from transformers import pipeline

# Zero-shot classification pipeline (facebook/bart-large-mnli) that semantic_splits calls on each sentence.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def semantic_splits(text, labels=None):
    """
    Group the sentences of ``text`` into sections using the zero-shot classifier.

    The idea of this function is to use another model to divide the
    text into subsections intelligently. Splitting by sentence
    may be insufficient if the answer we are looking for
    requires broader context. If we could split the text
    into suitable units regardless of sentence/paragraph,
    it might be better.

    Unfortunately this out of the box model does not do what we want.
    The subsections it picks are too big and it takes too long.
    Maybe if we fine tuned a lighter weight model we could get
    better results.

    Args:
        text: Document to split.
        labels: Candidate labels passed to the classifier. Defaults to
            ``["new_section", "continuation"]``. A sentence opens a new
            section only when the first label in the classifier result is
            ``"new_section"``.

    Returns:
        list[str]: The sections, each being its sentences joined by a space.
    """
    if labels is None:
        # Build the default per call instead of sharing one mutable list
        # object across calls (and with the classifier it is handed to).
        labels = ["new_section", "continuation"]

    sentences = split_sentences(text)
    sections = []
    current_section = []

    for sentence in sentences:
        result = classifier(sentence, labels)
        if result['labels'][0] == "new_section":
            # Close the section collected so far before starting the next one.
            if current_section:
                sections.append(" ".join(current_section))
            current_section = [sentence]
        else:
            current_section.append(sentence)
    # Flush whatever is left after the last sentence.
    if current_section:
        sections.append(" ".join(current_section))
    return sections


2


In [None]:
def sliding_window_split(text, window_size=3, step=1):
    """
    Split the text into overlapping windows of consecutive sentences.

    Each window will be scored for similarity. The advantage this might offer
    is broader context than the sentence-by-sentence approach, i.e. if the
    answer is best represented by more than one sentence, this approach
    could be better.

    Args:
        text: Document to split.
        window_size: Maximum number of sentences per window (must be >= 1).
        step: Number of sentences to advance between window starts (must be >= 1).

    Returns:
        list[str]: One string per window start. Windows that begin near the
        end of the text contain fewer than ``window_size`` sentences.

    Raises:
        ValueError: If ``window_size`` or ``step`` is less than 1.
    """
    if window_size < 1 or step < 1:
        raise ValueError("window_size and step must both be >= 1")

    sentences = split_sentences(text)
    # Join with a plain space: the split pieces keep their own trailing
    # punctuation, so joining with ". " produced doubled periods ("data.. The").
    return [
        " ".join(sentences[start:start + window_size])
        for start in range(0, len(sentences), step)
    ]
# Split once and reuse the result; the text was previously re-split on every loop iteration.
windows = sliding_window_split(text)
print(len(windows))
for window in windows:
    print(window)
    print("XXXXXXXXXXXXXXXXXXXX\n")
    print("XXXXXXXXXXXXXXXXXXXX\n")

18
# The History of Machine Learning. Machine learning is a field of artificial intelligence that focuses on building systems that learn from data.. The concept has its roots in statistics, computer science, and mathematics.
XXXXXXXXXXXXXXXXXXXX

XXXXXXXXXXXXXXXXXXXX

Machine learning is a field of artificial intelligence that focuses on building systems that learn from data.. The concept has its roots in statistics, computer science, and mathematics.. Early applications included tasks like optical character recognition and simple game-playing algorithms.
XXXXXXXXXXXXXXXXXXXX

XXXXXXXXXXXXXXXXXXXX

The concept has its roots in statistics, computer science, and mathematics.. Early applications included tasks like optical character recognition and simple game-playing algorithms.. ## Key Milestones
XXXXXXXXXXXXXXXXXXXX

XXXXXXXXXXXXXXXXXXXX

Early applications included tasks like optical character recognition and simple game-playing algorithms.. ## Key Milestones. - **1950s**: Alan Turing

In [None]:
from torch import nn
# Cosine similarity along dim=1; the later experiment cells reuse this object.
cos = nn.CosineSimilarity(dim=1)

### Sentence split
# Score each individual sentence of the document against the query.
sections = split_sentences(text)
section_embeddings = model.encode(sections, convert_to_tensor=True)
# unsqueeze(0) adds a leading dimension to the query embedding before it is
# compared with the section embeddings.
similarity_scores = cos(query_embedding.unsqueeze(0), section_embeddings)
print(similarity_scores)

# Show the score of every section that contains the expected answer sentence.
answer_index = [position for position, section in enumerate(sections) if answer in section]
for i in answer_index:
    print(sections[i])
    print(f"Similary score = {similarity_scores[i]}")
    print("XXXXXXXXXXXXXXXXXXXX\n")
    print("XXXXXXXXXXXXXXXXXXXX\n")

# Finally report the section with the highest similarity overall.
best_match_index = torch.argmax(similarity_scores).item()
print(f"It matched {sections[best_match_index]} the most with a score of {similarity_scores[best_match_index]}")


tensor([0.6302, 0.5494, 0.2786, 0.2994, 0.1320, 0.3451, 0.3579, 0.3672, 0.2826,
        0.6499, 0.2682, 0.2520, 0.1014, 0.2683, 0.6547, 0.3355, 0.0875, 0.6197])
Despite its success, machine learning faces challenges such as data privacy, bias, and explainability.
Similary score = 0.6546924114227295
XXXXXXXXXXXXXXXXXXXX

XXXXXXXXXXXXXXXXXXXX

It matched Despite its success, machine learning faces challenges such as data privacy, bias, and explainability. the most with a score of 0.6546924114227295


In [None]:
### Sliding window split
# Score each multi-sentence window of the document against the query.
sections = sliding_window_split(text)
section_embeddings = model.encode(sections, convert_to_tensor=True)
similarity_scores = cos(query_embedding.unsqueeze(0), section_embeddings)
print(similarity_scores)
best_match_index = similarity_scores.argmax().item()

# Show the score of every window that contains the expected answer sentence.
answer_index = [position for position, section in enumerate(sections) if answer in section]
for i in answer_index:
    print(sections[i])
    print(f"Similary score = {similarity_scores[i]}")
    print("XXXXXXXXXXXXXXXXXXXX\n")
    print("XXXXXXXXXXXXXXXXXXXX\n")

# Also report the overall winner (as the sentence-split cell does); otherwise a top-scoring
# window that does NOT contain the answer goes unnoticed.
print(f"It matched {sections[best_match_index]} the most with a score of {similarity_scores[best_match_index]}")


tensor([0.5584, 0.5887, 0.4008, 0.3702, 0.4374, 0.5338, 0.4347, 0.4841, 0.5388,
        0.5803, 0.2882, 0.2140, 0.3770, 0.6085, 0.5938, 0.6409, 0.6291, 0.6197])
- **Entertainment**: Personalized recommendations on streaming platforms.. ## Challenges and Future Prospects. Despite its success, machine learning faces challenges such as data privacy, bias, and explainability.
Similary score = 0.37702733278274536
XXXXXXXXXXXXXXXXXXXX

XXXXXXXXXXXXXXXXXXXX

## Challenges and Future Prospects. Despite its success, machine learning faces challenges such as data privacy, bias, and explainability.. Researchers are working toward more transparent, fair, and secure models.
Similary score = 0.6085018515586853
XXXXXXXXXXXXXXXXXXXX

XXXXXXXXXXXXXXXXXXXX

Despite its success, machine learning faces challenges such as data privacy, bias, and explainability.. Researchers are working toward more transparent, fair, and secure models.. ---
Similary score = 0.593777596950531
XXXXXXXXXXXXXXXXXXXX

XXXXXXXXXX

In [None]:
### Semantic split (This one does not really work)
# Score the classifier-derived sections against the query and print the top-scoring one.
sections = semantic_splits(text)
section_embeddings = model.encode(sections, convert_to_tensor=True)
similarity_scores = cos(query_embedding.unsqueeze(0), section_embeddings)
print(similarity_scores)

best_match_index = torch.argmax(similarity_scores).item()
print(sections[best_match_index])

tensor([0.5922, 0.5917])
# The History of Machine Learning Machine learning is a field of artificial intelligence that focuses on building systems that learn from data. The concept has its roots in statistics, computer science, and mathematics.
