## !!! WARNING: The code block RIGHT BELOW should only be RUN ONCE; otherwise you will end up with duplicate data in the database.

This can be reverted by running the drop-table block at the end. Afterwards, you can re-run the code block below to create the table and initialize the data again.


In [25]:
from pathlib import Path
from typing import List
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
import torch
# import markdown


# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def extract_paragraphs_from_md(filepath: Path) -> List[str]:
    """Read a Markdown file and return its non-empty paragraphs.

    The raw Markdown text is split on double newlines. Each chunk is
    stripped of surrounding whitespace, and chunks that end up empty are
    dropped, so trailing blank lines or runs of three or more newlines do
    not produce empty "paragraphs" that would be embedded and stored.

    Args:
        filepath: Path to a UTF-8 encoded Markdown file.

    Returns:
        The paragraphs in document order, without empty entries.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    md_content = filepath.read_text(encoding="utf-8")

    # Strip first so whitespace-only chunks are recognised as empty.
    stripped_chunks = (chunk.strip() for chunk in md_content.split("\n\n"))
    return [chunk for chunk in stripped_chunks if chunk]


# Markdown sources to ingest: one file per topic under the opencasebook folder.
md_files = [
    Path("./data/opencasebook") / f"{stem}.md"
    for stem in (
        "ambiguity",
        "authorship",
        "cla-policy",
        "patents",
        "remedies",
        "trademarks",
    )
]

# Location of the LanceDB database used by this notebook.
uri = "data/lancedb"
db = lancedb.connect(uri)

# To reuse an already-populated table (e.g. in production), open it by its
# table name rather than by the database URI:
# tbl = db.open_table("legal_paragraphs")

# Sentence-transformers model obtained from the LanceDB embedding registry,
# placed on the device selected earlier in this cell.
embedding_model = get_registry().get("sentence-transformers").create(
    name="all-MiniLM-L6-v2",
    device=str(device),
)


# LanceDB table schema that uses the Embedding API, so vectorization happens
# automatically both at ingestion time and at query time.
#
# Fields:
#   paragraph - the raw paragraph text; declared as the embedding model's
#               source field, i.e. the text that gets embedded.
#   vector    - the embedding of `paragraph`; declared as the embedding model's
#               vector field and sized from `embedding_model.ndims()`.
class LegalParagraphEmbedding(LanceModel):
    paragraph: str = embedding_model.SourceField()
    # `Vector(...)` is a call expression used as an annotation, which static
    # type checkers reject; hence the `type: ignore`.
    vector: Vector(embedding_model.ndims()) = embedding_model.VectorField()  # type: ignore


# Create the table on the first run; with exist_ok=True an already existing
# table of the same name is reused instead of raising an error.
table = db.create_table(
    "legal_paragraphs",
    schema=LegalParagraphEmbedding,
    exist_ok=True,
)

# Ingest only into an empty table. Because an existing table is reused above,
# an unconditional add() would append every paragraph again each time this
# cell is re-run and leave duplicate rows in the database.
if table.count_rows() == 0:
    table.add(
        [
            {"paragraph": paragraph}
            for md_file in md_files
            for paragraph in extract_paragraphs_from_md(md_file)
        ]
    )

In [26]:
# Note: by default LanceDB performs a brute-force scan over the dataset to find
# the K nearest neighbours (KNN). For tables with more than 50K vectors,
# creating an ANN index is recommended to speed up search.
query = "How might a fraudulent trademark application be handled?"

# Fetch the five best matches for the query as LegalParagraphEmbedding objects.
actual = (
    table.search(query)
    .limit(5)
    .to_pydantic(LegalParagraphEmbedding)
)

# Show the text of each retrieved paragraph.
for a in actual:
    print(a.paragraph)

A. Trademark Infringement
There are many reasons for a commercial actor to develop, register, and police its
trademarks. The careful selection of a brand name will
forestall accusations that its brand infringes another trademark. Consistent,
regimented use of the brand in commerce should permit the brand to obtain
trademark status. Registration of the brand as a trademark will deter
infringement, prevent the registration of similar trademarks, and facilitate
enforcement efforts. Policing the manner in which others use the trademark will help retain the strength of the mark. Balanced enforcement of a trademark can preserve the distinctiveness of an exclusive source identifier while allowing for legitimate fair uses.
> [W]here the defendant uses a trademark to describe the plaintiff's product,
> rather than its own, we hold that a commercial user is entitled to a
> nominative fair use defense provided he meets the following three
> requirements: First, the product or service in question 

In [27]:
# Reset step: remove the "legal_paragraphs" table so it can be recreated from
# scratch. ignore_missing=True is passed so that (per its name) dropping a
# table that is not there is not treated as an error.
db.drop_table(
    "legal_paragraphs",
    ignore_missing=True,
)