## !!! WARNING: The code block RIGHT BELOW should only be RUN ONCE; otherwise you will have duplicate data in the database.

You can undo this by running the drop-database block further below. Afterwards, re-run the code block to create the table and initialize the data again.


In [25]:
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
import torch
# import markdown


# Run the embedding model on the GPU when CUDA is available, else on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Local, on-disk LanceDB location used by this notebook.
uri = "data/lancedb"
db = lancedb.connect(uri)

# To reuse an already-populated table (e.g. in production), open it by its
# table name rather than by the database path:
# tbl = db.open_table("legal_paragraphs")

# Look up the sentence-transformers provider in LanceDB's embedding registry
# and instantiate the model on the device selected above.
sentence_transformers_provider = get_registry().get("sentence-transformers")
embedding_model = sentence_transformers_provider.create(
    name="all-MiniLM-L6-v2",
    device=str(device),
)


# LanceDB schema using the Embedding API, so vectors are computed automatically
# at ingestion and at query time.
class LegalParagraphEmbedding(LanceModel):
    """Row schema for the `legal_paragraphs` table.

    Each row holds one paragraph of text plus the vector that
    `embedding_model` derives from it.
    """

    # Raw paragraph text; declared as the source field the embedding is computed from.
    paragraph: str = embedding_model.SourceField()
    # Embedding of `paragraph`; its dimensionality is taken from the model itself.
    vector: Vector(embedding_model.ndims()) = embedding_model.VectorField()  # type: ignore


# Create the `legal_paragraphs` table with the schema above.
# NOTE(review): per the LanceDB docs, `exist_ok=True` opens an existing table
# of the same name instead of raising, without adding any data — so this call
# on its own is safe to re-run.
table = db.create_table(
    "legal_paragraphs",
    schema=LegalParagraphEmbedding,
    exist_ok=True,
)

# One-time ingestion (left commented out on purpose): running it again appends
# the same paragraphs a second time and produces duplicate rows.
# `md_files` and `extract_paragraphs_from_md` are not defined in this cell and
# must be available before this is uncommented.
# table.add(
#     [
#         {"paragraph": paragraph}
#         for md_file in md_files
#         for paragraph in extract_paragraphs_from_md(md_file)
#     ]
# )

In [26]:
# NOTE: by default LanceDB performs a brute-force scan over the dataset to find
# the K nearest neighbours (KNN). For tables with more than 50K vectors,
# creating an ANN index is recommended to speed up search.
query = "How might a fraudulent trademark application be handled?"

# The query string is embedded by the table's embedding function; keep only
# the five closest paragraphs and materialize them as schema objects.
top_five = table.search(query).limit(5)
actual = top_five.to_pydantic(LegalParagraphEmbedding)

for hit in actual:
    print(hit.paragraph)

A. Trademark Infringement
There are many reasons for a commercial actor to develop, register, and police its
trademarks. The careful selection of a brand name will
forestall accusations that its brand infringes another trademark. Consistent,
regimented use of the brand in commerce should permit the brand to obtain
trademark status. Registration of the brand as a trademark will deter
infringement, prevent the registration of similar trademarks, and facilitate
enforcement efforts. Policing the manner in which others use the trademark will help retain the strength of the mark. Balanced enforcement of a trademark can preserve the distinctiveness of an exclusive source identifier while allowing for legitimate fair uses.
> [W]here the defendant uses a trademark to describe the plaintiff's product,
> rather than its own, we hold that a commercial user is entitled to a
> nominative fair use defense provided he meets the following three
> requirements: First, the product or service in question 

In [6]:
import lancedb

# DESTRUCTIVE reset cell: connects to the local LanceDB directory and drops the
# database so the setup cell can be run again from scratch.
# NOTE(review): per the LanceDB API, `drop_database()` removes every table in
# the database, not only `legal_paragraphs` — confirm that is intended.
uri = "data/lancedb"
db = lancedb.connect(uri)
db.drop_database()

In [15]:
import os
from dotenv import load_dotenv
import requests
import json
from pprint import pprint

# Fetch case-law opinions from the CourtListener search API, follow the
# cursor-based pagination until enough cases are collected, and save them to
# data/courtlistener_cases.json.
load_dotenv()

cl_api_key = os.getenv("COURT_LISTENER_API_KEY")
if not cl_api_key:
    # Fail early with a clear message instead of sending "Token None".
    raise RuntimeError(
        "COURT_LISTENER_API_KEY is not set; add it to the environment or .env file"
    )
auth_headers = {"Authorization": f"Token {cl_api_key}"}

# response = requests.options(
#     "https://www.courtlistener.com/api/rest/v4/dockets/",
#     headers={"Authorization": f"Token {cl_api_key}"},
# )

query = "drunk got pulled over"
# Named `search_type` (not `type`) so the builtin `type()` is not shadowed.
search_type = "o"  # case law opinions. See https://www.courtlistener.com/help/api/rest/search/#type
# result_type = "r"  # also maybe this, List of Federal cases with up to three nested documents
order_by = "score desc"
stat_Precedential = "on"
highlight = "on"

# Pass the query string through `params` so requests URL-encodes the values
# (spaces, special characters) instead of interpolating them into the URL.
response = requests.get(
    "https://www.courtlistener.com/api/rest/v4/search/",
    params={
        "q": query,
        "type": search_type,
        "order_by": order_by,
        "stat_Precedential": stat_Precedential,
        "highlight": highlight,
    },
    headers=auth_headers,
)
# Surface HTTP errors (bad token, rate limit, ...) here rather than as a
# confusing KeyError on data["next"] further down.
response.raise_for_status()

data = response.json()
pprint(data)

max_cases_to_fetch = 50

# Collect the current page BEFORE deciding whether to fetch another one, so
# the final page (whose "next" is empty) is not lost and a single-page result
# set still yields cases.
cases = list(data["results"])
while data["next"] and len(cases) < max_cases_to_fetch:
    response = requests.get(data["next"], headers=auth_headers)
    response.raise_for_status()
    data = response.json()
    cases.extend(data["results"])

# Pages arrive in fixed-size batches, so trim any overshoot past the limit.
cases = cases[:max_cases_to_fetch]

if cases:
    print(json.dumps(cases[0], indent=4))
else:
    print("No cases returned for query:", query)

# Context manager guarantees the file is flushed and closed.
with open("data/courtlistener_cases.json", "w", encoding="utf-8") as cases_file:
    json.dump(cases, cases_file)

{'count': 9437,
 'next': 'https://www.courtlistener.com/api/rest/v4/search/?cursor=cz0xNi4yMTY2NCZzPTQxMzg3MDMmdD1vJmQ9MjAyNC0xMC0xMg%3D%3D&highlight=on&order_by=score+desc&q=drunk+got+pulled+over&stat_Precedential=on&type=o',
 'previous': None,
 'results': [{'absolute_url': '/opinion/4500585/state-v-jeffries/',
              'attorney': '',
              'caseName': 'State v. Jeffries',
              'caseNameFull': '',
              'citation': ['2018 Ohio 2010'],
              'citeCount': 0,
              'cluster_id': 4500585,
              'court': 'Ohio Court of Appeals',
              'court_citation_string': 'Ohio Ct. App.',
              'court_id': 'ohioctapp',
              'dateArgued': None,
              'dateFiled': '2018-05-23',
              'dateReargued': None,
              'dateReargumentDenied': None,
              'docketNumber': 'C-170176',
              'docket_id': 6874566,
              'judge': 'Myers',
              'lexisCite': '',
              'meta': {