```shell
pip install -U sentence-transformers
```

In [1]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import lynse

  from tqdm.autonotebook import tqdm, trange


## Load dataset

In [2]:
# Load the "train" split with 4 worker processes and keep only the two
# columns used in this notebook: `title` and `text`.
title_text_dataset = (
    load_dataset(
        "mixedbread-ai/wikipedia-data-en-2023-11",
        split="train",
        num_proc=4,
    )
    .select_columns(["title", "text"])
)

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/42 [00:00<?, ?it/s]

In [3]:
# Peek at the first record to confirm it carries the `title` and `text` fields.
title_text_dataset[0]

{'title': 'British Arab Commercial Bank',
 'text': 'The British Arab Commercial Bank PLC (BACB) is an international wholesale bank incorporated in the United Kingdom that is authorised by the Prudential Regulation Authority (PRA) and regulated by the PRA and the Financial Conduct Authority (FCA). It was founded in 1972 as UBAF Limited, adopted its current name in 1996, and registered as a public limited company in 2009. The bank has clients trading in and out of developing markets in the Middle East and Africa.'}

## Create Embedding Model

In [4]:
from sentence_transformers import SentenceTransformer
import torch

# Pick the best available torch backend: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Sentence embedding model used for both indexing and querying.
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=device)



## Launch server

In [5]:
# Start a LynseDB server from within the notebook; it prints the address it is serving on.
lynse.launch_in_jupyter()

Server running at http://127.0.0.1:7637



## Connect to server

In [7]:
# Connect to the server address reported by the launch cell (http://127.0.0.1:7637).
client = lynse.VectorDBClient("http://127.0.0.1:7637")

## Create a database

In [8]:
# NOTE(review): drop_if_exists=False presumably keeps an existing database with
# this name instead of recreating it — confirm against the LynseDB docs.
search_db = client.create_database("semantic_search_db", drop_if_exists=False)

## Create a collection

In [10]:
# Obtain the collection that will store the text embeddings.
# - dim is read from the encoder so stored vectors match the model's output size.
# - drop_if_exists=False: presumably an existing collection is reused — TODO confirm.
# - cache_chunks=-1 and scaler_bits=8 are LynseDB options whose exact semantics
#   are not visible in this notebook — TODO confirm against the LynseDB docs.
collection = search_db.require_collection(
    "semantic_search", 
    dim=encoder.get_sentence_embedding_dimension(), 
    drop_if_exists=False, 
    cache_chunks=-1, 
    scaler_bits=8
)


2024-06-18 11:56:51 - LynseDB - INFO - Creating collection semantic_search with: 
//    dim=384, collection='semantic_search', 
//    chunk_size=100000, distance='cosine', 
//    dtypes='float32', use_cache=True, 
//    scaler_bits=8, n_threads=10, 
//    warm_up=False, drop_if_exists=False, 
//    description=None, 


## Insert data into the collection

In [11]:
from tqdm import tqdm

# Number of texts embedded per `encoder.encode` call and written together.
BATCH_SIZE = 100000


def _flush_batch(session, ids, texts):
    """Embed ``texts`` and add each resulting vector to ``session`` under its paired id.

    Does nothing when the batch is empty, so it is safe to call after the loop
    even if the dataset size is an exact multiple of the batch size.
    """
    if not texts:
        return
    vecs = encoder.encode(texts)
    for item_id, vec in zip(ids, vecs):
        session.add_item(vector=vec, id=item_id, normalize=True)


with collection.insert_session() as session:
    texts = []
    ids = []
    # The enumerate() position is used as the vector id, so an id can later be
    # used to index straight back into `title_text_dataset`.
    for row_id, row in tqdm(enumerate(title_text_dataset), total=title_text_dataset.shape[0], unit="vectors"):
        texts.append(row["text"])
        ids.append(row_id)

        if len(texts) == BATCH_SIZE:
            _flush_batch(session, ids, texts)
            texts = []
            ids = []

    # Flush the final partial batch. Without this, every row past the last
    # full multiple of BATCH_SIZE is silently left out of the collection.
    _flush_batch(session, ids, texts)

  0%|          | 192818/41488110 [04:32<2:13:19, 5162.52vectors/s] 
2024-06-18 12:05:04 - LynseDB - INFO - Creating collection test_collection with: 
//    dim=4, collection='test_collection', 
//    chunk_size=100000, distance='cosine', 
//    dtypes='float32', use_cache=True, 
//    scaler_bits=8, n_threads=10, 
//    warm_up=False, drop_if_exists=True, 
//    description=None, 

2024-06-18 12:05:04 - LynseDB - INFO - Creating collection test_collection with: 
//    dim=4, collection='test_collection', 
//    chunk_size=100000, distance='cosine', 
//    dtypes='float32', use_cache=True, 
//    scaler_bits=8, n_threads=10, 
//    warm_up=False, drop_if_exists=True, 
//    description=None, 

2024-06-18 12:05:04 - LynseDB - INFO - Saving data...
2024-06-18 12:05:04 - LynseDB - INFO - Writing chunk to storage...
2024-06-18 12:05:04 - LynseDB - INFO - Writing chunk to storage done.
2024-06-18 12:05:06 - LynseDB - INFO - Creating collection test_collection with: 
//    dim=4, collection='

KeyboardInterrupt: 

In [12]:
# (number of stored vectors, embedding dimension)
collection.shape

(200000, 384)

## Build index

In [10]:
# collection.remove_index()
# collection.build_index("IVF-FLAT", n_clusters=title_text_dataset.shape[0] // 100)

## Search

In [17]:
import pandas as pd

def search(s: str, collection=collection, k: int = 10):
    """Return the dataset rows whose embeddings best match a query string.

    Prints the collection's most recent search report as a side effect.

    Parameters
    ----------
    s : str
        Query text; it is embedded with the notebook-level ``encoder``.
    collection :
        Collection to search. The default is the ``collection`` object that
        existed when this cell was executed.
    k : int, default 10
        Number of results requested from ``collection.search``.

    Returns
    -------
    pandas.DataFrame
        ``title`` and ``text`` columns, one row per returned id, in the order
        the ids were returned by ``collection.search``.
    """
    query_vec = encoder.encode(s)
    # The first element of the search result holds the matched ids.
    hit_ids = collection.search(query_vec, k=k)[0]

    # Ids were assigned from enumerate() at insert time, so each id is also the
    # row position in `title_text_dataset`.
    rows = [title_text_dataset[int(hit_id)] for hit_id in hit_ids]

    print(collection.search_report_)
    return pd.DataFrame(rows, columns=['title', 'text'])

In [24]:
# Example query: prints the search report, then displays the matching rows.
search("Its sales for the year ending 30 June 2005")


* - MOST RECENT SEARCH REPORT -
| - Collection Shape: (41400000, 384)
| - Search Time: 0.00001 s
| - Search Distance: cosine
| - Search K: 10
| - Top 10 Results ID: [19187417 35822415 13622543 27174601 25960910 14387415 12289258  5661016
 15936063 36784954]
| - Top 10 Results Similarity: [0.581252 0.578115 0.571216 0.56473  0.561913 0.558727 0.555336 0.553917
 0.550491 0.550025]



Unnamed: 0,title,text
0,Hamilton (musical),In March 2023 it was announced that the produc...
1,The Man in the White Suit (play),The production was due to run until 11 January...
2,Perthshire Amber,In 2005 it began as a two-day event. It was so...
3,BBS: The Documentary,"Production work began in July 2001, and comple..."
4,Meadham Kirchhoff,"Their final collection was Spring/Summer 2015,..."
5,Fable II,"According to NPD, the game attained the status..."
6,Finding Neverland (film),It had a limited release in the United States ...
7,Chevrolet Corvette (C7),Production for the 2017 model year began on Ju...
8,Chinguetti oil field,It was later sold to PETRONAS group. Productio...
9,Kia Ceed,"First shown in June 2015, a facelifted model o..."
