# Setup

In [1]:
import os
import json
import lancedb
# import dsutils
import numpy as np
import pyarrow as pa
from glob import glob
from tqdm import tqdm
from docarray.typing import NdArray
from typing import Optional
from docarray import BaseDoc, DocList
from FlagEmbedding import BGEM3FlagModel
from docarray.index import HnswDocumentIndex
from pymongo import MongoClient
from bson import ObjectId
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
class Document(BaseDoc):
    """Record shape for the documents read from MongoDB and written to LanceDB.

    Instances are built in this notebook with ``Document(**record)`` from each
    MongoDB record and then converted back to plain dicts before being added
    to the LanceDB table.
    """
    # NOTE(review): pydantic-style models commonly do not treat underscore-prefixed
    # annotations as regular fields, and the LanceDB schema in this notebook names
    # its column `id`, not `_id` -- confirm the MongoDB `_id` actually survives
    # the round trip through this model.
    _id: str
    text: str
    source: str
    # presumably the page number within `source` and the page count of `source` -- TODO confirm
    page: int
    total_pages: int
    # Defaults to None when a record carries no vector; the LanceDB schema in this
    # notebook declares the matching column as a fixed-size list of 1024 float32.
    embedding: Optional[NdArray] = None

## MongoDB

In [2]:
# Connection settings may be overridden through environment variables so the
# notebook can target another deployment without editing (or committing) secrets.
# NOTE(review): the fallbacks are the values that were previously hard-coded here;
# rotate the password and drop its default once MONGO_PASSWORD is provisioned.
hostname = os.environ.get('MONGO_HOST', 'mongo.stockhelper-mongodb.store')
username = os.environ.get('MONGO_USERNAME', 'root')
password = os.environ.get('MONGO_PASSWORD', 'financial')

# Create exactly one client. The previous extra `MongoClient()` (default
# localhost target) was overwritten immediately and never used or closed.
client = MongoClient(hostname, username=username, password=password)
db = client['financial']

# `find({})` matches every document of the `basic` collection; the result is a
# cursor that a later cell materialises with `list(...)`.
basic = db.basic.find({})

In [3]:
def convert_objectid(doc):
    """Return a copy of ``doc`` with its ``"_id"`` value converted to ``str``.

    The input mapping is left untouched, so calling this repeatedly (for
    example when a notebook cell is re-run) never alters the caller's data.

    Args:
        doc: Mapping describing one record; it may or may not contain "_id".

    Returns:
        A new ``dict`` with the same keys in the same order. When "_id" is
        present its value is replaced by ``str(doc["_id"])``; otherwise the
        copy is identical to the input.
    """
    if "_id" not in doc:
        return dict(doc)
    # Unpacking first keeps "_id" at its original position in the key order.
    return {**doc, "_id": str(doc["_id"])}

In [4]:
# Drain `basic` into a plain list, passing every record through
# convert_objectid on the way so each "_id" ends up as a string.
basic = list(map(convert_objectid, basic))

## LanceDB

In [8]:
# Directory of the on-disk LanceDB store. The absolute path is specific to one
# machine, so it can be overridden with LANCEDB_URI; the default is unchanged.
uri = os.environ.get("LANCEDB_URI", "/workspace/008_PseudoLab/server/vectorstore")

# From here on `db` is the LanceDB connection (it previously held the MongoDB
# database handle).
db = lancedb.connect(uri)
# async_db = await lancedb.connect_async(uri)

In [9]:
# Column layout of the "basic" table, given as (name, type) pairs.
# The embedding column is a fixed-size list of 1024 float32 values.
schema = pa.schema([
    ("id", pa.string()),
    ("text", pa.string()),
    ("source", pa.string()),
    ("page", pa.int64()),
    ("total_pages", pa.int64()),
    ("embedding", pa.list_(pa.float32(), list_size=1024)),
])

# Create the table from the schema alone; mode="overwrite" is passed so the
# call goes through even when a table with this name is already present.
table = db.create_table(
    "basic",
    schema=schema,
    mode="overwrite",
)

[2024-06-14T09:36:25Z WARN  lance::dataset] No existing dataset at /workspace/008_PseudoLab/server/vectorstore/basic.lance, it will be created


In [10]:
# Obtain a handle to the "basic" table by name and bind it to `table`.
table = db.open_table('basic')
# async_tbl = async_db.open_table('news')

In [12]:
from tqdm.auto import tqdm

# Build one Document per MongoDB record (with a progress bar) and collect them
# in a typed DocList.
docs = DocList[Document]([Document(**record) for record in tqdm(basic)])

# Flatten each Document back into a plain dict and append the rows to the table.
docs = [dict(item) for item in docs]
table.add(docs)

100%|██████████| 94/94 [00:00<00:00, 31140.08it/s]


In [None]:
# async_db = await lancedb.connect_async(uri)

## LanceDB Test

In [None]:
# Rebind `table` to the "news" table; the search cells below query this handle.
# NOTE(review): no cell visible in this notebook creates a "news" table --
# presumably it is populated elsewhere; confirm it exists under `uri`.
table = db.open_table('news')
# Row count is the cell's displayed output.
table.count_rows()
# async_tbl.count_rows()

In [None]:
# Quick look at the table contents (displayed as the cell output).
# table.search("2024-06-10").limit(10).to_list()
table.head()

In [None]:
# Load the BAAI/bge-m3 model on CPU with fp16 disabled. Despite the name, this
# is a model object: later cells embed text via `embedding_function.encode(...)`
# and read the 'dense_vecs' entry of the result.
embedding_function = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False, device='cpu')

In [None]:
from datetime import datetime

# Embed the natural-language question; `query` is rebound from the text to its
# dense vector.
query = '삼성전자의 주식과 관련된 뉴스를 알려줘'
query = embedding_function.encode(query)['dense_vecs']

# Search window, given as calendar days and converted to epoch seconds.
start_time = '2024-06-08'
end_time = '2024-06-10'
start_stamp = datetime.strptime(start_time, "%Y-%m-%d").timestamp()
end_stamp = datetime.strptime(end_time, "%Y-%m-%d").timestamp()

# Show the upper bound before and after it is pushed out by 3600 seconds.
print(end_stamp)
end_stamp = end_stamp + 3600
print(end_stamp)

# Restrict rows to the window first (prefilter), then take the 5 best matches.
time_filter = f"(timestamp >= {int(start_stamp)}) AND (timestamp < {int(end_stamp)})"
docs = (
    table.search(query)
    .where(time_filter, prefilter=True)
    .limit(5)
    .to_list()
)
# Unfiltered alternatives kept for reference:
# docs = table.search(query).limit(5).to_list()
# docs = async_tbl.search(query).limit(8).to_list()

In [None]:
# Scratch check: print the epoch value of a different end date. This overwrites
# the `end_time` / `end_stamp` set by the search cell above.
end_time = '2024-06-11'
# strptime returns a naive datetime, so .timestamp() interprets it in the
# machine's local timezone.
end_stamp = datetime.strptime(end_time, "%Y-%m-%d").timestamp()
print(end_stamp)

In [None]:
# One (local-time string, title) pair per search hit, in result order.
[
    (str(datetime.fromtimestamp(hit['timestamp'])), hit['title'])
    for hit in docs
]

In [None]:
# (local-time string, title) pairs for the hits, ordered by the time string.
sorted(
    ((str(datetime.fromtimestamp(hit['timestamp'])), hit['title']) for hit in docs),
    key=lambda pair: pair[0],
)

In [None]:
# Embed the question; `query` is rebound from the text to its dense vector.
query = '최근 삼성전자의 신제품에 대해 알려줘'
query = embedding_function.encode(query)['dense_vecs']

# `date` was used in the filter below without ever being assigned in this
# notebook, so the cell raised NameError on a fresh kernel. Define the cutoff
# here as epoch seconds.
# NOTE(review): the cutoff day is an assumption (it matches the start of the
# window used by the earlier search cell) -- set it to the intended date.
date = datetime.strptime('2024-06-08', "%Y-%m-%d").timestamp()

# Keep only rows newer than the cutoff (prefilter), then take the 5 best matches.
docs = table.search(query).limit(5).where(f"timestamp > {int(date)}", prefilter=True).to_list()

In [None]:
# (local-time string, title) pairs for the latest hits, ordered by the time string.
sorted(
    ((str(datetime.fromtimestamp(hit['timestamp'])), hit['title']) for hit in docs),
    key=lambda pair: pair[0],
)

In [None]:
# Scratch check of slice semantics: a stop index beyond the end of the string
# is clamped, so the slice evaluates to the whole string instead of raising.
a = "asidhfioas"

a[:12034]