# Setup

In [6]:
import os
import json
import lancedb
# import dsutils
import numpy as np
import pyarrow as pa
from glob import glob
from tqdm import tqdm
from docarray.typing import NdArray
from typing import Optional
from docarray import BaseDoc, DocList
from FlagEmbedding import BGEM3FlagModel
from docarray.index import HnswDocumentIndex
from pymongo import MongoClient
from bson import ObjectId
from datetime import datetime

In [2]:
class Document(BaseDoc):
    """Schema for one analyst-report record, with an optional embedding vector.

    NOTE(review): the Mongo records printed in this notebook carry a 'date'
    key (not 'timestamp') and a float 'goal_price' (e.g. 70000.0), so the
    mapping from those records onto these fields should be confirmed.
    """
    # NOTE(review): pydantic-style models commonly treat leading-underscore
    # names as private rather than as regular fields — verify that `_id` is
    # actually validated and exported. TODO confirm.
    _id: str
    timestamp: int  # presumably Unix epoch seconds — TODO confirm
    company: str
    code: str
    summary: Optional[str] = None  # optional; defaults to None when absent
    opinion: str
    provider: str
    goal_price: int
    closing_price: int
    embedding: Optional[NdArray] = None  # filled in after the text is encoded



## MongoDB

In [25]:
# Connection settings are read from the environment so that no secret is
# stored in the notebook. Host and user fall back to the previous defaults;
# the password has no default on purpose.
hostname = os.environ.get('MONGO_HOST', 'mongo.stockhelper-mongodb.store')
username = os.environ.get('MONGO_USERNAME', 'root')
password = os.environ.get('MONGO_PASSWORD')
if not password:
    raise RuntimeError(
        'Set the MONGO_PASSWORD environment variable before running this cell.'
    )

# One client is enough: the earlier argument-less MongoClient() was created
# and immediately discarded.
client = MongoClient(hostname, username=username, password=password)
db = client['financial']

# Cursor over every document in the `report` collection. It is consumed
# (turned into a list) by a later cell.
report = db.report.find({})

In [35]:
# Look up the reports of a single company, ordered by their 'date' field.
documents = db.report.find({'company': '롯데정밀화학'}).sort('date')

In [36]:
# Drain the query cursor into a plain list so it can be displayed and reused.
documents = list(documents)

In [37]:
# Display the matching report documents as the cell output.
documents

[{'_id': ObjectId('6667313c109862e8bb9004a0'),
  'date': '2024/06/04',
  'company': '롯데정밀화학',
  'code': 'A004000',
  'summary': '실적은 바닥. 그린소재, 수소, 암모니아 기대\n최근 11차전력수급기본계획 발표. 암모니아와 수소 비중 확대. 동사는 암모니아 트레이딩 부문 강점\nScope 3 감축계획에 따라, 운송부분에서 탈탄소를 위한 암모니아 벙커링 역시 중요해 질 전망',
  'opinion': ' BUY',
  'provider': '현대차증권',
  'goal_price': 70000.0,
  'closing_price': 47800}]

In [6]:
def convert_objectid(doc):
    """Normalise a Mongo report document in place and return it.

    * ``_id`` (when present) is replaced by its string form.
    * ``date`` (when present as a ``'YYYY/MM/DD'`` string) is replaced by the
      integer timestamp of that day, computed with ``datetime.timestamp()``
      on a naive datetime.

    The conversion is idempotent: a ``date`` that is not a string (for
    example an integer left by a previous call) is kept as it is, so the
    conversion cell can be re-run without raising ``TypeError``.

    Args:
        doc: Mapping holding the raw document; it is modified in place.

    Returns:
        The same ``doc`` object, after conversion.

    Raises:
        ValueError: If ``date`` is a string that does not match ``%Y/%m/%d``.
    """
    if "_id" in doc:
        doc["_id"] = str(doc["_id"])

    # Only parse raw string dates; anything else has already been converted
    # (or is not parseable) and is passed through unchanged.
    if 'date' in doc and isinstance(doc['date'], str):
        doc['date'] = int(datetime.strptime(doc['date'], '%Y/%m/%d').timestamp())

    return doc

In [7]:
# Materialise the cursor first, then normalise every document.
report = list(report)
report = list(map(convert_objectid, report))

In [9]:
# Sanity check: first converted record, then the number of records.
print(report[0], len(report), sep='\n')

{'_id': '6667313c109862e8bb900497', 'date': 1717459200, 'company': 'CJ대한통운', 'code': 'A000120', 'summary': '분명한 성장 방향성\n화물 소형화 바람 본격화\n실적 지속 우상향 전망', 'opinion': ' BUY', 'provider': 'LS증권', 'goal_price': 159000.0, 'closing_price': 101800}
309


## LanceDB

In [10]:
# Location of the on-disk LanceDB store. Override with the LANCEDB_URI
# environment variable when running outside the original workspace; the
# default is the path this notebook was written against.
uri = os.environ.get("LANCEDB_URI", "/workspace/008_PseudoLab/server/vectorstore")

# NOTE(review): this rebinds `db`, which the MongoDB cells above use for the
# Mongo database handle — those cells cannot be re-run after this one without
# reconnecting to Mongo first.
db = lancedb.connect(uri)
# async_db = await lancedb.connect_async(uri)

In [11]:
# Arrow schema of the `report` table: scalar metadata columns followed by a
# fixed-size (1024) float32 embedding column.
schema = pa.schema(
    [
        pa.field(column, dtype)
        for column, dtype in (
            ('id', pa.string()),
            ('timestamp', pa.int64()),
            ('company', pa.string()),
            ('code', pa.string()),
            ('summary', pa.string()),
            ('opinion', pa.string()),
            ('provider', pa.string()),
            ('goal_price', pa.int64()),
            ('closing_price', pa.int64()),
            ("embedding", pa.list_(pa.float32(), list_size=1024)),
        )
    ]
)

# mode="overwrite" is passed, so the table is created with the "overwrite" mode
# rather than the default one.
table = db.create_table("report", schema=schema, mode="overwrite")

[2024-06-12T05:23:01Z WARN  lance::dataset] No existing dataset at /workspace/008_PseudoLab/server/vectorstore/report.lance, it will be created


In [12]:
# Re-open the `report` table by name and rebind `table` to that handle.
table = db.open_table('report')
# async_tbl = async_db.open_table('news')

In [13]:
# Load the BAAI/bge-m3 embedding model on the GPU with fp16 disabled.
embedding_function = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False, device='cuda')

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 335544.32it/s]
  return self.fget.__get__(instance, owner)()


In [11]:
from tqdm.auto import tqdm

# NOTE(review): `news` is not defined in any cell visible in this notebook, and
# its records are read with 'timestamp' / 'title' / 'content' keys, whereas the
# `Document` model and the `report` table describe analyst reports. Confirm
# which dataset this cell is meant to ingest before re-running it.
docs = []

for data in tqdm(news):
    # Text that gets embedded: time, title and body of the record.
    # (The 'content' key previously contained stray characters, which made the
    # lookup fail with KeyError.)
    text = (
        'timestamp: ' + str(datetime.fromtimestamp(data['timestamp'])) + '\n'
        + 'title: ' + data['title'] + '\n'
        + 'content: ' + data['content']
    )
    embedding = embedding_function.encode(text, return_dense=True, return_sparse=False)
    # Only the dense vector is stored next to the record.
    data['embedding'] = embedding['dense_vecs']
    docs.append(Document(**data))

# Wrap the records in a DocList of Document, then convert each one to a plain
# dict before adding them to the table.
docs = DocList[Document](docs)
docs = [dict(d) for d in docs]
table.add(docs)

100%|██████████| 13236/13236 [06:37<00:00, 33.33it/s]


In [None]:
# async_db = await lancedb.connect_async(uri)

## LanceDB Test

In [None]:
# Open a table and show how many rows it holds.
# NOTE(review): this opens 'news', whereas the table created earlier in this
# notebook is named 'report' — confirm which table is intended.
table = db.open_table('news')
table.count_rows()
# async_tbl.count_rows()

In [None]:
# Peek at the first rows of the currently opened table.
# table.search("2024-06-10").limit(10).to_list()
table.head()

In [None]:
# Load the same BAAI/bge-m3 model on the CPU with fp16 disabled.
# This rebinds `embedding_function`, replacing the instance created earlier.
embedding_function = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False, device='cpu')

In [None]:
from datetime import datetime

# Encode the question; only the dense vector is used for the search.
query = '삼성전자의 주식과 관련된 뉴스를 알려줘'
query = embedding_function.encode(query)['dense_vecs']

# Time window: both bounds are parsed the same way from 'YYYY-MM-DD' strings.
start_time, end_time = '2024-06-08', '2024-06-10'
start_stamp, end_stamp = (
    datetime.strptime(day, "%Y-%m-%d").timestamp() for day in (start_time, end_time)
)
print(end_stamp)
# Push the upper bound one hour (3600 s) later and show the new value.
end_stamp += 3600
print(end_stamp)

# Vector search restricted to the window; the filter is applied before the
# nearest-neighbour search (prefilter=True) and the top 5 hits are returned.
docs = (
    table.search(query)
    .where(
        f"(timestamp >= {int(start_stamp)}) AND (timestamp < {int(end_stamp)})",
        prefilter=True,
    )
    .limit(5)
    .to_list()
)
# docs = table.search(query).limit(5).to_list()
# docs = async_tbl.search(query).limit(8).to_list()

In [None]:
# Scratch check: timestamp of a different end date.
# Note that this rebinds `end_time` and `end_stamp` used by the search cell.
end_time = '2024-06-11'
end_stamp = datetime.strptime(end_time, "%Y-%m-%d").timestamp()
print(end_stamp)

In [None]:
# (formatted time, title) for every hit, in search-result order.
[
    (str(datetime.fromtimestamp(hit['timestamp'])), hit['title'])
    for hit in docs
]

In [None]:
# Same (formatted time, title) pairs, sorted ascending by the time string.
sorted(
    ((str(datetime.fromtimestamp(hit['timestamp'])), hit['title']) for hit in docs),
    key=lambda pair: pair[0],
)

In [None]:
query = '최근 삼성전자의 신제품에 대해 알려줘'
query = embedding_function.encode(query)['dense_vecs']

# `date` was used in the filter without ever being assigned, so this cell
# raised NameError on a fresh kernel. It is now defined here.
# NOTE(review): the cutoff day below mirrors the start of the window used in
# the previous search and is a placeholder — set it to the intended
# "recent" boundary.
date = datetime.strptime('2024-06-08', "%Y-%m-%d").timestamp()

# Same builder order as the earlier query: filter first, then limit.
docs = (
    table.search(query)
    .where(f"timestamp > {int(date)}", prefilter=True)
    .limit(5)
    .to_list()
)

In [None]:
# (formatted time, title) pairs of the latest search, sorted ascending by the
# time string.
sorted(
    ((str(datetime.fromtimestamp(hit['timestamp'])), hit['title']) for hit in docs),
    key=lambda pair: pair[0],
)

In [None]:
# Scratch check: slicing with a stop index far beyond the string's length.
a = "asidhfioas"

# The stop index (12034) exceeds len(a); the slice is the cell's displayed value.
a[:12034]