In [7]:
from langchain.llms.ollama import Ollama
from langchain.chat_models import ChatOllama
from langchain.callbacks import StreamingStdOutCallbackHandler

from langchain.document_loaders import TextLoader
from langchain.document_loaders import UnstructuredFileLoader #txt,pef,docx,jpg 등 다양한거 다 들고올 수 있음
from langchain.text_splitter import RecursiveCharacterTextSplitter # 문서 분할용

from langchain.embeddings import OllamaEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# On-disk byte store under ./.cache/; passed to the embedding cache further down.
cache_dir = LocalFileStore(root_path="./.cache/")

# RAG (Retrieval-Augmented Generation), Document
# Chat model served by Ollama. Other local models tried here and left disabled:
# "gemma:latest", "llama2:latest".
stdout_streamer = StreamingStdOutCallbackHandler()
chat = ChatOllama(
    model="mistral:latest",
    temperature=0.1,
    streaming=True,
    callbacks=[stdout_streamer],
)

## Data Loaders and Splitters
# chunk_overlap: a small piece of the previous chunk is repeated at the start of
# the next one. An explicit separators="\n" was tried and left disabled.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=2600)

# English sample document.
loader = UnstructuredFileLoader("./files/mid_text_en.txt")
# Usage examples, left disabled:
# loader.load()
# print(loader.load_and_split(text_splitter=splitter))

## Tiktoken
# Token-based variant of the splitter: built via from_tiktoken_encoder, so
# chunk_size / chunk_overlap are counted in tiktoken tokens instead of characters.
# Author's note: because the model has a length limit, splitting by plain text
# length was judged preferable to splitting by tiktoken units.
# NOTE(review): tiktoken is OpenAI's tokenizer, so the counts are presumably only
# approximate for the Ollama-served models used in this notebook -- confirm.
splitter2 = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    # `separators` is declared as a list of strings. The bare string "\n" only
    # worked because iterating a one-character string yields that same single
    # separator; pass the list explicitly.
    separators=["\n"],
    chunk_size=2600,
    chunk_overlap=100,
)

## Vectors
# Ollama-backed embedding model; no model name is given, so the library default is used.
embedder = OllamaEmbeddings()

# Embed a single query string.
vector1 = embedder.embed_query("Hi")
# print(len(vector1))

# Embed a batch of texts in one call.
sample_texts = ["hi", "how", "are", "you longer sentences because"]
vector2 = embedder.embed_documents(sample_texts)
# print(len(vector2), len(vector2[0]))  # author observed 4 vectors with 4096 dimensions

## Embeddings should not be recomputed on every run of this code; cache them
## and keep the vectors in a vector store.
# Vector store options: Chroma (used here, local), FAISS, Pinecone (cloud), and others.
loader2 = UnstructuredFileLoader("./files/mid_text_ko.txt")
docs = loader2.load_and_split(text_splitter=splitter2)

# Namespace the cache by embedding model name. Without a namespace, the same
# text embedded by a different model would map to the same cache key and the
# stale vector would be returned. Entries written earlier under the empty
# namespace are simply not reused; they get re-embedded once.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embedder,
    cache_dir,
    namespace=embedder.model,
)
# Uncached alternative, left disabled:
# vectorstore = Chroma.from_documents(docs, embedder)
vectorstore = Chroma.from_documents(docs, cached_embeddings)

In [None]:
# Search the vector store for the chunks most similar to the question.
results = vectorstore.similarity_search(query="Tell me the type of weather.")
print(len(results))
# Bare expression so the notebook displays the matched documents.
results

In [None]:
from langchain.chains import RetrievalQA

# RetrievalQA is a legacy chain; it is kept here as the simplest way to wire
# the retriever and the chat model together.
retriever = vectorstore.as_retriever()
chain = RetrievalQA.from_chain_type(
    llm=chat,
    retriever=retriever,
    chain_type="stuff",
)
# Last expression of the cell, so the notebook shows the returned answer.
chain.run("Tell me the type of weather.")