In [5]:
# Install or upgrade (-U) the third-party packages listed below. No versions are pinned,
# so each run may pull newer releases than the ones this notebook was last executed with.
%pip install -U gpt4all chromadb langchainhub sentence-transformers faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Reinstall llama-cpp-python with the LLAMA_METAL CMake option switched on (CMAKE_ARGS) and
# FORCE_CMAKE=1 set for the build; --force-reinstall and --no-cache-dir make pip redo the
# install without reusing its cache.
!CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.20.tar.gz (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting typing-extensions>=4.5.0 (from llama-cpp-python)
  Obtaining dependency information for typing-extensions>=4.5.0 from https://files.pythonhosted.org/packages/24/21/7d397a4b7934ff4028987914ac1044d3b7d52712f30e2ac7a2ae5bc86dd0/typing_extensions-4.8.0-py3-none-any.whl.metadata
  Downloading typing_extensions-4.8.0-py3-none-any.whl.metadata (3.0 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python)
  Obtaining dependency information for numpy>=1.20.0 from https://files.pythonhosted.org/packages/2e/54/218ce51bb571a70975f223671b2a86aa951e8

In [1]:
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import CharacterTextSplitter

# Read the knowledge-base CSV into LangChain documents.
loader = CSVLoader("./cleaned_data_knowledge.csv")
documents = loader.load()

# Split the loaded documents into chunks: the text is cut on newlines and
# regrouped with chunk_size=1000 and no overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Report how many chunks the splitter produced.
print(len(texts))

286


In [2]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Load the embedding model used to build the index.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")

# Embed the text of every chunk in `texts` and build a FAISS index from them.
index = FAISS.from_documents(texts, embeddings)

# Save the index locally under "faiss_db" ...
index.save_local("faiss_db")
# ... and load it back from "faiss_db"; `docsearch` is the store read from disk.
docsearch = FAISS.load_local("faiss_db", embeddings)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

# Value forwarded to LlamaCpp's n_gpu_layers argument below.
n_gpu_layers = 1

# Keep the manager instance under its own name. Rebinding `CallbackManager`
# itself would shadow the imported class, and this cell would then fail with
# "'CallbackManager' object is not callable" the next time it is run.
# NOTE(review): the manager is not handed to LlamaCpp below, so the streaming
# stdout handler has no effect yet; pass `callback_manager=callback_manager`
# to the constructor if token streaming is wanted.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = LlamaCpp(
    # model_path: location of the model file downloaded to the local machine
    model_path="/Volumes/Jinho/AIDoc_test_models/llama-2-7b-pubmed-qa-211k.gguf",
    temperature=0.0,
    top_p=1,
    # NOTE(review): max_tokens (8192) is larger than n_ctx (4096) — confirm this is intended.
    max_tokens=8192,
    verbose=True,
    # n_ctx: maximum context length the model can process at once
    n_ctx=4096,
    n_gpu_layers=n_gpu_layers,
)

llama_model_loader: loaded meta data with 15 key-value pairs and 291 tensors from /Volumes/Jinho/AIDoc_test_models/llama-2-7b-pubmed-qa-211k.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q8_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q8_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q8_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q8_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q8_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q8_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q8_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q8_0     [ 1

In [9]:

from langchain import PromptTemplate, LLMChain


# Prompt skeleton; the `{question}` placeholder is the chain's single input.
template = """
### Instruction:
Use the Input below to create an instruction, which could have been used to generate the input using an LLM.

### Input:
{question}

### Response:
"""

# Wrap the skeleton in a PromptTemplate and bind it to the local model.
prompt = PromptTemplate(input_variables=["question"], template=template)
llm_chain = LLMChain(llm=llm, prompt=prompt)



In [10]:
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers import ContextualCompressionRetriever
from langchain.chains import RetrievalQA

# Embeddings filter configured with a similarity threshold of 0.70; it uses
# the same `embeddings` model to embed the text it compares.
embeddings_filter = EmbeddingsFilter(similarity_threshold=0.70, embeddings=embeddings)

# Compression retriever: the FAISS store's retriever looks up text similar to
# the search query, and `embeddings_filter` is set as the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=docsearch.as_retriever(),
    base_compressor=embeddings_filter,
)

# Create the question-answering object by calling the RetrievalQA class method
# from_chain_type with the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    retriever=compression_retriever,
    chain_type="stuff",
    llm=llm,
)

ggml_metal_free: deallocating


In [None]:
# Scratch cell for querying `llm_chain` directly (no retriever involved). The string is
# currently just a newline — presumably a placeholder to paste the input text into.
# Note: this rebinds `prompt`, which an earlier cell assigned the PromptTemplate object.
prompt = """
"""

# Run the text through the chain and print the model's reply.
response = llm_chain.run(prompt)
print(response)

In [11]:
# One hand-written clinical vignette used to try out the retrieval QA chain; the final
# sentence instructs the model to answer with a single diagnosis only.
prompt = """A 16-year-old girl presents to the emergency department with a 3-day history of abdominal pain. She describes the onset as being initially at the umbilical region and then gradually migrating to the right lower quadrant (RLQ). The pain has escalated in severity, and she currently rates it as 7 on the Numeric Rating Scale (NRS), noting that it has become severe enough to hinder her movements. She denies consuming any unusual foods recently. She also reports feelings of nausea. On examination, there is tenderness elicited upon palpation of the RLQ. What is the most likely diagnosis? Answer only one most likely diagnosis and do not say anything else."""

# Send the vignette through `qa` (the RetrievalQA object built from the compression retriever).
response = qa.run(prompt)

print(response)

 The most likely diagnosis is appendicitis.



llama_print_timings:        load time =    8150.66 ms
llama_print_timings:      sample time =       2.30 ms /    11 runs   (    0.21 ms per token,  4782.61 tokens per second)
llama_print_timings: prompt eval time =   70227.31 ms /  2893 tokens (   24.27 ms per token,    41.19 tokens per second)
llama_print_timings:        eval time =     593.01 ms /    10 runs   (   59.30 ms per token,    16.86 tokens per second)
llama_print_timings:       total time =   71337.10 ms


In [22]:
import pandas as pd

# Load the evaluation set; the loop below reads its 'question' and 'answer' columns.
testData = pd.read_csv('./20qa.csv', encoding='utf-8')

# Column layout of the results file.
columns = ['question', 'answer', 'response']

# Ask the retrieval QA chain every question and collect one record per row.
# The records go into a plain list and become a DataFrame in a single step:
# extending a frame with pd.concat inside the loop re-copies every earlier
# row on each iteration and warns about concatenating an empty frame on
# recent pandas versions.
records = []
for question, answer in zip(testData['question'], testData['answer']):
    records.append([question, answer, qa.run(question)])

# Passing `columns` keeps the three headers even when the input has no rows.
result = pd.DataFrame(records, columns=columns)

# Write the question / reference answer / model response table to disk.
result.to_csv("./7b_pubmed_rag.csv", index=False, encoding='utf-8')


Llama.generate: prefix-match hit

llama_print_timings:        load time =    8150.66 ms
llama_print_timings:      sample time =       1.99 ms /    11 runs   (    0.18 ms per token,  5522.09 tokens per second)
llama_print_timings: prompt eval time =   53576.94 ms /  2188 tokens (   24.49 ms per token,    40.84 tokens per second)
llama_print_timings:        eval time =     598.53 ms /    10 runs   (   59.85 ms per token,    16.71 tokens per second)
llama_print_timings:       total time =   54603.60 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =    8150.66 ms
llama_print_timings:      sample time =       2.11 ms /    12 runs   (    0.18 ms per token,  5687.20 tokens per second)
llama_print_timings: prompt eval time =   52236.89 ms /  2207 tokens (   23.67 ms per token,    42.25 tokens per second)
llama_print_timings:        eval time =     652.76 ms /    11 runs   (   59.34 ms per token,    16.85 tokens per second)
llama_print_timings:       total time =   53