! pip install -q -U pypdf faiss-cpu
! pip install -q -U InstructorEmbedding
! pip install huggingface_hub -q
! pip install gradio -q
! pip install langchain==0.1.2 
! pip install sentence_transformers==2.2.2

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import glob
import textwrap
import time
import langchain
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
import torch
from langchain.vectorstores import FAISS

In [4]:
class CFG:
    """Central configuration for the PDF question-answering pipeline.

    All values are plain class attributes read as ``CFG.<name>`` by the
    other cells; the class is never instantiated in this notebook.
    """

    # --- Text-generation model and the sampling parameters sent with it ---
    model_name = 'microsoft/Phi-3-mini-4k-instruct'
    temperature = 0.5
    top_p = 0.95
    repetition_penalty = 1.15
    do_sample = True
    max_new_tokens = 400
    num_return_sequences = 1

    # --- Chunking: size and overlap given to the text splitter ---
    split_chunk_size = 800
    split_overlap = 0

    # --- Embedding model repository name ---
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # --- Retrieval: number of chunks requested from the vector store ---
    k = 3

    # --- Input PDFs (absolute, machine-specific paths) ---
    PDFs_paths = [
        r'C:\Users\hgchg\Desktop\Visual Studio\LLM\Hammett JACS 1937.pdf',
        r'C:\Users\hgchg\Desktop\Visual Studio\LLM\Hansch CR 1991.pdf',
        r'C:\Users\hgchg\Desktop\Visual Studio\LLM\Hecht JACS 2017.pdf',
        r'C:\Users\hgchg\Desktop\Visual Studio\LLM\Hammett equation - Wikipedia.pdf',
        r'C:\Users\hgchg\Desktop\Visual Studio\LLM\workflow_0523.pdf',
    ]

    # --- Directory from which the FAISS index is loaded ---
    Embeddings_path = './faiss_index_py'


In [5]:
import os
from getpass import getpass

from langchain.llms import HuggingFaceHub

# SECURITY: never commit an API token to source. The token is taken from the
# HUGGINGFACEHUB_API_TOKEN environment variable; if it is missing (or empty),
# ask for it interactively without echoing it. Any token that was previously
# pasted into this notebook should be revoked in the Hugging Face settings.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

# Hosted LLM client; the generation parameters all come from CFG.
llm = HuggingFaceHub(
    repo_id=CFG.model_name,
    model_kwargs={
        "max_new_tokens": CFG.max_new_tokens,
        "temperature": CFG.temperature,
        "top_p": CFG.top_p,
        "repetition_penalty": CFG.repetition_penalty,
        "do_sample": CFG.do_sample,
        "num_return_sequences": CFG.num_return_sequences,
    },
)

In [6]:
# Read every configured PDF with PyPDFLoader and pool everything the loaders
# return into a single flat list.
documents = [
    doc
    for pdf_path in CFG.PDFs_paths
    for doc in PyPDFLoader(pdf_path).load()
]

# Split the loaded documents into chunks using the size/overlap from CFG.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CFG.split_chunk_size,
    chunk_overlap=CFG.split_overlap,
)
texts = text_splitter.split_documents(documents)

In [7]:
# Embedding model used to vectorise the chunks (runs on CPU).
# NOTE(review): CFG.embeddings_model_repo names a sentence-transformers model
# while the wrapper is the Instructor one -- confirm this pairing is intended.
embeddings = HuggingFaceInstructEmbeddings(
    model_name=CFG.embeddings_model_repo,
    model_kwargs={"device": "cpu"},
)

# Build the FAISS index from the split chunks.
vectordb = FAISS.from_documents(
    documents=texts,
    embedding=embeddings,
)

# Persist to the configured directory rather than a repeated string literal,
# so saving and the later FAISS.load_local(CFG.Embeddings_path, ...) can never
# drift apart.
vectordb.save_local(CFG.Embeddings_path)

load INSTRUCTOR_Transformer
max_seq_length  512


In [8]:
# Re-create the embedding model (CPU) so this cell can run on its own, then
# load the FAISS index from the directory configured in CFG.
embeddings = HuggingFaceInstructEmbeddings(
    model_name=CFG.embeddings_model_repo,
    model_kwargs={"device": "cpu"},
)
vectordb = FAISS.load_local(folder_path=CFG.Embeddings_path, embeddings=embeddings)

load INSTRUCTOR_Transformer
max_seq_length  512


In [9]:
# Raw prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders later declared as input variables of the PromptTemplate.
# The closing "[/INST]" marker is also what extract_text_after_inst() searches
# for, so it must not be changed without updating that function.
# NOTE(review): the <s>[INST] ... [/INST] wrapper looks like a Llama/Mistral
# instruction format, while CFG.model_name is a Phi-3 model -- confirm the
# template matches the model's expected chat format.
prompt_template = """
<s>[INST] 
Don't try to make up an answer. If you don't know, just say that you don't know.
Answer in the same language the question was asked.
Provide a concise and accurate answer based on the provided context.
Make sure to reference specific chemical theories, formulas, or data from the context.
Use technical and professional language suitable for a chemistry research paper.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:[/INST]"""


In [11]:


# Prompt object with the two placeholders used in prompt_template.
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["question", "context"],
)

# Plain prompt+LLM chain; qa_chain below does not use it, it is kept for
# ad-hoc experiments elsewhere in the notebook.
llm_chain = LLMChain(prompt=PROMPT, llm=llm)

# `search_type` is a parameter of as_retriever() itself. `search_kwargs` is
# forwarded to the vector-store search call and should only carry search
# parameters such as `k`, so the two are kept separate here.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": CFG.k},
)

# "stuff" chain: the retrieved chunks are inserted into PROMPT's {context}.
# Source documents are returned so the answer can cite them.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False,
)

In [12]:
def wrap_text_preserve_newlines(text, width=700):
    """Wrap *text* to *width* columns without merging its existing lines.

    Each line of the input (split on ``'\\n'``) is wrapped independently,
    and the wrapped pieces are joined back with ``'\\n'``, so the original
    line breaks, including empty lines, survive.
    """
    wrapper = textwrap.TextWrapper(width=width)
    return '\n'.join(wrapper.fill(segment) for segment in text.split('\n'))


def process_llm_response(llm_response):
    """Format a QA-chain response as answer text followed by its sources.

    Args:
        llm_response: mapping with a ``'result'`` string and a
            ``'source_documents'`` iterable whose items expose
            ``metadata['source']`` (file path) and ``metadata['page']``.

    Returns:
        The wrapped answer, a blank line, a ``Sources:`` header and one
        ``"<file name without extension> - page: <page>"`` line per source.
    """
    # ntpath splits on both "\\" and "/" regardless of the host OS, so the
    # Windows-style paths in CFG.PDFs_paths are reduced to a bare file name
    # (splitting on "/" alone left the whole directory prefix in place).
    import ntpath

    ans = wrap_text_preserve_newlines(llm_response['result'])

    source_lines = []
    for source in llm_response['source_documents']:
        file_name = ntpath.basename(source.metadata['source'])
        # splitext drops the real extension instead of assuming 4 characters.
        title = ntpath.splitext(file_name)[0]
        source_lines.append(f"{title} - page: {source.metadata['page']}")
    sources_used = ' \n'.join(source_lines)

    return ans + '\n\nSources: \n' + sources_used

In [13]:
def llm_ans(query):
    """Run *query* through the retrieval QA chain and time the call.

    Args:
        query: the question text.

    Returns:
        The formatted answer from ``process_llm_response`` (stripped),
        followed by a ``Time elapsed: <n> s`` line with whole seconds.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    # Calling the chain object directly is deprecated in LangChain 0.1;
    # invoke() is the supported entry point. "query" is RetrievalQA's input key.
    llm_response = qa_chain.invoke({"query": query})
    ans = process_llm_response(llm_response)
    time_elapsed = round(time.perf_counter() - start)

    return f'{ans.strip()}\n\nTime elapsed: {time_elapsed} s'
def extract_text_after_inst(input_string):
    """Return the stripped text following the first ``[/INST]`` marker.

    An empty string is returned when the marker does not occur in
    *input_string*.
    """
    _, marker, tail = input_string.partition("[/INST]")
    if not marker:
        return ""
    return tail.strip()

In [14]:
import gradio as gr
import pandas as pd

# Read the CSV file of questions (its 'question' column is consumed by the
# batch loop further down in this cell).
data = pd.read_csv("Question.csv")

def predict(message, history=None):
    """Answer *message* and return only the text after the ``[/INST]`` marker.

    Args:
        message: the question text.
        history: chat history supplied by ``gr.ChatInterface``, which calls
            its ``fn`` as ``fn(message, history)``. It is not used here; the
            default lets batch code keep calling ``predict(question)``.

    Returns:
        The answer text, or an empty string when the response contains no
        ``[/INST]`` marker.
    """
    # Relies on llm_ans() to produce the full response for the question.
    output = str(llm_ans(message))
    output = extract_text_after_inst(output)
    return output

# Custom stylesheet passed to gr.Blocks below.
# NOTE(review): `height: 500vh` is five viewport heights -- confirm this is
# intended rather than 100vh.
CSS ="""
.contain { display: flex; flex-direction: column; }
.gradio-container { height: 500vh !important; }
#component-0 { height: 100%; }
#chatbot { flex-grow: 1; overflow: auto;}
"""

# Build the UI: one row containing one column containing a chat interface
# whose replies come from predict().
# NOTE(review): the title mentions Python Q&A while the prompt template is
# written for chemistry questions -- confirm the intended title.
with gr.Blocks(css=CSS) as demo:
    with gr.Row(): 
        with gr.Column():
            chat_interface = gr.ChatInterface(
                fn=predict,
                title='Open-Source LLM for Python Question Answering'
            )

# Answer every question in turn and store each answer in the 'answer' column
# of the row it came from.
for row_label, question_text in data['question'].items():
    data.at[row_label, 'answer'] = predict(question_text)

# Write the results to a new CSV file, encoded as UTF-8 and without the index.
data.to_csv("your_csv_file_with_answers.csv", index=False, encoding="utf-8")

# Start the Gradio app only after the batch run has finished.
demo.launch()

  warn_deprecated(


Running on local URL:  http://127.0.0.1:7862

Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB

To create a public link, set `share=True` in `launch()`.




In [11]:
import gradio as gr

def predict(message, history):
    """Chat callback: answer *message* and return the text after ``[/INST]``.

    *history* is accepted because the chat interface passes it, but it is
    not used when producing the answer.
    """
    return extract_text_after_inst(str(llm_ans(message)))

# Custom stylesheet passed to gr.Blocks below.
# NOTE(review): `height: 500vh` is five viewport heights -- confirm this is
# intended rather than 100vh.
CSS ="""
.contain { display: flex; flex-direction: column; }
.gradio-container { height: 500vh !important; }
#component-0 { height: 100%; }
#chatbot { flex-grow: 1; overflow: auto;}
"""
            
# Build the UI (row -> column -> chat interface backed by predict()) and
# start the Gradio server.
with gr.Blocks(css=CSS) as demo:
    with gr.Row(): 
        with gr.Column():
            chat_interface = gr.ChatInterface(
                fn=predict,
                title='Open-Source LLM for Python Question Answering'
            )

demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




  warn_deprecated(
Traceback (most recent call last):
  File "c:\Users\hgchg\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
  File "c:\Users\hgchg\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py", line 537, in _make_request
    response = conn.getresponse()
  File "c:\Users\hgchg\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py", line 461, in getresponse
    httplib_response = super().getresponse()
  File "c:\Users\hgchg\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 1374, in getresponse
    response.begin()
  File "c:\Users\hgchg\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 318, in begin
    version, status, reason = self._read_status()
  File "c:\Users\hgchg\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 287, in _read_status
    raise RemoteDisconnected("Remote end c