In [2]:
import secrets
import json
from pathlib import Path
import pprint
import pdb
from typing import Any

from etl import markdown, pdfs, shared, videos
from etl.shared import display_modal_image

import docstore
import vecstore
from utils import pretty_log

pp = pprint.PrettyPrinter(indent=2)

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline, TextStreamer
import json
import textwrap
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, RetrievalQA
from langchain.memory import ConversationBufferMemory
import langchain
import time


%load_ext autoreload
import ciptest_qanda as test

In [3]:
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline, TextStreamer
import json
import textwrap
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory

# Build everything the later cells rely on: the embedding engine, the Llama-2
# chat tokenizer/model, a streaming text-generation pipeline wrapped for
# LangChain (`llm`), and the vector index (`vector_index`).
embedding_engine = vecstore.get_embedding_engine(allowed_special="all")

# NOTE(review): token=True presumably makes the hub client use the locally
# stored Hugging Face credential (the model repo is gated) — confirm.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                          token=True)

# Weights are loaded in bfloat16 with automatic device placement; the
# quantized-loading options are left disabled below.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map='auto',
                                             torch_dtype=torch.bfloat16,
                                             token=True,
                                             #  load_in_8bit=True,
                                             #  load_in_4bit=True,
                                             )
# Streams generated text to stdout as it is produced, omitting the prompt.
streamer = TextStreamer(tokenizer, skip_prompt=True)

# NOTE(review): `model` is already an instantiated object, so the torch_dtype /
# device_map arguments here look redundant — confirm they have no effect.
# NOTE(review): max_new_tokens=4096 appears to equal Llama-2's whole context
# window, leaving no room for the prompt — confirm the intended limit.
pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens=4096,
                do_sample=True,
                # temperature=0.1,
                top_p=0.95,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                # The EOS token id doubles as the padding id.
                pad_token_id=tokenizer.eos_token_id,
                streamer=streamer,
                )

# LangChain wrapper around the Hugging Face pipeline; used as `llm` below.
llm = HuggingFacePipeline(pipeline=pipe)

from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain

# NOTE(review): the saved output of this cell shows the connect call failing
# because vectors/openai-ada-fsdl.faiss could not be opened; the index file
# must exist before this cell (and every cell using `vector_index`) can run.
pretty_log("connecting to vector storage")
vector_index = vecstore.connect_to_vector_index(vecstore.INDEX_NAME, embedding_engine)
pretty_log("connected to vector storage")
pretty_log(f"found {vector_index.index.ntotal} vectors to search over")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[1;38;5;214m🥞: connecting to vector storage [0m


RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char*) at /project/faiss/faiss/impl/io.cpp:67: Error: 'f' failed: could not open vectors/openai-ada-fsdl.faiss for reading: No such file or directory

In [3]:
########## THE LLAMA 2 DEMO ##############
# Single-turn retrieval QA: retrieved chunks are substituted for {context} and
# the user query for {question} inside a Llama-2 [INST] ... [/INST] prompt.

template = """
[INST]Use the following pieces of context to answer the question. If no context provided, answer like a AI assistant.
{context}
Question: {question} [/INST]
"""

# Retriever over the vector index, configured with k=6.
# NOTE(review): `template` and `retriever` are rebound by a later cell; this
# chain keeps the objects it was built with.
retriever = vector_index.as_retriever(
        search_kwargs={"k": 6}
    )

# RetrievalQA chain using the custom prompt above in place of the default one.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=retriever,     
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=template,
            input_variables=["context", "question"],
        ),
    }
)



In [None]:
# Run one question through the RetrievalQA chain and print the raw result object.
result = qa_chain({"query": "What models use human instructions?"})
print(result)

In [11]:
########## THE LLAMA 2 DEMO ############## - MORE GENERIC AND CUSTOMIZED
# Multi-turn variant: a question-condensing chain rewrites each follow-up into
# a standalone question, documents are retrieved for it, and a "stuff" chain
# answers from them. Both prompts use the Llama-2 [INST] ... [/INST] format.
langchain.debug=False 

# Answering prompt: retrieved documents are injected as {context}.
llama_docs_template = """
[INST]Use the following pieces of context to answer the question. If no context provided, answer like a AI assistant.
{context}
Question: {question} [/INST]
"""
llama_docs_prompt = PromptTemplate(template=llama_docs_template, input_variables=["context", "question"])
# document_variable_name must match the {context} placeholder used above.
llama_doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt= llama_docs_prompt, document_variable_name="context", verbose=False)

# Condensing prompt: turns chat history + follow-up into a standalone question.
llama_condense_template = """
[INST]Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question: [/INST]"""
llama_condense_prompt = PromptTemplate(template=llama_condense_template, input_variables=["chat_history", "question"])
llama_question_generator_chain = LLMChain(llm=llm, prompt=llama_condense_prompt, verbose=False)

# Conversation buffer exposed to the chain under the "chat_history" key.
# NOTE(review): the name `memory` is rebound by a later cell; this chain keeps
# the instance created here.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

llama_v2_chain = ConversationalRetrievalChain(
    retriever=vector_index.as_retriever(search_kwargs={'k': 6}),
    question_generator=llama_question_generator_chain,
    combine_docs_chain=llama_doc_chain,
    memory=memory
)



In [13]:
# Three-turn conversation against the Llama-2 conversational retrieval chain.
# Every turn prints the chain's full result; the chain was built with a
# conversation memory, so turn order matters.
conversation_turns = (
    "What models use human instructions?",
    "Which are the advantage of each of these models?",
    "What are the downsides of your last model suggested above ?",
)

for turn_text in conversation_turns:
    print(llama_v2_chain({"question": turn_text}))




 Which models use human instructions?

According to the conversation, the following models use human instructions:

1. LLaV A
2. MiniGPT4

These models leverage human instructions to improve their performance in various visual understanding and reasoning tasks, such as text recognition, TextVQA, KIE, and HMER.</s>





Based on the provided context, the following models use human instructions:

1. LLaV A: This model utilizes a multimodal instruction-tuned dataset to train the language model and projection layer, which enables it to complete various visual understanding and reasoning tasks.
2. MiniGPT4: This model shares a similar network architecture with LLaV A but employs ViT-G/14 and Q-Former to encode images. It only updates the projection layer on a combined dataset of 5 million image-text pairs sampled from LAION, Conceptual Captions, and SBU, and then on 3.5K high-quality conversational data to enhance the naturalness and usability of the generated responses.

Both LLaV A and MiniGPT4 leverage human instructions to improve their performance in various visual understanding and reasoning tasks.</s>
{'question': 'What models use human instructions?', 'chat_history': [HumanMessage(content='What models use human instructions?'), AIMessage(content='Based on the provided text, the following models u



 Which are the advantages of each of the models that use human instructions?</s>





Based on the given context, the advantages of each of the models that use human instructions are:

1. LLaV A: LLaV A uses instruction-following data to enhance the naturalness and usability of the generated responses. This allows the model to generate more accurate and relevant responses to given prompts.
2. MiniGPT4: MiniGPT4 employs ViT-G/14 and Q-Former in [16] to encode images. This allows the model to generate more accurate and relevant responses to given prompts.
3. mPLUG-Owl: mPLUG-Owl integrates unimodal and multimodal data during training. This allows the model to learn a more robust representation of the input data, leading to better performance on OCR tasks.
4. BLIP-2: BLIP-2 uses a combination of regular, irregular, occluded, and others to train the model. This allows the model to learn a more robust representation of the input data, leading to better performance on OCR tasks.

In general, the use of human instructions in these models allows them to better understand the c



 The downsides of the last model suggested (LLaV A) are:

1. Limited dataset size: LLaV A utilizes a small dataset of 595K image-text pairs for training, which may limit its performance on tasks that require a larger amount of data.
2. Modular architecture: LLaV A's modular architecture can make it more difficult to optimize and train, as the language model and image-text matching module may have different optimization goals and requirements.
3. Dependence on high-quality instructions: LLaV A's performance relies heavily on the quality and relevance of the instructions provided. If the instructions are noisy or biased, the model may not perform well.
4. Limited generalization: LLaV A is trained on a specific dataset and may not generalize well to other datasets or tasks.
5. Requires careful tuning: LLaV A requires careful tuning of the hyperparameters and training parameters to achieve good performance. This can be time-consuming and may require expert knowledge.</s>




The article discusses the performance of large multimodal models (LMMs) in various tasks, including text recognition, visual question answering, key information extraction, and handwritten mathematical expression recognition. The study finds that while LMMs show strengths in text recognition and comprehension tasks, they struggle with fine-grained feature detection in images and recognize certain text images as more common words based on semantics. The article also highlights the limitations of LLaV A, a multimodal instruction-tuned model, including its dependence on high-quality instructions, limited generalization, and requirement for careful tuning.

Based on the findings, the article concludes that LMMs work for OCR due to their pre-trained visual encoder and Large Language Model, which operate within their structured feature space. However, unsupervised text-image pairs in the training data cannot compete with fully supervised data. The article suggests that adapting LMMs to the O

In [None]:
########## With ConversationalRetrievalChain 1 ############
# Same conversational-retrieval layout as the Llama-2 chain, but with a
# "SOURCES"-style answering prompt and LangChain's stock CONDENSE_QUESTION_PROMPT.
# NOTE(review): `doc_chain_custom_prompt` is a plain string here but is rebound
# to a PromptTemplate in a later cell — consider distinct names.
doc_chain_custom_prompt="""Given the following extracted parts of a long document and a question, create a final answer with "SOURCES" that represent exactly the Source name and link given.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: {question}

{summaries}

FINAL ANSWER:"""


# Retrieved documents are injected through the {summaries} placeholder.
qa_prompt = PromptTemplate(
    template=doc_chain_custom_prompt, input_variables=["summaries", "question"]
)

# Rewrites a follow-up question into a standalone one.
question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)


doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt= qa_prompt, verbose=True)


# NOTE(review): rebinds the global `memory` used when building llama_v2_chain;
# that chain still holds its own earlier instance.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

chain = ConversationalRetrievalChain(
    retriever=vector_index.as_retriever(search_kwargs={'k': 6}),
    question_generator=question_generator,
    combine_docs_chain=doc_chain,
    memory=memory
)





In [None]:
######## just load document chain question ############
# Stand-alone "stuff" QA-with-sources chain (no retriever, no memory): the
# caller supplies the documents, which are injected as {sources}.
# NOTE(review): the first prompt line opens a double quote before SOURCES and
# never closes it ("SOURCES.) — likely a copy/paste slip; confirm the wording.
doc_chain_custom_template="""Given chunks from multiple documents and a question, create an answer to the question that references those documents as "SOURCES.

- If the question asks about the system's capabilities, the system should respond with some version of "This system can answer questions about building AI-powered products across the stack, about large language models, and the Full Stack's courses and materials.". The answer does not need to include sources.
- If the answer cannot be determined from the chunks or from these instructions, the system should not answer the question. The system should instead return "No relevant sources found".
- Chunks are taken from the middle of documents and may be truncated or missing context.
- Documents are not guaranteed to be relevant to the question.
- Always include the "SOURCES" in your answer otherwise it is not valid.

QUESTION: {question}
=========
{sources}
=========
FINAL ANSWER:"""

# NOTE(review): rebinds `doc_chain_custom_prompt`, which an earlier cell
# defined as a plain string.
doc_chain_custom_prompt = PromptTemplate(
    template=doc_chain_custom_template, input_variables=["sources", "question"]
)

# NOTE(review): `prompts` is only referenced in the trailing comment below, so
# this import is currently unused by the chain.
import prompts
load_doc_chain_chain = load_qa_with_sources_chain(
        llm,
        chain_type="stuff",
        verbose=True,
        prompt=doc_chain_custom_prompt, # prompts.main,
        # Must match the {sources} placeholder in the template above.
        document_variable_name="sources",
    )


def sim_que(query: str):
    """Answer ``query`` from the chunks most similar to it and print the answer.

    Looks up the 3 nearest chunks in ``vector_index``, hands them together with
    the question to ``load_doc_chain_chain`` and prints the chain's
    ``output_text``.

    Args:
        query: The natural-language question to answer.

    Returns:
        None. The answer is written to stdout.
    """
    pretty_log("selecting sources by similarity to query")
    sources_and_scores = vector_index.similarity_search_with_score(query, k=3)

    # Keep only the documents; the scores are not used afterwards. Building the
    # list directly (rather than unpacking ``zip(*...)`` into two names) also
    # works when the search returns no hits, where that unpack raises ValueError.
    sources = [source for source, _score in sources_and_scores]

    result = load_doc_chain_chain(
        {"input_documents": sources, "question": query}, return_only_outputs=True
    )

    answer = result["output_text"]
    print(answer)

# Run both sample questions through the similarity-search helper.
query1 = "What models use human instructions?"
query2 = "Are there any model trained on medical knowledge?"

for sample_query in (query1, query2):
    sim_que(sample_query)


In [None]:
# doc_chain({"question":"What models use human instructions?", "input_documents":sources})

# Three-turn conversation with the SOURCES-style chain. `chain` was built with
# a conversation memory, so the follow-ups depend on the earlier turns.
# Only the final call is the cell's last expression, so only its return value
# is displayed as the cell output.
chain({"question": "What models use human instructions?"})

chain({"question": "Which are the advantage of each of these models?"})

chain({"question": "Can you elaborate more on point 3?"})


In [None]:
# Asks the follow-up question on its own. `chain` was built with a conversation
# memory, so the result depends on which calls were executed before this cell.
chain({"question":"Which are the advantage of each of these models?"})

In [None]:
# Calls qanda_llama2_cont from the ciptest_qanda module (imported as `test`);
# its behavior is defined in that module and is not visible in this notebook.
test.qanda_llama2_cont()

In [None]:
import time

# Time two consecutive `generate` calls on the LLM chain from ciptest_qanda.
chat_history = ""
input_list = [{"user_input": "Give me some indications to solve a denial of service attack.", "chat_history":chat_history}]

# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments as a time.time() difference can.
start=time.perf_counter()
test.llm_chain.generate(input_list)

# NOTE(review): unlike the first input, this one carries no "chat_history" key —
# confirm the chain's prompt/memory supplies it.
test.llm_chain.generate([{"user_input" : "What question did I asked you previously"}])

end=time.perf_counter()

print(f"Total time: {end-start}")

In [None]:
import time

# Time two consecutive `predict` calls on the LLM chain from ciptest_qanda and
# print each answer. (The `chat_history` / `input_list` assignments that used to
# sit here were never read by this cell and have been dropped.)

# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments as a time.time() difference can.
start=time.perf_counter()
answer1=test.llm_chain.predict(user_input="Give me some indications to solve a denial of service attack.")
print(answer1)
answer2=test.llm_chain.predict(user_input="What question did I asked you previously")
print(answer2)
end=time.perf_counter()

print(f"Total time: {end-start}")

In [None]:
#response = test.qanda_llama2("Can we combine LMMs and OCR?", with_logging=True)


In [None]:
#response = test.qanda_llama2_withRAG("Can we combine LMMs and OCR?", with_logging=True)

In [None]:
#test.qanda_llama2_withRAG("Can we combine LLMs and OCR", with_logging=True)


In [None]:
#test.ask_question_withRAG("Can we combine LLMs and OCR", with_logging=False)

In [None]:
import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema.runnable  import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable.utils import ConfigurableField



In [None]:
# Build a small RAG corpus from one blog post: load the page, split it into
# chunks, embed the chunks into a Chroma store and expose a retriever over it.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    # Restrict HTML parsing to elements carrying these CSS classes.
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embeddings come from the engine exposed by ciptest_qanda (imported as `test`).
vectorstore = Chroma.from_documents(documents=splits, embedding=test.embedding_engine)
# NOTE(review): rebinds the global `retriever` created for the RetrievalQA demo.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

prompt = hub.pull("rlm/rag-prompt-llama")
# NOTE(review): rebinds the global `llm` (previously the HuggingFacePipeline
# wrapper); re-running earlier chain-building cells after this one will pick up
# test.base_llm instead.
llm=test.base_llm

def format_docs(docs):
    """Join the ``page_content`` of every document in ``docs`` with blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the contents separated by ``"\\n\\n"``; an empty
        string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Pipeline: the input is sent to the retriever, whose documents are joined by
# format_docs into "context", and is also passed through unchanged as
# "question". That mapping feeds the prompt, then the LLM, then StrOutputParser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


########### 
from langchain.prompts import PromptTemplate
from operator import itemgetter
from langchain.schema.runnable import RunnableParallel

# Concise-answer RAG prompt: {context} receives the joined chunks and
# {question} the user's query.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use 4 sentences maximum and keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""
rag_prompt_custom = ChatPromptTemplate.from_template(template)

# Optional extra instruction, currently disabled:
#   Always say "thanks for asking!" at the end of the answer.

# Answering sub-chain. It expects a mapping holding "documents" and "question",
# formats the documents into the prompt context and returns the answer text.
rag_chain_from_docs = (
    {
        "context": lambda payload: format_docs(payload["documents"]),
        "question": itemgetter("question"),
    }
    | rag_prompt_custom
    | llm
    | StrOutputParser()
)

# Outer chain. Step 1 retrieves documents for the input and keeps the input
# itself as "question"; step 2 returns each document's metadata next to the
# generated answer.
rag_chain_with_source = RunnableParallel(
    {
        "documents": retriever,
        "question": RunnablePassthrough(),
    }
) | {
    "documents": lambda payload: [chunk.metadata for chunk in payload["documents"]],
    "answer": rag_chain_from_docs,
}


In [None]:
# The chain's first step hands its input directly to the retriever and passes
# the same value through as "question", so it must be invoked with the question
# string itself. The previous call built a dict keyed by the bare, undefined
# name `question`, which raises NameError before the chain runs.
rag_chain_with_source.invoke("What is Task Decomposition?")

In [None]:
# Render the hub prompt with placeholder values to inspect its final text.
filler_values = {"context": "filler context", "question": "filler question"}
rendered_prompt = prompt.invoke(filler_values)
print(rendered_prompt.to_string())

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, pipeline
# Scratch cell: direct generation, first through a transformers pipeline and
# then through a raw `generate` call. As written it depends on names that no
# visible cell defines — see the review notes below.
# NOTE(review): rebinds the globals `streamer` and `pipe` from the setup cell.
streamer = TextStreamer(test.tokenizer, skip_prompt=True)
# NOTE(review): `test.base_llm` is also used as the LangChain `llm` in the RAG
# cell — confirm it is a model object that `pipeline(model=...)` accepts.
pipe = pipeline(
    "text-generation",
    model=test.base_llm,
    tokenizer=test.tokenizer,
    max_length=2048,
    temperature=0.6,
    pad_token_id=test.tokenizer.eos_token_id,
    top_p=0.95,
    repetition_penalty=1.2,
    device=0,
    streamer=streamer
)
# NOTE(review): in this notebook `prompts` is bound by `import prompts`, i.e. a
# module; unless it is rebound to a list elsewhere, `prompts[0]` will fail.
pipe(prompts[0])

# NOTE(review): `device` and `model_fintuned` are not defined in any visible
# cell, and this part mixes `test.tokenizer` with the notebook-level `tokenizer`
# — confirm which objects are intended.
inputs = test.tokenizer(prompts[0], return_tensors="pt").to(device)
streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model_fintuned.generate(**inputs, streamer=streamer, pad_token_id=tokenizer.eos_token_id, max_length=248, temperature=0.8, top_p=0.8,
                        repetition_penalty=1.25)