In [None]:
# Upgrade pip, then install the pinned versions of every library this notebook
# imports. The -qqq / --progress-bar off flags only reduce the cell's output.
!pip install -Uqqq pip --progress-bar off
!pip install -qqq torch==2.0.1 --progress-bar off
!pip install -qqq transformers==4.31.0 --progress-bar off
!pip install -qqq langchain==0.0.266 --progress-bar off
!pip install -qqq chromadb==0.4.5 --progress-bar off
!pip install -qqq pypdf==3.15.0 --progress-bar off
!pip install -qqq xformers==0.0.20 --progress-bar off
!pip install -qqq sentence_transformers==2.2.2 --progress-bar off
!pip install -qqq InstructorEmbedding==1.0.1 --progress-bar off
!pip install -qqq pdf2image==1.16.3 --progress-bar off

In [None]:
# Download the prebuilt AutoGPTQ 0.4.1 wheel from the project's GitHub release.
# The file name tags it as cu118 / cp310 / linux_x86_64, so it only fits that runtime.
!wget -q https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-linux_x86_64.whl

In [None]:
# Install AutoGPTQ from the wheel file downloaded into the working directory.
!pip install -qqq auto_gptq-0.4.1+cu118-cp310-cp310-linux_x86_64.whl --progress-bar off

[0m

In [None]:
# pdf2image relies on the poppler command-line tools to rasterise PDF pages.
# -y answers apt's confirmation prompt so the cell cannot stall or abort when
# the kernel has no interactive stdin.
!sudo apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [None]:
import torch
from auto_gptq import AutoGPTQForCausalLM
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path
from transformers import AutoTokenizer, TextStreamer, pipeline

# Use the first CUDA GPU when torch reports one as available, otherwise the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
from langchain.callbacks.base import BaseCallbackHandler
from langchain.schema import LLMResult
from typing import Any, Union,Dict, List
from queue import SimpleQueue

# Module-level queue instance created for the streaming callback handler.
q = SimpleQueue()

# Unique sentinel object: the handler defined in this cell enqueues it when an
# LLM run ends or errors.
job_done = object() # signals the processing is done

class StreamingGradioCallbackHandler(BaseCallbackHandler):
    """LangChain callback handler that forwards streamed tokens into a queue.

    Every new token is put on ``q``; the module-level ``job_done`` sentinel is
    put on the queue when the LLM run ends or fails, so a consumer reading the
    queue knows when to stop.
    """

    def __init__(self, q: SimpleQueue):
        """Store the queue that tokens will be written to."""
        self.q = q

    def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        """Run when LLM starts running. Clean the queue."""
        # Imported here because the surrounding cell only imports SimpleQueue.
        from queue import Empty

        while not self.q.empty():
            try:
                self.q.get(block=False)
            except Empty:
                # The queue was drained between the empty() check and get().
                continue

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Run on new LLM token. Only available when streaming is enabled."""
        self.q.put(token)

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Run when LLM ends running."""
        self.q.put(job_done)

    def on_llm_error(
        self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
    ) -> None:
        """Run when LLM errors."""
        # Also signal completion on failure so a consumer is not left waiting.
        self.q.put(job_done)

## Data

In [None]:
# Rasterise each earnings report to a list of page images at 88 dpi.
meta_images, nvidia_images, tesla_images = (
    convert_from_path(pdf_path, dpi=88)
    for pdf_path in (
        "pdfs/meta-earnings-report.pdf",
        "pdfs/nvidia-earnings-report.pdf",
        "pdfs/tesla-earnings-report.pdf",
    )
)

# Load every PDF found in the pdfs/ folder as LangChain documents.
loader = PyPDFDirectoryLoader("pdfs")
docs = loader.load()
len(docs)

100

In [None]:
# Instructor embedding model, placed on the device selected in DEVICE.
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs=dict(device=DEVICE),
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
# Split the loaded documents into chunks for embedding: chunk_size=1024 with a
# 64-unit overlap between neighbouring chunks (units are whatever the
# splitter's length function counts; presumably characters, verify).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(docs)
len(texts)

355

In [None]:
%%time
# Embed every chunk and build a Chroma vector store persisted under ./db.
db = Chroma.from_documents(texts, embeddings, persist_directory="db")

CPU times: user 20.5 s, sys: 404 ms, total: 20.9 s
Wall time: 24.7 s


## Llama 2 13B

In [None]:
# Hugging Face repository that holds the GPTQ-quantised Llama 2 13B chat weights.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"
# Base name of the weights file inside that repository.
model_basename = "model"

# Tokenizer is loaded from the same repository as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Load the quantised weights from the "gptq-4bit-128g-actorder_True" revision
# onto DEVICE. quantize_config=None leaves the quantisation settings to be
# resolved by AutoGPTQ (presumably from the repository's config; verify).
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    revision="gptq-4bit-128g-actorder_True",
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    inject_fused_attention=False,
    device=DEVICE,
    quantize_config=None,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.


In [None]:
# Default system message: two paragraphs separated by one blank line.
DEFAULT_SYSTEM_PROMPT = "\n\n".join(
    (
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.",
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.",
    )
)


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    """Wrap ``prompt`` in the Llama 2 chat instruction format.

    Args:
        prompt: The user message placed after the system block.
        system_prompt: Text placed between the ``<<SYS>>`` markers.

    Returns:
        ``[INST] <<SYS>>``, the system prompt, ``<</SYS>>``, a blank line, then
        the prompt followed by `` [/INST]``.
    """
    system_block = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>"
    return f"{system_block}\n\n{prompt} [/INST]"

In [None]:
from queue import Queue

class StreamingGradioCallbackHandler(BaseCallbackHandler):
    """LangChain callback handler that forwards streamed tokens into a queue.

    Every new token is put on ``q``; ``job_done`` is put on the queue when the
    LLM run ends or fails, so a consumer reading the queue knows when to stop.
    """

    def __init__(self, q: Queue,job_done : object):
        """Store the target queue and the sentinel that marks the end of a run."""
        self.q = q
        self.job_done=job_done

    def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        """Run when LLM starts running. Clean the queue."""
        # Imported here because the surrounding cell only imports Queue.
        from queue import Empty

        while not self.q.empty():
            try:
                self.q.get(block=False)
            except Empty:
                # The queue was drained between the empty() check and get().
                continue

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Run on new LLM token. Only available when streaming is enabled."""
        self.q.put(token)

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Run when LLM ends running."""
        self.q.put(self.job_done)

    def on_llm_error(
        self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
    ) -> None:
        """Run when LLM errors."""
        # Also signal completion on failure so a consumer is not left waiting.
        self.q.put(self.job_done)

In [None]:
from transformers import AutoTokenizer, TextStreamer, pipeline

def generate_response(user_input, history=None):
    """Answer ``user_input`` with a retrieval-augmented QA chain.

    Builds a text-generation pipeline around the module-level ``model`` and
    ``tokenizer``, wraps it for LangChain, and runs a "stuff" RetrievalQA chain
    that retrieves the top 2 chunks from the module-level ``db`` store.

    Args:
        user_input: The question to answer.
        history: Chat history as supplied by a Gradio chat callback. It is not
            used; it defaults to ``None`` so the function can also be called
            directly with only a question.

    Returns:
        Whatever calling the chain returns. The chain is created with
        ``return_source_documents=True`` and other cells read the
        ``"source_documents"`` entry of this value.
    """
    # Streams generated text to stdout while the pipeline is running.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    text_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,
        temperature=0,
        top_p=0.95,
        repetition_penalty=1.15,
        streamer=streamer,
    )

    system_prompt = "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer."
    # The {context} and {question} placeholders are filled in by the chain.
    template = generate_prompt(
        """
  {context}

  Question: {question}
  """,
        system_prompt=system_prompt,
    )
    llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 2}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
    return qa_chain(user_input)


In [None]:
# import gc
# torch.cuda.empty_cache()
# gc.collect()

0

In [None]:
# Install the pinned Gradio release that the chat UI cells import.
!pip install gradio==3.48.0

In [None]:
import gradio as gr
def _chat_fn(message, history):
    """Gradio chat callback: return only the answer text for ``message``.

    ``generate_response`` returns the chain's full output mapping, while the
    chat interface needs a plain string reply, so the ``"result"`` entry is
    extracted here.
    """
    return generate_response(message, history)["result"]


# fill_height is not a Blocks argument in the pinned Gradio 3.48.0 release
# (it only triggered an unused-argument warning), so it is omitted.
with gr.Blocks() as demo:
    chatbot = gr.ChatInterface(fn=_chat_fn)

# debug=True keeps the cell running so that errors and logs stay visible.
demo.queue().launch(share=True, debug=True)

  with gr.Blocks(fill_height=True) as demo:


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://5f219ce76c406d58e5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://5f219ce76c406d58e5.gradio.live




In [None]:
# generate_response takes (user_input, history); pass an empty history when
# calling it directly instead of through the chat UI.
generate_response("What is the per share revenue for Nvidia during 2023?", [])

## Chat with Multiple PDFs

In [None]:
import gradio as gr
from threading import Thread

from gradio.themes.utils.colors import Color


DESCRIPTION = """
# Llama2 13B Chat 🗨️
This is a streaming Chat Interface implementation of [Llama2](https://huggingface.co/meta-llama)
"""


def respond(message, chat_history):
    """Handle a submitted message in the Blocks chat UI.

    Args:
        message: Text taken from the textbox.
        chat_history: Current value of the Chatbot component, a list of
            (user message, bot reply) pairs.

    Returns:
        An empty string to clear the textbox, and the chat history extended
        with the new (message, answer) pair.
    """
    answer = generate_response(message, chat_history)["result"]
    chat_history.append((message, answer))
    return "", chat_history


with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    # Event listeners have to be attached inside the Blocks context.
    msg.submit(respond, [msg, chatbot], [msg, chatbot])

if __name__ == "__main__":
    demo.launch()



NameError: name 'respond' is not defined

In [None]:
# Number of source documents attached to the answer stored in `result`.
# NOTE(review): `result` is only assigned in cells further down, so this cell
# fails under Restart & Run All unless one of them has been run first.
len(result["source_documents"])

In [None]:
# Text of the first source document attached to the answer in `result`.
# NOTE(review): `result` is only assigned in cells further down.
print(result["source_documents"][0].page_content)

In [None]:
# Display the first source document object attached to the answer in `result`.
# NOTE(review): `result` is only assigned in cells further down.
result["source_documents"][0]

In [None]:
# qa_chain only exists as a local inside generate_response, so the question is
# sent through that function (with an empty chat history) instead.
result = generate_response("What is the per share revenue for Tesla during 2023?", [])

In [None]:
# generate_response takes (user_input, history); pass an empty history when
# calling it directly instead of through the chat UI.
result = generate_response("What is the per share revenue for Nvidia during 2023?", [])

In [None]:
# Text of the second source document attached to the answer in `result`.
print(result["source_documents"][1].page_content)

In [None]:
# qa_chain only exists as a local inside generate_response, so the question is
# sent through that function (with an empty chat history) instead.
result = generate_response("What is the estimated YOY revenue for Meta during 2023?", [])

In [None]:
# qa_chain only exists as a local inside generate_response, so the question is
# sent through that function (with an empty chat history) instead.
result = generate_response("What is the estimated YOY revenue for Tesla during 2023?", [])

In [None]:
# qa_chain only exists as a local inside generate_response, so the question is
# sent through that function (with an empty chat history) instead.
result = generate_response("What is the estimated YOY revenue for Nvidia during 2023?", [])

In [None]:
# qa_chain only exists as a local inside generate_response, so the question is
# sent through that function (with an empty chat history) instead.
result = generate_response(
    "Which company is more profitable during 2023 Meta, Nvidia or Tesla and why?",
    [],
)

In [None]:
# qa_chain only exists as a local inside generate_response, so the question is
# sent through that function (with an empty chat history) instead.
result = generate_response(
    "Choose one company to invest (Tesla, Nvidia or Meta) to maximize your profits for the long term (10+ years)?",
    [],
)

## References

- [Tesla Quarterly Report (Jul 21, 2023)](https://ir.tesla.com/_flysystem/s3/sec/000095017023033872/tsla-20230630-gen.pdf)
- [Meta Q2 2023 Earnings (Jul 26, 2023)](https://s21.q4cdn.com/399680738/files/doc_financials/2023/q2/Meta-06-30-2023-Exhibit-99-1-FINAL.pdf)
- [Nvidia Fiscal Q1 2024](https://s201.q4cdn.com/141608511/files/doc_financials/2024/q1/ecefb2b2-efcb-45f3-b72b-212d90fcd873.pdf)

# Summarization

In [None]:
# The only other `streamer` in this notebook is a local variable inside
# generate_response, so one has to be created here before the pipeline uses it.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# NOTE(review): `model` is loaded through AutoGPTQForCausalLM; confirm that the
# "summarization" task accepts this model, and that max_new_tokens=10 is
# really the intended output length.
summary = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=10,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15,
    streamer=streamer,
)

llm_summary = HuggingFacePipeline(pipeline=summary, model_kwargs={"temperature": 0})

# Same retrieval setup as the QA chain (top 2 chunks from `db`), but with the
# chain's default prompt instead of a custom one.
qa_chain_summary = RetrievalQA.from_chain_type(
    llm=llm_summary,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
)
result_summary = qa_chain_summary("What is the per share revenue for Meta during 2023?")


In [None]:

# Print the NVIDIA driver's report of GPU state (run after the model work above).
!nvidia-smi