<a href="https://colab.research.google.com/github/AnDDoanf/LLM-repo/blob/master/code_understanding_mistral7b_langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
%%capture
!pip install transformers bitsandbytes accelerate langchain langchain_community sentence-transformers GitPython langchain_chroma langchain-huggingface

In [2]:
import os
from google.colab import drive

drive.mount('/content/drive')
%cd "/content/drive/MyDrive/Kaggle"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Kaggle


In [20]:
%%capture
def buildLLM():
  from transformers import AutoModelForCausalLM, AutoTokenizer
  import transformers
  from transformers import BitsAndBytesConfig
  from torch import cuda, bfloat16
  from langchain.llms import HuggingFacePipeline

  model_name='mistralai/Mistral-7B-Instruct-v0.1'

  model_config = transformers.AutoConfig.from_pretrained(
      model_name,
  )

  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.padding_side = "right"

  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=bfloat16,
  )

  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      quantization_config=bnb_config,
  )

  text_generation_pipeline = transformers.pipeline(
      model=model,
      tokenizer=tokenizer,
      task="text-generation",
      temperature=0.01,
      repetition_penalty=1.3,
      return_full_text=True,
      max_new_tokens=1000,
      do_sample=True,
  )
  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
  return llm

In [11]:
from git import Repo
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import Language

# !mkdir datadir
repo_path = "/content/datadir"
# repo = Repo.clone_from("https://github.com/langchain-ai/langchain", to_path=repo_path)

In [15]:
loader = GenericLoader.from_filesystem(
    repo_path + "/libs/core/langchain_core",
    glob="**/*",
    suffixes=[".py"],
    exclude=["**/non-utf8-encoding.py"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)
documents = loader.load()
len(documents)

349

In [16]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=2000, chunk_overlap=200
)
texts = python_splitter.split_documents(documents)
len(texts)

1050

In [18]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

db = Chroma.from_documents(texts, HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))
retriever = db.as_retriever(
    search_type="similarity",  # Also test "mmr"
    search_kwargs={"k": 8},
)

In [21]:
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

llm = buildLLM()

# First we need a prompt that we can pass into an LLM to generate this search query

prompt = ChatPromptTemplate.from_messages(
    [
        ("placeholder", "{chat_history}"),
        ("user", "{input}"),
        (
            "user",
            "Given the above conversation, generate a search query to look up to get information relevant to the conversation",
        ),
    ]
)

retriever_chain = create_history_aware_retriever(llm, retriever, prompt)

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer the user's questions based on the below context:\n\n{context}",
        ),
        ("placeholder", "{chat_history}"),
        ("user", "{input}"),
    ]
)
document_chain = create_stuff_documents_chain(llm, prompt)

qa = create_retrieval_chain(retriever_chain, document_chain)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  warn_deprecated(


In [23]:
import torch
import gc
questions = [
    "What classes are derived from the Runnable class?",
    "What one improvement do you propose in code in relation to the class hierarchy for the Runnable class?",
]

for question in questions:
    result = qa.invoke({"input": question})
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n", end='\n\n\n\n\n')
    gc.collect()
    torch.cuda.empty_cache()

-> **Question**: What classes are derived from the Runnable class? 

**Answer**: System: Answer the user's questions based on the below context:

class Runnable(Generic[Input, Output], ABC):
    """A unit of work that can be invoked, batched, streamed, transformed and composed.

     Key Methods

    - **invoke/ainvoke**: Transforms a single input into an output.
    - **batch/abatch**: Efficiently transforms multiple inputs into outputs.
    - **stream/astream**: Streams output from a single input as it's produced.
    - **astream_log**: Streams output and selected intermediate results from an input.

    Built-in optimizations:

    - **Batch**: By default, batch runs invoke() in parallel using a thread pool executor.
             Override to optimize batching.

    - **Async**: Methods with "a" suffix are asynchronous. By default, they execute
             the sync counterpart using asyncio's thread pool.
             Override for native async.

    All methods accept an optional co