In [19]:
!pip3 install -q langchain langchain-openai langchain-community faiss-gpu faiss-cpu transformers_stream_generator sentence-transformers requests torch bitsandbytes transformers sentencepiece accelerate
# !pip3 install   # Use cu117 if on CUDA 11.7


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for transformers_stream_generator (setup.py) ... [?25l[?25hdone


In [2]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import FAISS
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.schema import Document
from langchain.memory import VectorStoreRetrieverMemory, ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from huggingface_hub import login
import torch

import os
import glob

In [3]:
# Release cached, currently unused GPU memory held by PyTorch before the model is loaded.
torch.cuda.empty_cache()

In [4]:
# from dotenv import load_dotenv

# load_dotenv()
# os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')

In [5]:
# Colab-only helpers: `drive` mounts Google Drive, `userdata` reads notebook secrets.
from google.colab import drive
from google.colab import userdata

# Mount Google Drive so that the knowledge base stored under /content/drive can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Read the Hugging Face token from the Colab secret named HF_TOKEN and log in to the Hub.
# add_to_git_credential=True additionally asks huggingface_hub to store the token
# in the git credential helper.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [8]:
def get_quant_config():
    """Build the bitsandbytes settings used to load the LLM in 4-bit precision.

    Returns:
        BitsAndBytesConfig: 4-bit loading with the "nf4" quantization type,
        double quantization enabled and bfloat16 as the compute dtype.
    """
    settings = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        # torch.float16 is the alternative compute dtype that was tried here.
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_quant_type": "nf4",
    }
    return BitsAndBytesConfig(**settings)

In [9]:
# Load model with quantization
# model_name = "meta-llama/Llama-3.1-8B"
def get_model(model_name=None):
    """Load a causal language model from the Hugging Face Hub in 4-bit precision.

    Args:
        model_name: Hub repository id, e.g. "Qwen/Qwen-7B". Required; the
            ``None`` default is kept only so existing call sites keep working.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.

    Raises:
        ValueError: If ``model_name`` is missing or empty.
    """
    if not model_name:
        # Fail early with a clear message instead of an opaque error raised
        # from inside from_pretrained.
        raise ValueError("model_name must be a non-empty Hugging Face model id")

    print(f"model being loaded: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cuda",  # Place the whole model on the CUDA device ("auto" would let accelerate distribute it)
        quantization_config=get_quant_config(),  # 4-bit bitsandbytes settings from get_quant_config()
        trust_remote_code=True,  # Runs code shipped in the model repo (Qwen needs it); only use with trusted repos
    )
    return model

# NOTE(review): this runs when the cell is executed, i.e. right after get_model is
# *defined* -- no model has been loaded yet at this point (get_model is only called later).
print("Quantized model loaded successfully!")


Quantized model loaded successfully!


In [10]:
# Read in documents using LangChain's loaders
# Take everything in all the sub-folders of our knowledgebase
# knowledge_base_path = 'knowledge-base/*'
knowledge_base_path = '/content/drive/MyDrive/LLM_Engineering/knowledge-base/*'

# Each sub-folder of the knowledge base is one document type. Keep directories only,
# so a stray file matching the glob is not handed to the loader as if it were a folder,
# and sort them so documents are loaded in a reproducible order.
folders = sorted(path for path in glob.glob(knowledge_base_path) if os.path.isdir(path))

def add_metadata(doc, doc_type):
    """Tag a loaded document with its type and hand the same object back.

    Args:
        doc: Object exposing a mutable ``metadata`` mapping (modified in place).
        doc_type: Value stored under the ``"doc_type"`` metadata key.

    Returns:
        The same ``doc`` object that was passed in.
    """
    metadata = doc.metadata
    metadata["doc_type"] = doc_type
    return doc

# With thanks to CG and Jon R, students on the course, for this fix needed for some users:
# the text loader is told explicitly to read files as UTF-8.
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

# Load every markdown file below each knowledge-base folder and label each
# document with the name of the folder it came from.
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    for loaded_doc in loader.load():
        documents.append(add_metadata(loaded_doc, doc_type))

# Split the loaded documents into overlapping chunks for embedding
# (chunk_size=1000 with chunk_overlap=200 shared between neighbouring chunks).
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

# Quick sanity report: number of chunks and the distinct doc_type labels that were loaded.
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")

Total number of chunks: 123
Document types found: {'company', 'contracts', 'employees', 'products'}


In [11]:
def get_tokenizer(model_name):
    """Load the tokenizer that belongs to ``model_name``.

    Args:
        model_name: Hugging Face Hub repository id of the model.

    Returns:
        The tokenizer, with its pad token set to its end-of-sequence token.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=True,
        # Same setting as get_model: without it, repositories that ship custom
        # tokenizer code (e.g. Qwen) stop and ask for confirmation interactively,
        # which blocks an unattended "Run All".
        trust_remote_code=True,
    )
    # Reuse the end-of-sequence token for padding so that a pad token is defined.
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

In [12]:
# Inference can also be done using transformers' pipeline

def get_pipeline_llm(model, tokenizer):
    """Wrap a loaded model/tokenizer pair as a LangChain LLM.

    Args:
        model: Causal language model used for generation.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        HuggingFacePipeline: LangChain wrapper around a transformers
        ``text-generation`` pipeline.
    """
    hf_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # temperature only influences sampling-based decoding; enable sampling
        # explicitly so the setting is honoured whatever the model's own
        # generation defaults are.
        do_sample=True,
        temperature=0.7,
        max_new_tokens=512,
        return_full_text=False,  # Return only the newly generated text, not the prompt
        # top_p=0.95,
        # top_k=40,
        # repetition_penalty=1.1
    )

    return HuggingFacePipeline(pipeline=hf_pipeline)

In [13]:
# Step 4: Set Up Embeddings Model
def get_embeddings(embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the embedding model used to vectorise text.

    Args:
        embedding_model_name: Name passed to ``HuggingFaceEmbeddings`` as ``model_name``.

    Returns:
        The ``HuggingFaceEmbeddings`` instance for that model.
    """
    return HuggingFaceEmbeddings(model_name=embedding_model_name)

In [14]:
# Step 5: Create the FAISS index
def get_vectorstore():
    """Embed the module-level ``chunks`` and index them in a new FAISS store.

    Every call creates a new embedding model via ``get_embeddings()`` and
    builds a new index.

    Returns:
        The vector store returned by ``FAISS.from_documents``.
    """
    documents_to_index = chunks
    embedding_model = get_embeddings()
    return FAISS.from_documents(documents_to_index, embedding_model)

# Save the FAISS index for future use
# vectorstore.save_local("knowledge_base_index")

In [15]:
# Step 5: Set Up Conversation Memory (the VectorStoreRetrieverMemory alternative is kept commented out below)
# memory = VectorStoreRetrieverMemory(retriever=vectorstore.as_retriever(), memory_key="chat_history", return_messages=True)

# Create a conversation buffer memory
def get_memory_model():
    """Create a new conversation buffer for the retrieval chain.

    Returns:
        ConversationBufferMemory configured with ``return_messages=True``,
        stored under the ``"chat_history"`` key, taking the user turn from
        ``"question"`` and the model turn from ``"answer"``.
    """
    memory_options = dict(
        memory_key="chat_history",
        return_messages=True,
        input_key="question",
        output_key="answer",
    )
    return ConversationBufferMemory(**memory_options)

In [16]:
# Step 6: Create the Conversational Retrieval Chain
def generate_conversation_chain(model, tokenizer):
    """Assemble the retrieval-augmented chat chain.

    Args:
        model: Loaded language model, forwarded to ``get_pipeline_llm``.
        tokenizer: Matching tokenizer, forwarded to ``get_pipeline_llm``.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``.
    """
    llm = get_pipeline_llm(model, tokenizer)
    # Retrieve the 4 best-matching chunks per question (k=2 and k=25 were also tried).
    retriever = get_vectorstore().as_retriever(search_kwargs={"k": 4})
    memory = get_memory_model()

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        return_source_documents=False,  # answer only, no source documents in the result
        verbose=False,  # keep chain logging quiet
    )

In [17]:
# Hugging Face Hub repository ids of the base models this notebook can be run with.
LLAMA="meta-llama/Llama-3.1-8B"
GEMMA="google/gemma-7b"
QWEN="Qwen/Qwen-7B"

In [20]:
# Load the quantized Qwen model and report how much memory it occupies.
model=get_model(QWEN)
# Dividing by 1e6 converts the footprint to the megabytes shown in the message
# (assumes get_memory_footprint() returns bytes -- confirm against the transformers docs).
memory = model.get_memory_footprint() / 1e6
print(f"Memory footprint: {memory:,.1f} MB")

# Tokenizer for the same checkpoint as the model.
tokenizer=get_tokenizer(QWEN)

model being loaded: Qwen/Qwen-7B


qwen_generation_utils.py:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B:
- qwen_generation_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


cpp_kernels.py:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B:
- cpp_kernels.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B:
- qwen_generation_utils.py
- cpp_kernels.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.96G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Memory footprint: 5,730.7 MB


tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

The repository for Qwen/Qwen-7B contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Qwen/Qwen-7B.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


tokenization_qwen.py:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


qwen.tiktoken:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

In [21]:
# Build the retrieval chat chain once; chat() reads this module-level instance on every call.
conversation_chain = generate_conversation_chain(model, tokenizer)

  llm = HuggingFacePipeline(pipeline=hf_pipeline)
  embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  memory = ConversationBufferMemory(


In [22]:
# Module-level history list that the commented-out example calls below pass to chat().
# NOTE(review): chat() works from the `history` argument it receives and never
# reads or updates this list, so it stays empty.
chat_history = []

In [23]:
# def chat(question, history):
#     result = conversation_chain.invoke({"question": question, "chat_history": chat_history})
#     return result.get("answer", "No answer available")

def _history_to_pairs(history):
    """Convert a Gradio chat history into a list of ``(user, assistant)`` tuples."""
    pairs = []
    pending_user = None
    for turn in history or []:
        if isinstance(turn, dict):
            # "messages" format: one {"role": ..., "content": ...} dict per message.
            role = turn.get("role")
            if role == "user":
                pending_user = turn.get("content")
            elif role == "assistant" and pending_user is not None:
                pairs.append((pending_user, turn.get("content")))
                pending_user = None
        else:
            # Pair format: one [user, assistant] entry per exchange.
            pairs.append((turn[0], turn[1]))
    return pairs


def chat(question, history):
    """Answer ``question`` with the module-level ``conversation_chain``.

    Written as a Gradio ``ChatInterface`` callback.

    Args:
        question: The user's latest message.
        history: Earlier turns as supplied by Gradio: either ``[user, assistant]``
            pairs or role/content message dicts. ``None`` or empty is accepted.

    Returns:
        The chain's ``"answer"`` value, or ``"No answer available"`` when the
        result has no such key.
    """
    # NOTE(review): the chain is also built with its own ConversationBufferMemory under
    # the same "chat_history" key -- confirm which history source the chain actually uses.
    # .invoke() replaces calling the chain object directly, which LangChain has deprecated.
    result = conversation_chain.invoke({
        "question": question,
        "chat_history": _history_to_pairs(history),
    })

    return result.get("answer", "No answer available")

In [None]:
# Example Queries

# response = chat("Who is Avery", chat_history)
# print("A:", response)

In [None]:
# response = chat("Who received the prestigious IIOTY award in 2023", chat_history)
# print("A:", response)

In [None]:
# response = chat("Who is Avery?", chat_history)
# print("A:", response)

In [24]:
!pip3 install -q gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.2/320.2 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.2/73.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.8/63.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.2/168.2 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [25]:
import gradio as gr
# Serve chat() through Gradio's chat UI; `view` holds whatever launch() returns.
view = gr.ChatInterface(chat).launch()



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5a221a4225a67fc770.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
