# Retrieval-Augmented Generation with LLaMA 2
------

💡 For a more in-depth explanation of what's covered in this notebook, head to the accompanying [blog post]().

In [1]:
import pinecone
import torch

# Report whether PyTorch can see a CUDA device before any models are loaded
torch.cuda.is_available()

True

### Initializing the Hugging Face Embedding Pipeline

In [2]:
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Sentence-transformers checkpoint used to map text into a vector space
embed_model_id = 'sentence-transformers/all-MiniLm-l6-v2'

# Run the embedding model on the GPU when PyTorch can see one, otherwise on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Batch size handed to the embedding model's encode step
batch_size = 32

# Build the LangChain embedding wrapper, placing the model on `device`
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=batch_size),
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
# Sanity-check the embedding model on two short sample texts
docs = ["this is one document", "and another document"]

# embed_documents returns one embedding (a list of numbers) per input text
embeddings = embed_model.embed_documents(docs)

# Use the length of the first embedding as the dimensionality; the index
# created further down is sized from this value
number_of_dimensions = len(embeddings[0])

print(
    f"We have {len(embeddings)} embeddings, each with {number_of_dimensions} dimensions."
)

We have 2 embeddings, each with 384 dimensions.


### Building the Vector Index

For the model to successfully retrieve our information, we need to store our embeddings in a vector database. [Pinecone's free tier](https://app.pinecone.io/) is recommended for this.

In [4]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, PodSpec

# Pull variables from a local .env file into the environment, then read the key
load_dotenv()
api_key = os.getenv("PINECONE_API_KEY")

# Create the Pinecone client used by the following cells.
# NOTE: this rebinds the name `pinecone`, which the first cell imported as a
# module; from here on `pinecone` refers to this client instance.
pinecone = Pinecone(api_key=api_key)

In [5]:
# Name of the Pinecone index that backs this notebook
index_name = 'llama-2-rag'

# Create the index only if the account does not already have one with this
# name, so re-running the cell leaves an existing index untouched
index_already_exists = index_name in pinecone.list_indexes().names()
if not index_already_exists:
    pinecone.create_index(
        name=index_name,
        # Must match the size of the vectors produced by the embedding model
        dimension=number_of_dimensions,
        metric='cosine',
        spec=PodSpec(environment="gcp-starter"),
    )

In [6]:
import time

# An index may not be ready at the moment this cell runs (for example right
# after it was created). Checking once would leave `index` undefined in that
# case and break every later cell, so poll until Pinecone reports the index
# as ready, giving up with a clear error after a timeout.
max_wait_seconds = 300
poll_interval_seconds = 1
wait_deadline = time.monotonic() + max_wait_seconds

while not pinecone.describe_index(index_name).status['ready']:
    if time.monotonic() > wait_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after {max_wait_seconds} seconds"
        )
    time.sleep(poll_interval_seconds)

print("Ready to go!")

# Connect to the index
index = pinecone.Index(index_name)
print(index.describe_index_stats())

Ready to go!
{'dimension': 384,
 'index_fullness': 0.04838,
 'namespaces': {'': {'vector_count': 4838}},
 'total_vector_count': 4838}


### Load Dataset

In [7]:
from datasets import load_dataset

# Load the train split of the chunked dataset from the Hugging Face Hub and
# convert it straight to a pandas DataFrame
data = load_dataset(
    'jamescalam/llama-2-arxiv-papers-chunked', split='train'
).to_pandas()

Downloading readme:   0%|          | 0.00/409 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# Embed every chunk and upload it to Pinecone, but only when the index is still
# empty. This keeps Restart & Run All working against a fresh index without
# re-embedding and re-uploading the whole dataset on every later run.
# NOTE: an index left partially filled by an interrupted run is treated as
# populated; empty or recreate the index to force a full re-upload.
if index.describe_index_stats()['total_vector_count'] == 0:
    # Iterate through the data one batch at a time
    for i in range(0, len(data), batch_size):
        # Positional slicing clamps at the end of the frame, so the final
        # (possibly shorter) batch needs no special handling
        batch = data.iloc[i:i + batch_size]
        # Create a unique ID from doi + chunk-id
        ids = [
            f"{doi}-{chunk_id}"
            for doi, chunk_id in zip(batch['doi'], batch['chunk-id'])
        ]
        # Extract the text of each chunk and embed it; a separate name is used
        # so the sample `embeddings` from the earlier cell are not overwritten
        texts = batch['chunk'].tolist()
        batch_embeddings = embed_model.embed_documents(texts)
        # Metadata stored next to each vector; 'text' holds the raw chunk
        metadata = [
            {'text': text, 'source': source, 'title': title}
            for text, source, title in zip(texts, batch['source'], batch['title'])
        ]

        # Upload (id, vector, metadata) triples to Pinecone
        index.upsert(vectors=list(zip(ids, batch_embeddings, metadata)))

index.describe_index_stats()

### Load Model

In [9]:
from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-13b-chat-hf'

# Optional 4-bit (NF4, double-quantized, bfloat16 compute) loading.
# It is off by default, which matches the previous behaviour of this cell where
# the quantization argument was commented out. Set the flag to True to load
# the model quantized (presumably requires the `bitsandbytes` package - confirm
# it is installed and supported on your platform first).
use_4bit_quantization = False
quantization_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Load Hugging Face Token
load_dotenv()
hugging_face_token = os.environ.get('HF_AUTH_TOKEN')

# Set model configuration
model_config = transformers.AutoConfig.from_pretrained(
    pretrained_model_name_or_path=model_id,
    token=hugging_face_token
)

# Only pass the quantization settings when explicitly enabled above
model_load_kwargs = {}
if use_4bit_quantization:
    model_load_kwargs['quantization_config'] = quantization_config

# Load the model, letting `device_map='auto'` decide where the weights go.
# `trust_remote_code=True` was dropped: this checkpoint loads with the model
# classes that ship with transformers, so there is no need to allow code from
# the model repository to be executed.
model = transformers.AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    config=model_config,
    device_map='auto',
    token=hugging_face_token,
    **model_load_kwargs
)

# Set model to evaluation mode
model.eval()
print(torch.cuda.is_available())

config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



True


In [10]:
# Fetch the tokenizer published alongside `model_id`, authenticating with the
# same Hugging Face token that was used to load the model
tokenizer = transformers.AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_id,
    token=hugging_face_token,
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [11]:
from langchain.llms import HuggingFacePipeline

# Wrap the loaded model and tokenizer in a Hugging Face text-generation pipeline
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    # Return the prompt together with the completion (langchain expects the full text)
    return_full_text=True,
    # Upper bound on the number of tokens generated per call
    max_new_tokens=512,
    # Very low temperature; presumably chosen to keep outputs close to deterministic
    temperature=0.01,
    # Without this penalty the output begins repeating itself
    repetition_penalty=1.1,
)

# Expose the pipeline through LangChain's LLM interface
llm = HuggingFacePipeline(pipeline=generate_text)

In [12]:
# Smoke-test the bare LLM (no retrieval involved) with a general-knowledge prompt
llm(prompt="Explain to me the difference between nuclear fission and fusion.")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 

In [ ]:
# Import LangChain's Pinecone vector store under an alias. Importing it as
# plain `Pinecone` would shadow the Pinecone client class imported earlier in
# the notebook, so re-running the client-creation cell afterwards would
# silently construct the wrong class.
from langchain.vectorstores import Pinecone as PineconeVectorStore
from langchain.chains import RetrievalQA

# Metadata key that holds each chunk's raw text in the index
text_field = 'text'

# Vector store over the existing index; queries are embedded with the same
# model that was used to embed the documents
vectorstore = PineconeVectorStore(
    index,
    embed_model.embed_query,
    text_field
)

# 'stuff' chain: the retrieved chunks are placed directly into the LLM prompt
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm, chain_type='stuff',
    retriever=vectorstore.as_retriever()
)

# Use our RAG pipeline
rag_pipeline('what is so special about llama 2?')