# Setting up Google Colab and Hugging Face API

## Setting up Google Colab
1. **Open Google Colab**: Go to [Google Colab](https://colab.research.google.com/) in your browser.
2. **Create a new notebook**: Click on "File" -> "New Notebook".
3. **Connect to runtime**: Click on the "Connect" button at the top right corner of the screen. This will connect your notebook to a virtual machine; GPU support is enabled in the next step.
4. **Set GPU as hardware accelerator**: Go to "Runtime" -> "Change runtime type" and select "GPU" from the hardware accelerator dropdown menu. This will ensure your notebook is using a GPU, which is highly recommended for working with large language models like Llama 2.
5. **Install required packages**: Run the installation commands provided in the script to install all necessary dependencies for the project. These include `torch`, `transformers`, `langchain`, and others.

## Getting a Hugging Face API Token
1. **Create a Hugging Face account**: Go to [Hugging Face](https://huggingface.co/) and create an account if you don’t already have one.
2. **Generate an API Token**: After logging in, click on your profile icon in the top right corner, and go to "Settings".
3. **Access Tokens**: On the settings page, navigate to the "Access Tokens" tab.
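4. **Create a new token**: Click on "New Token", give it a name, and set the role to "write". This token will be used to authenticate with Hugging Face and download models. Note that the Llama 2 weights are gated: your account must also be granted access on the model's Hugging Face page before the download will succeed.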
5. **Copy the Token**: Copy the generated token and paste it into the script as the value of the `auth_token` variable, replacing the placeholder string.

---


In [None]:

# Notebook shell commands: the leading "!" is IPython syntax, so these lines
# only run inside a notebook cell, not as plain Python.
# PyTorch packages are taken from the PyTorch cu117 wheel index and upgraded.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --upgrade
# Remaining libraries are installed unpinned from the default package index.
!pip install langchain einops accelerate transformers bitsandbytes scipy
!pip install xformers sentencepiece 
# llama-index and llama_hub are pinned to exact versions.
!pip install llama-index==0.7.21 llama_hub==0.0.19

# Torch supplies the dtype constant; transformers supplies the tokenizer,
# causal-LM and streaming classes used for generation
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

# Hugging Face repository id of the Llama 2 chat weights to load
name = "meta-llama/Llama-2-70b-chat-hf"
# Hugging Face access token (replace the placeholder with your own token)
auth_token = "YOUR HUGGING FACE AUTH TOKEN HERE"

# Download options shared by the tokenizer and the model
_hub_options = dict(cache_dir='./model/', use_auth_token=auth_token)

# Tokenizer that matches the chosen weights
tokenizer = AutoTokenizer.from_pretrained(name, **_hub_options)

# Model: float16 dtype, dynamic RoPE scaling with factor 2, loaded in 8-bit
model = AutoModelForCausalLM.from_pretrained(
    name,
    torch_dtype=torch.float16,
    rope_scaling={"type": "dynamic", "factor": 2},
    load_in_8bit=True,
    **_hub_options,
)

# Set up a prompt. Adjacent string literals are concatenated by Python, so the
# source indentation does not leak into the prompt text (a backslash
# continuation inside a single literal embeds the next line's leading spaces).
prompt = (
    "### User:What is the fastest car in "
    "the world and how much does it cost? "
    "### Assistant:"
)

# Tokenize the prompt and move the resulting tensors to the model's device
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Stream decoded text as it is produced, hiding the prompt and special tokens
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Run generation with a finite integer budget: max_new_tokens is an integer
# setting, and a hard cap guarantees the call terminates even if the model
# never emits an end-of-sequence token.
output = model.generate(**inputs, streamer=streamer,
                        use_cache=True, max_new_tokens=512)

# Convert the output tokens back to text
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Import the prompt wrapper...but for llama index
from llama_index.prompts.prompts import SimpleInputPrompt

# System prompt wrapped in the [INST] / <<SYS>> tags. It is assembled from
# adjacent string literals so that every newline and trailing space of the
# text is explicit in the source.
system_prompt = (
    "<s>[INST] <<SYS>>\n"
    "You are a helpful, respectful and honest assistant. Always answer as \n"
    "helpfully as possible, while being safe. Your answers should not include\n"
    "any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.\n"
    "Please ensure that your responses are socially unbiased and positive in nature.\n"
    "\n"
    "If a question does not make any sense, or is not factually coherent, explain \n"
    "why instead of answering something not correct. If you don't know the answer \n"
    "to a question, please don't share false information.\n"
    "\n"
    "Your goal is to provide answers relating to the financial performance of \n"
    "the company.<</SYS>>\n"
)

# Llama index wrapper around a Hugging Face model
from llama_index.llms import HuggingFaceLLM

# Template that places the user's query in front of the closing [/INST] tag
query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")

# Render the template once with a sample query (the result is not stored)
query_wrapper_prompt.format(query_str='hello')

# Wrap the already-loaded model and tokenizer, together with the system
# prompt and the query template, in a llama index LLM
llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=256,
)

# Embedding classes (used to represent document chunks) and the service
# context helpers
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext, set_global_service_context
from llama_index.embeddings import LangchainEmbedding

# Build the Hugging Face embedding model, then wrap it for llama index
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = LangchainEmbedding(hf_embeddings)

# Service context combining the LLM, the embedding model and the chunk size
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embeddings,
    chunk_size=1024,
)

# Register it as the global service context
set_global_service_context(service_context)

# Dependencies for loading and indexing documents
from pathlib import Path
from llama_index import VectorStoreIndex, download_loader

# Fetch the PDF reader class and instantiate it
PyMuPDFReader = download_loader("PyMuPDFReader")
loader = PyMuPDFReader()

# Read the annual report, requesting metadata alongside the content
report_path = Path('./data/annualreport.pdf')
documents = loader.load(file_path=report_path, metadata=True)

# Build a vector index over the documents and expose it as a query engine
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# Try out a natural-language query against the indexed report
response = query_engine.query("what was the FY2022 return on equity?")
