In [3]:
import os
import time
import openai
import pinecone
from tqdm.auto import tqdm
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from nemoguardrails import LLMRails, RailsConfig

from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably where PINECONE_KEY and the OpenAI key live — confirm).
load_dotenv()

True

In [34]:
# Load the train split of the pre-chunked arXiv papers dataset.
data = load_dataset("jamescalam/llama-2-arxiv-papers-chunked", split="train")

In [35]:
def _with_uid(row):
    """Build the unique chunk id '<doi>-<chunk-id>' for one dataset row."""
    return {'uid': f"{row['doi']}-{row['chunk-id']}"}


# Add a `uid` column to every row, then display the dataset summary.
data = data.map(_with_uid)
data

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references', 'uid'],
    num_rows: 4838
})

In [36]:
# Convert the dataset to a pandas DataFrame for the slicing done in later cells.
# Guarded so that re-running this cell, when `data` is already a DataFrame
# (which has no `to_pandas`), is a no-op instead of raising AttributeError.
if hasattr(data, "to_pandas"):
    data = data.to_pandas()

In [37]:
# Preview the id, chunk text, title and source columns.
data[['uid', 'chunk', 'title', 'source']]

Unnamed: 0,uid,chunk,title,source
0,1102.0183-0,High-Performance Neural Networks\nfor Visual O...,High-Performance Neural Networks for Visual Ob...,http://arxiv.org/pdf/1102.0183
1,1102.0183-1,"January 2011\nAbstract\nWe present a fast, ful...",High-Performance Neural Networks for Visual Ob...,http://arxiv.org/pdf/1102.0183
2,1102.0183-2,promising architectures for such tasks. The mo...,High-Performance Neural Networks for Visual Ob...,http://arxiv.org/pdf/1102.0183
3,1102.0183-3,"Mutch and Lowe, 2008), whose lters are xed, ...",High-Performance Neural Networks for Visual Ob...,http://arxiv.org/pdf/1102.0183
4,1102.0183-4,We evaluate various networks on the handwritte...,High-Performance Neural Networks for Visual Ob...,http://arxiv.org/pdf/1102.0183
...,...,...,...,...
4833,2307.09288-315,"BytheCentralLimitTheorem, Zntendstowardsastand...",Llama 2: Open Foundation and Fine-Tuned Chat M...,http://arxiv.org/pdf/2307.09288
4834,2307.09288-316,Table 52 presents a model card (Mitchell et al...,Llama 2: Open Foundation and Fine-Tuned Chat M...,http://arxiv.org/pdf/2307.09288
4835,2307.09288-317,models will be released as we improve model sa...,Llama 2: Open Foundation and Fine-Tuned Chat M...,http://arxiv.org/pdf/2307.09288
4836,2307.09288-318,Training Factors We usedcustomtraininglibrarie...,Llama 2: Open Foundation and Fine-Tuned Chat M...,http://arxiv.org/pdf/2307.09288


In [38]:
# Look at the raw text of the first chunk.
data['chunk'][0]

'High-Performance Neural Networks\nfor Visual Object Classi\x0ccation\nDan C. Cire\x18 san, Ueli Meier, Jonathan Masci,\nLuca M. Gambardella and J\x7f urgen Schmidhuber\nTechnical Report No. IDSIA-01-11\nJanuary 2011\nIDSIA / USI-SUPSI\nDalle Molle Institute for Arti\x0ccial Intelligence\nGalleria 2, 6928 Manno, Switzerland\nIDSIA is a joint institute of both University of Lugano (USI) and University of Applied Sciences of Southern Switzerland (SUPSI),\nand was founded in 1988 by the Dalle Molle Foundation which promoted quality of life.\nThis work was partially supported by the Swiss Commission for Technology and Innovation (CTI), Project n. 9688.1 IFF:\nIntelligent Fill in Form.arXiv:1102.0183v1  [cs.AI]  1 Feb 2011\nTechnical Report No. IDSIA-01-11 1\nHigh-Performance Neural Networks\nfor Visual Object Classi\x0ccation\nDan C. Cire\x18 san, Ueli Meier, Jonathan Masci,\nLuca M. Gambardella and J\x7f urgen Schmidhuber\nJanuary 2011\nAbstract\nWe present a fast, fully parameterizable G

In [39]:
from openai import OpenAI
# No key is passed explicitly; presumably the client picks up OPENAI_API_KEY from
# the environment populated by load_dotenv() — confirm.
client = OpenAI()

In [41]:
embed_model = "text-embedding-ada-002"
# Embed the first two chunks as a quick check of the embeddings endpoint.
res = client.embeddings.create(
    model=embed_model,
    input=[data['chunk'][row] for row in (0, 1)],
)

In [81]:
# Embedding dimensionality; the same expression is passed as `dimension` when the index is created.
len(res.data[0].embedding)

1536

In [6]:
# Pinecone client; the API key is read from the PINECONE_KEY environment variable.
# NOTE(review): os.environ.get returns None when the variable is unset — confirm how the client handles that.
pc = Pinecone(api_key=os.environ.get("PINECONE_KEY"))

In [40]:
# Name of the Pinecone index used by the create / describe / connect calls in this notebook.
index_name = "nemo-guardrails-rag-action"

In [7]:
# Names of the indexes visible to this Pinecone client.
pc.list_indexes().names()

['nemo-guardrails-rag-action']

In [86]:
# Create the index only if it does not exist yet, sized to the embedding dimension.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=len(res.data[0].embedding),
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-west-2")
    )

    # `status` is indexed, not called: `.status('ready')` raises TypeError.
    # Poll until the newly created index reports it is ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

In [8]:
# Connect to the index and show its stats (dimension, vector counts).
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4838}},
 'total_vector_count': 4838}

In [89]:
batch_size = 100  # how many embeddings we create and insert at once

# Embed the chunks batch by batch and upsert (id, vector, metadata) tuples to Pinecone.
for i in tqdm(range(0, len(data), batch_size)):
    # find end of batch (the last batch may be shorter than batch_size)
    i_end = min(len(data), i + batch_size)
    batch = data.iloc[i:i_end]
    # ids and texts of this batch, in the same row order
    ids_batch = batch['uid'].to_list()
    texts = batch['chunk'].to_list()
    # create embeddings for the whole batch in one request
    res = client.embeddings.create(input=texts, model=embed_model)
    embeds = [record.embedding for record in res.data]
    # metadata stored next to each vector: chunk text and source URL
    metadata = batch[['chunk', 'source']].to_dict(orient='records')
    to_upsert = list(zip(ids_batch, embeds, metadata))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

100%|██████████| 49/49 [05:12<00:00,  6.39s/it]


In [90]:
# Check the vector count after the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4838}},
 'total_vector_count': 4838}

##### Test the Pinecone vector store

In [14]:
# Embed a test query with the same model variable used to build the index, so the
# query model cannot drift from the one that produced the stored vectors.
xq = client.embeddings.create(
    input=['tell me about llama 2'],
    model=embed_model
)

In [17]:
# Show the vector length rather than the vector itself (the recorded output, 1536, is a length).
len(xq.data[0].embedding)

1536

In [25]:
# Query the index with the test embedding, asking for three matches with their metadata.
l = index.query(include_metadata=True, top_k=3, vector=xq.data[0].embedding)

In [None]:
# Inspect the first returned match.
l['matches'][0]

In [26]:
# Keep only the chunk text stored in each match's metadata.
contexts = [match['metadata']['chunk'] for match in l['matches']]

In [44]:
async def retrieve(query: str, top_k: int = 5) -> list:
    """Return the text chunks most similar to ``query`` from the Pinecone index.

    The query is embedded with ``embed_model`` through the module-level OpenAI
    ``client``, and the resulting vector is searched in the module-level ``index``.

    Args:
        query: Natural-language query to embed and search for.
        top_k: Number of matches to request from the index (default 5).

    Returns:
        A list holding the ``chunk`` metadata value of each match, in the order
        the index returned them.
    """
    import asyncio  # local import keeps this cell runnable on its own

    # The OpenAI and Pinecone calls below are synchronous; run them in the default
    # executor so the event loop is not blocked while waiting on the network.
    loop = asyncio.get_running_loop()
    embed_query = await loop.run_in_executor(
        None, lambda: client.embeddings.create(input=[query], model=embed_model)
    )
    query_embedding = embed_query.data[0].embedding
    res = await loop.run_in_executor(
        None,
        lambda: index.query(vector=query_embedding, top_k=top_k, include_metadata=True),
    )
    return [match['metadata']['chunk'] for match in res['matches']]



In [47]:
# Smoke-test the retriever directly; `use` is passed as the contexts of the rag() call below.
use = await retrieve('llama 2')

In [51]:
# Build the chat prompt by hand (plain strings, no templating library) and ask the model.
async def rag(query: str, contexts: list, model: str = "gpt-3.5-turbo-0125") -> str:
    """Answer ``query`` from the retrieved ``contexts`` with an OpenAI chat model.

    The contexts are joined with newlines and sent, together with the query, as
    the user message; the system message tells the model to answer only from
    those contexts. The request sets ``response_format`` to ``json_object``.

    Args:
        query: The user's question.
        contexts: Text chunks to ground the answer in (joined with newlines).
        model: Chat model name passed to the completions endpoint.

    Returns:
        The content string of the first completion choice.
    """
    import asyncio  # local import keeps this cell runnable on its own

    print("> RAG CALLED")
    context_str = '\n'.join(contexts)
    # No placeholders in the system prompt, so a plain string is enough.
    system = """You are a helpful assistant designed to output JSON. Given a query from a user and
    some relevant contexts. Answer the question given the information in those
    contexts. If you cannot find the answer to the question, say "I don't know".
    """
    user = f"""
    {context_str}

    Query: {query}
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]

    # The OpenAI client call is synchronous; run it in the default executor so the
    # event loop is not blocked while the completion is generated.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None,
        lambda: client.chat.completions.create(
            model=model,
            response_format={"type": "json_object"},
            messages=messages,
        ),
    )
    return response.choices[0].message.content


In [52]:
# Call rag() directly with the contexts held in `use`, before wiring it into guardrails.
await rag(query="tell me about llama 2", contexts=use)

> RAG CALLED


'{\n    "answer": "Llama 2 is a collection of pretrained and fine-tuned large language models ranging from 7 billion to 70 billion parameters. The fine-tuned LLMs in Llama 2 are optimized for dialogue use cases and outperform open-source chat models on most benchmarks. They may be a suitable substitute for closed-source models based on humane evaluations for helpfulness and safety.",\n    "source": "Llama 2 abstract"\n}'

##### GUARDRAILS

In [53]:
# Model configuration for the rails: a single `main` model using the openai engine.
yaml_content = """
models:
- type: main
  engine: openai
  model: gpt-3.5-turbo-0125
"""

# Colang definitions:
#   * `politics` flow - replies to political questions with one of the canned bot messages.
#   * `llm` flow - runs the `retrieve` and `rag` actions on the last user message
#     and replies with the resulting answer.
# NOTE(review): `bot offer help` has no `define bot offer help` in this string —
# presumably the LLM generates that message; confirm.
colang_content = """
# define limits
define user ask politics
    "what are your political beliefs?"
    "thoughts on the president?"
    "left wing"
    "right wing"

define bot answer politics
    "I'm a simple assistant, I don't like to talk of politics."
    "Sorry I can't talk about politics!"

define flow politics
    user ask politics
    bot answer politics
    bot offer help

# define RAG intents and flow
define user ask llm
    "tell me about llama 2?"
    "what is large language model"
    "where did meta's new model come from?" 
    "how to llama?"
    "have you ever meta llama?"

define flow llm
    user ask llm
    $contexts = execute retrieve(query=$last_user_message)
    $answer = execute rag(query=$last_user_message, contexts=$contexts)
    bot $answer
"""

In [54]:
# Build the rails configuration from the in-notebook YAML and Colang strings,
# then create the guardrails runtime from it.
config = RailsConfig.from_content(yaml_content=yaml_content, colang_content=colang_content)
rails = LLMRails(config)

In [55]:
# Expose both coroutines to the Colang flows under the names used by `execute ...`.
for action_name, action_fn in (("retrieve", retrieve), ("rag", rag)):
    rails.register_action(action=action_fn, name=action_name)

In [56]:
# General greeting: the recorded output shows no '> RAG CALLED', i.e. the rag action was not run.
await rails.generate_async(prompt='Hello')


'Hello! How can I assist you today?'

In [57]:
# LLM-related question: the recorded output shows '> RAG CALLED', i.e. the rag action was run.
await rails.generate_async(prompt='tell me about llama 2')

> RAG CALLED


'{\n    "answer": "Llama 2 is a collection of pretrained and fine-tuned large language models ranging from 7 billion to 70 billion parameters. The fine-tuned LLMs are optimized for dialogue use cases and outperform open-source chat models on most benchmarks tested. They may be a suitable substitute for closed-source models based on human evaluations for helpfulness and safety. Llama 2 models are developed and released by GenAI and Meta.",\n    "source": "GenAI, Meta"\n}'