In [1]:
!pip install -qU gradio
!pip install pinecone-client

If the cell above installed or upgraded any packages, restart the kernel before continuing so that the new versions are the ones imported below.

In [2]:
import os
import openai
# Take the OpenAI key from the environment when it is set and non-empty;
# otherwise the literal placeholder string is assigned and must be replaced
# with a real key before any OpenAI call can succeed.
openai.api_key = os.environ.get("OPENAI_API_KEY") or "OPENAI_API_KEY"

In [3]:
import tqdm
import pinecone

# Pinecone connection settings (create an API key at app.pinecone.io).
# An unset or empty environment variable falls back to the placeholder below.
api_key = os.environ.get("PINECONE_API_KEY") or "PINECONE_API_KEY"
# The environment name is shown next to the API key in the Pinecone console.
env = os.environ.get("PINECONE_ENVIRONMENT") or "gcp-starter"

In [4]:
# Configure the pinecone client with the API key and environment resolved in
# the previous cell.
pinecone.init(api_key=api_key, environment=env)
# Left as the cell's last expression so the notebook displays the identity
# the key resolves to (a quick check that the credentials are accepted).
pinecone.whoami()

WhoAmIResponse(username=None, user_label=None, projectname='14d2815')

In [5]:
# Name of the Pinecone index this notebook queries.
index_name = 'cnvrg-openai'
# connect to index (gRPC client)
index = pinecone.GRPCIndex(index_name)
# view index stats; last expression, so the notebook displays the dimension
# and vector counts of the index
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00464,
 'namespaces': {'': {'vector_count': 464}},
 'total_vector_count': 464}

In [11]:
# https://cismography.medium.com/how-to-integrate-custom-llm-using-langchain-a-gpt4all-example-cfcb6d26fc3
from pydantic import Field
from typing import List, Mapping, Optional, Any
from langchain.llms.base import LLM
import requests

class LLMaaS(LLM):
    """
    A custom LangChain LLM that forwards prompts to a hosted cnvrg LLMaaS chat endpoint.

    Every generation is an HTTP POST to ``https://app.llm.cnvrg.io/api/v1/chat``;
    nothing is loaded or executed locally. The API token is read from the
    ``LLMAAS_API_KEY`` environment variable, falling back to a literal
    placeholder that must be replaced before requests can be authorized.

    Arguments:

    model_name: (str) Label reported by ``_identifying_params``. It is not sent to
        the service; the remote model is selected by the ``model_uuid`` in the payload.
    allow_download: (bool) Stored on the instance but not used (``auto_download`` is a no-op).
    model_folder_path: (str) Declared but never read by this class.

    temperature: (float) Temperature to use for sampling
    top_p: (float) The top-p value to use for sampling
    top_k: (int) The top k values use for sampling
    max_new_tokens: (int) The maximum numbers of tokens to generate
    repetition_penalty: (float) The penalty to apply repeated tokens
    request_timeout: (float) Seconds to wait for the service before giving up

    """
    model_folder_path: Optional[str] = Field(None, alias='model_folder_path')
    model_name: str = Field(None, alias='model_name')
    allow_download: bool = Field(None, alias='allow_download')

    # Optional sampling parameters; each can be overridden per call via **kwargs.

    temperature:        Optional[float] = 0.5
    top_p:              Optional[float] = 1
    top_k:              Optional[int]   = 50
    max_new_tokens:     Optional[int]   = 250
    repetition_penalty: Optional[float] = 1

    # Upper bound (seconds) on a single HTTP request so a stalled service
    # cannot block the caller indefinitely.
    request_timeout:    Optional[float] = 120

    def __init__(self, model_name, allow_download=False, **kwargs):
        """
        Args:
            model_name: Label for this model (see the class docstring).
            allow_download: Stored on the instance; currently unused.
            **kwargs: Optional ``temperature``, ``top_p``, ``top_k``,
                ``max_new_tokens``, ``repetition_penalty`` and
                ``request_timeout``. Other keys are ignored.
        """
        super().__init__()
        self.model_name = model_name
        self.allow_download = allow_download
        self.temperature = kwargs.get("temperature", 0.5)
        self.top_p = kwargs.get("top_p", 1)
        self.top_k = kwargs.get("top_k", 50)
        self.max_new_tokens = kwargs.get("max_new_tokens", 250)
        self.repetition_penalty = kwargs.get("repetition_penalty", 1)
        self.request_timeout = kwargs.get("request_timeout", 120)

    def auto_download(self) -> None:
        """
        Placeholder; the model is served remotely, so this method does nothing.
        """
        pass

    @property
    def _get_model_default_parameters(self):
        """Instance-level sampling parameters as a plain dict."""
        return {
            "max_new_tokens": self.max_new_tokens,
            "top_k": self.top_k,
            "top_p": self.top_p,
            "temperature": self.temperature,
            "repetition_penalty": self.repetition_penalty,
        }

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """
        Get all the identifying parameters
        """
        return {
            'model_name' : self.model_name,
            'model_parameters': self._get_model_default_parameters
        }

    @property
    def _llm_type(self) -> str:
        return 'mpt'

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs) -> str:
        """
        Send ``prompt`` to the LLMaaS chat endpoint and return the response body.

        Args:
            prompt: The prompt to pass into the model.
            stop: Accepted for LangChain compatibility. It is neither forwarded
                to the service nor applied to the returned text.
            **kwargs: Per-call overrides for ``temperature``, ``top_p``,
                ``top_k``, ``max_new_tokens`` and ``repetition_penalty``.

        Returns:
            The raw response body as text (the response is not parsed).

        Raises:
            requests.HTTPError: If the service answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within ``request_timeout`` seconds.
        """
        headers = {
            'Content-Type': "application/json",
            # Prefer a token supplied through the environment over one pasted
            # into the notebook; the placeholder keeps the old behaviour when
            # the variable is unset.
            'x-api-key': os.getenv("LLMAAS_API_KEY") or "{your LLMaaS token here}",
        }
        url = "https://app.llm.cnvrg.io/api/v1/chat"

        # Per-call keyword arguments win over the instance-level defaults.
        inference_params = {
            "max_new_tokens": kwargs.get("max_new_tokens", self.max_new_tokens),
            "temperature": kwargs.get("temperature", self.temperature),
            "repetition_penalty": kwargs.get("repetition_penalty", self.repetition_penalty),
            "top_p": kwargs.get("top_p", self.top_p),
            "do_sample": "True",
            "num_beams": 0,
            "top_k": kwargs.get("top_k", self.top_k),
        }

        payload = {
            "model_uuid": "7e0a4264-4026-4ebb-8eb0-2cdedcace07b",
            "data": {
                "messages": [
                    {
                        "text": prompt,
                        "user_role": True
                    }
                ],
                "inference_params": inference_params,
                "system_prompt": "- You are a helpful assistant chatbot trained by Intel.\n- You answer questions.\n- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.\n- You are more than just an information source, you are also able to write poetry, short stories, and make jokes"
            }
        }

        response = requests.post(
            url, headers=headers, json=payload, timeout=self.request_timeout
        )
        # Surface HTTP failures instead of handing an error body back to the
        # chain as if it were generated text.
        response.raise_for_status()
        return response.text

In [7]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA

In [12]:
# Answers a question against the existing Pinecone index, so source files do
# not have to be loaded and embedded again for every query.
def data_querying(input_text):
    """
    Answer ``input_text`` with retrieval-augmented generation.

    The question is embedded with OpenAI, related entries are retrieved from
    the ``cnvrg-openai`` Pinecone index, and the ``LLMaaS`` model produces the
    answer through a "stuff" ``RetrievalQA`` chain.

    Args:
        input_text: The user's question.

    Returns:
        The chain's answer as a string, or a short prompt asking for a
        question when ``input_text`` is empty or only whitespace.
    """
    # Nothing to embed or retrieve for a blank question; skip the paid
    # embedding / LLM round trips entirely.
    if input_text is None or not str(input_text).strip():
        return "Please enter a question."

    # We must reinitialize Pinecone in order to load our previously created index.
    api_key = os.getenv("PINECONE_API_KEY") or "PINECONE_API_KEY"
    # find your environment next to the api key in pinecone console
    environment = os.getenv("PINECONE_ENVIRONMENT") or "gcp-starter"

    # Export the resolved key; presumably so libraries that read the variable
    # themselves see the same value -- TODO confirm this is still required.
    os.environ['PINECONE_API_KEY'] = api_key

    index_name = "cnvrg-openai"
    pinecone.init(api_key=api_key, environment=environment)

    embed = OpenAIEmbeddings(
        model='text-embedding-ada-002',
        openai_api_key=openai.api_key
    )

    # Metadata field in which the text contents of each indexed document are stored.
    text_field = "question"

    # load pinecone index for langchain
    index = pinecone.Index(index_name)
    vectorstore = Pinecone(index, embed.embed_query, text_field)

    # Using LangChain we pass in our model for text generation. The chain's
    # retriever performs the similarity search itself, so no separate search
    # call is needed here.
    llm = LLMaaS(temperature=0.5, model_name="mpt7b_2bs_5epoch")
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever()
    )
    # Finally we return the result of the search query.
    return qa.run(input_text)

In [None]:
import gradio as gr

# Create your Gradio interface.
# `gr.Textbox` is the top-level component; the legacy `gr.inputs` namespace
# was removed in Gradio 4.
iface = gr.Interface(fn=data_querying,
                     inputs=gr.Textbox(lines=7, label="Enter your question"),
                     outputs="text",
                     title="Test LLMaaS RAG")

In [14]:
# Launches the Gradio app.
# share=True publishes a public gradio.live URL in addition to the local one
# (see the cell output; the link expires after 72 hours).
iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://06578dde3b19c9beb1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




