In [1]:
# Show the GPUs visible on this machine (model, memory usage, utilization) before loading the model.
!nvidia-smi


Fri Dec 15 00:56:50 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off | 00000000:01:00.0 Off |                    0 |
| N/A   24C    P0              61W / 500W |      4MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          Off | 00000000:41:00.0 Off |  

### Initialize Pinecone vectorstore

In [2]:
import os
import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

# Sentence-transformers model used to embed queries; its output size has to
# match the dimension of the Pinecone index queried below.
embedder = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

index_name = "business-listings"  # name of the existing Pinecone index
text_field = "document"  # passed to the vectorstore as the key holding the document text

# Read the API key from the environment and fail fast when it is missing,
# instead of handing an empty key to Pinecone.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this notebook."
    )

# Initialize index
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment="us-east-1-aws"
)

index = pinecone.Index(index_name)

# LangChain wrapper around the index: queries are embedded with
# `embedder.embed_query` before being sent to Pinecone.
vectorstore = Pinecone(
    index, embedder.embed_query, text_field
)


  from tqdm.autonotebook import tqdm


In [3]:
# Sanity check: display the index statistics (dimension, fullness, vector counts per namespace).
index.describe_index_stats()


{'dimension': 384,
 'index_fullness': 0.9,
 'namespaces': {'': {'vector_count': 977159}},
 'total_vector_count': 977159}

### Load LLM

In [4]:
from torch import cuda, bfloat16
import transformers

# Hugging Face Hub id of the chat model to load (gated: requires an auth token).
model_id = 'meta-llama/Llama-2-13b-chat-hf'

# Current CUDA device when a GPU is available, otherwise the CPU. This value is
# only used in the status message at the end of the cell.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, need auth token for these
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=True
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=True
)

# `trust_remote_code` is intentionally NOT enabled: the model is loaded through
# the stock AutoModelForCausalLM class, so there is no reason to allow code
# shipped inside the Hub repository to run on this machine.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=True
)
model.eval()  # inference only
# NOTE: with device_map='auto' the library decides where the weights go; the
# message below reports the current CUDA device, not a verified placement.
print(f"Model loaded on {device}")




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded on cuda:0


### Initialize pipeline to use in chain

In [5]:
from langchain.llms import HuggingFacePipeline

# Text-generation pipeline wrapping the already loaded model and tokenizer.
# Decoding is greedy (do_sample=False), so no temperature is passed: temperature
# only affects sampling and would be ignored (with a warning) in this mode.
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    max_new_tokens=1024,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
    do_sample=False  # deterministic greedy decoding
)
# Expose the pipeline through LangChain's LLM interface so chains can call it.
llm = HuggingFacePipeline(pipeline=generate_text)


### Define the custom prompt used to condense a follow-up question into a standalone question

In [10]:
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


# Template for the question-condensing step: it receives the chat history and
# the follow-up input and asks the model for a standalone question.
# Built line by line; the trailing empty entry keeps the final newline.
custom_template = "\n".join((
    "You are helpful recommender AI Assistant. Given the",
    "following conversation and a follow up question, rephrase the follow up question",
    "to be a standalone question. At the end of standalone question add this",
    "'Answer the question in English language.' If you do not know the answer reply with 'I am sorry, I dont have enough information'.",
    "Chat History:",
    "{chat_history}",
    "Follow Up Input: {question}",
    "Standalone question:",
    "",
))

# Placeholders ({chat_history}, {question}) are picked up from the template text.
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(template=custom_template)


In [11]:
import warnings
warnings.filterwarnings('ignore')


### Initialize Conversational Retrieval Chain and run chatbot

In [None]:
# Conversation memory: stores the running dialogue under the "chat_history" key,
# which is the variable the condense-question prompt reads.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Retrieval chain: condenses each follow-up into a standalone question, fetches
# the 5 nearest documents from the vectorstore and lets the LLM answer from them.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    memory=memory,
    condense_question_prompt=CUSTOM_QUESTION_PROMPT,
    verbose=True
)

# Plain (question, answer) transcript kept for inspection after the session;
# the chain itself gets its history from `memory`, not from this list.
chat_history = []
exit_commands = {"exit", "quit", "q"}

print("My personal recommender:")
print("=" * 100 + "\n\n\n")
while True:
    try:
        query = input('User: ').strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or interrupted kernel: leave the loop cleanly
        # instead of ending the cell with a traceback.
        print('Exiting')
        break

    # Exit commands are matched case-insensitively and ignoring surrounding spaces.
    if query.lower() in exit_commands:
        print('Exiting')
        break

    # Don't spend a retrieval + generation round trip on an empty line.
    if not query:
        continue

    result = qa_chain({"question": query})
    print('Assistant: ' + result['answer'] + '\n')
    chat_history.append((query, result['answer']))


My personal recommender:





User:  What are some great restaurants in College Park to take my crush out on a date?




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

### Address: Stanford Grill, 8900 Stanford Blvd, Columbia, MD 21045, United States
### Name: Stanford Grill
### Review: Favorite date night spot of ours: live jazz most evenings, a pleasant bar, fantastic dinners and drinks, and wonderful waiters. Not cheap, especially if you get drinks, appetizers, or desserts, but all around an excellent dining experience that's a serious step above most eating-out options. Twice as nice if you can get Steve as your waiter... seriously, the guy almost single-handedly sold us on making this our regular fancy date night spot, he's just that nice.
### Average Rating: 4.60
### Hours: [['Sunday', '12–8PM'], ['Monday', '12–8PM'], ['Tuesday', '12–8PM'], ['Wednesda