In [5]:
!nvidia-smi

Mon May 22 17:40:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.106.00   Driver Version: 460.106.00   CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-PCIE-40GB      Off  | 00000000:41:00.0 Off |                    0 |
| N/A   30C    P0    36W / 250W |      0MiB / 40536MiB |     31%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch
import numpy as np
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModel
from accelerate import Accelerator
import time

from pydantic import BaseModel, Extra, Field, root_validator
from typing import Any, List, Optional, Dict, Sequence
from chromadb.utils import embedding_functions


In [7]:
torch.cuda.is_available()

True

In [8]:
shared_dir = '/gpfs/space/projects/stud_ml_22/NLP'

In [9]:
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import Callbacks
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings
# from langchain.llms import OpenAI
from langchain.schema import Generation
from langchain.schema import PromptValue, LLMResult
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma


# langchain.document_loaders.DataFrameLoader has a quite a limited functionality
class DataFrameLoader(BaseLoader):
    def __init__(self, data_frame: Any, page_content_columns: List[str]):
        if not isinstance(data_frame, pd.DataFrame):
            raise ValueError(
                f"Expected data_frame to be a pd.DataFrame, got {type(data_frame)}"
            )
        self.data_frame = data_frame
        self.page_content_columns = page_content_columns

    def load(self) -> List[Document]:
        result = []
        for i, row in self.data_frame.iterrows():
            text = ""
            metadata = {}
            for col in self.page_content_columns:
                data = row[col]
                if isinstance(data, list):
                    text += "".join(data) + "\n"
                elif isinstance(data, str):
                    text += data + "\n"
                else:
                    print(f"[IGNORED] [{i}] [{col}] {data}")

            metadata_temp = row.to_dict()
            for col in self.page_content_columns:
                metadata_temp.pop(col)
            # Metadata is a dict where a value can only be str, int, or float. Delete other types.
            for key, value in metadata_temp.items():
                if isinstance(value, (str, int, float)):
                    metadata[key] = value

            result.append(Document(page_content=text, metadata=metadata))
        return result

In [10]:
def get_model(encoder_only = False):
        PATH_TO_CONVERTED_WEIGHTS = os.path.join(
            shared_dir, "llama/13B_Vicuna_added/")

        device = torch.device(
            "cuda") if torch.cuda.is_available() else torch.device("cpu")

        config = AutoConfig.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        config.max_position_embeddings = 1024

        model = AutoModelForCausalLM.from_pretrained(
            PATH_TO_CONVERTED_WEIGHTS,
            config=config,
            trust_remote_code=True,
            # use_cache=not args.no_gradient_checkpointing,
            load_in_8bit=True,
            device_map={"": Accelerator().process_index},
            # device_map="auto"
        )
        
        return model

In [28]:
from langchain.schema import BaseMessage, LLMResult, PromptValue, get_buffer_string

class LlamaWrapperModel(BaseLanguageModel):
    model: Any
    
    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        values['model'] = get_model()
        # values['model'] = 'model'
        return values
    
    def predict(self, text: str, *, stop: Optional[Sequence[str]] = None) -> str:
        pass
    
    def predict_messages(
        self, messages: List[BaseMessage], *, stop: Optional[Sequence[str]] = None
    ) -> BaseMessage:
        pass

    def generate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
                        callbacks: Callbacks = None) -> LLMResult:

        device = torch.device(
            "cuda") if torch.cuda.is_available() else torch.device("cpu")

        PATH_TO_CONVERTED_TOKENIZER = os.path.join(
            shared_dir, "llama/13B_converted/")
        tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
              
        with torch.no_grad():
            prompt = prompts[0].text
            
            print("Tokenizing...")
            s = time.time()
            inputs = tokenizer(prompt, return_tensors="pt")
            e1 = time.time()
            print("Time to tokenize: ", time.strftime(
                '%H:%M:%S', time.gmtime(e1 - s)))
            
            max_length = len(inputs.input_ids) + 500
            
            print("Generating...")
            generate_ids = self.model.generate(input_ids=inputs.input_ids.to(
                device), max_new_tokens=max_length)  # max_length = max_new_tokens + prompt_length
            e2 = time.time()
            print("Time to generate: ", time.strftime(
                '%H:%M:%S', time.gmtime(e2 - e1)))
            
            print("Decoding...")
            text_result = tokenizer.batch_decode(
                generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            e3 = time.time()
            print("Time to decode: ", time.strftime(
                '%H:%M:%S', time.gmtime(e3 - e2)))

        generation = Generation(text=text_result)
        result = LLMResult(generations=[[generation]])
        return result

    async def agenerate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
                               callbacks: Callbacks = None) -> LLMResult:
        pass


In [12]:
class InstructorEmbeddings(Embeddings):
   
    def __init__(self):       
        self.model = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-base", device="cuda")
       
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.model(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return self.model(text)

# Test the model

In [14]:
df = pd.read_csv(f'{shared_dir}/data/documents_with_professors.csv')

In [15]:
df.iloc[0]

text    The name of the course is Private Internationa...
Name: 0, dtype: object

In [17]:
loader = DataFrameLoader(df, ["text"])
# chunk size must be 512 because InstructorEmbeddings max_seq_length = 512
text_splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=0)

#embeddings = DistilbertEmbeddings()
embeddings = InstructorEmbeddings()

load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
documents = loader.load()
texts = text_splitter.split_documents(documents)

In [19]:
len(embeddings.embed_documents([texts[0].page_content])[0])

768

In [20]:
# this will create the chroma embedding database!!!

db = Chroma.from_documents(texts, embeddings)

Using embedded DuckDB without persistence: data will be transient


In [21]:
db.similarity_search_with_score("Who's Kairit Sirts?")

[(Document(page_content='Kairit Sirts teaches Didactic Practice, Natural Language Processing', metadata={}),
  0.14056000113487244),
 (Document(page_content='Piret Kibur teaches Pulmonary Medicine and Thoracic Surgery', metadata={}),
  0.24095147848129272),
 (Document(page_content='Kaire Piirsalu-Kivihall teaches Qualitative Research Methods', metadata={}),
  0.24388602375984192),
 (Document(page_content='Teet Kaur teaches Singing and Vocal Training IV, Main Instrument', metadata={}),
  0.24394838511943817)]

In [29]:
llm = LlamaWrapperModel()

Loading checkpoint shards: 100%|████████████████████████████████████████| 3/3 [00:12<00:00,  4.32s/it]


In [30]:
from langchain.prompts import PromptTemplate

prompt_template = """
Your job is to answer student questions about the university of Tartu. You have to answer based on some information given below.
If there is no relevant information, you should tell the student that you do not know the answer.
Do not reveal any details of how question answering process works. Do not mention the information given to you.
When possible, rephrase the answer so it follows the grammar rules, flows naturally and is easy to understand.
Only use the context from the paragraph that is relevant to the question the most. Only answer the question that is asked. Do not add any additional information.
 
{context}

Question: {question}
Helpful Answer:"""

llm_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
chain_prompt = PromptTemplate(
    input_variables=["page_content"], template="{page_content}."
)

In [42]:
# CANNOT LOAD FROM DISK BECAUSE OF EMBD DIM INITIALIZATION BUG
# db = Chroma(persist_directory=f"{shared_dir}/data/chroma", embedding_function=embeddings)  # load from disk 
retriever = db.as_retriever(search_kwargs={"k": 3})
chain_type_kwargs = {"prompt": llm_prompt}

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, chain_type_kwargs=chain_type_kwargs)
qa.combine_documents_chain.document_prompt = chain_prompt

In [57]:
query = "What courses should I take to learn python?"
response = qa(query)

Tokenizing...
Time to tokenize:  00:00:00
Generating...
Time to generate:  00:00:10
Decoding...
Time to decode:  00:00:00


In [58]:
print(response['result'])


Your job is to answer student questions about the university of Tartu. You have to answer based on some information given below.
If there is no relevant information, you should tell the student that you do not know the answer.
Do not reveal any details of how question answering process works. Do not mention the information given to you.
When possible, rephrase the answer so it follows the grammar rules, flows naturally and is easy to understand.
Only use the context from the paragraph that is relevant to the question the most. Only answer the question that is asked. Do not add any additional information.
 
The name of the course is English for Intermediate Learners. Grammar and Vocabulary Course (100% Web-Based), Level B2.1 > B2.2. The code of it is HVLC.01.063.The purpose of the course is  The objective of the course is to develop students' grammar and vocabulary skills at B2 level. By the end of the course, students will have expanded and consolidated vocabulary, improved their know