In [1]:
#!rm -rf /gpfs/space/projects/stud_ml_22/NLP/data/chroma_distilbert/*

In [1]:
!ls /gpfs/space/projects/stud_ml_22/NLP/llama_langchain

data_exploration.py  main.py  requirements.txt


In [8]:
#!module load cuda/11.7.0

In [None]:
# !cd ../bitsandbytes
# !CUDA_VERSION=117 make cuda11x_nomatmul
# !cd ../tartu-nlp-courses-qa

In [1]:
!nvidia-smi

Mon May 22 01:38:53 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB            Off| 00000000:B2:00.0 Off |                    0 |
| N/A   32C    P0               55W / 300W|      0MiB / 32768MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import torch
import numpy as np
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModel
from accelerate import Accelerator
import time

from pydantic import BaseModel, Extra, Field, root_validator
from typing import Any, List, Optional, Dict, Sequence
from chromadb.utils import embedding_functions


In [3]:
torch.cuda.is_available()

True

In [4]:
shared_dir = '/gpfs/space/projects/stud_ml_22/NLP'
PATH_TO_CONVERTED_WEIGHTS = os.path.join(shared_dir, "llama/13B_Vicuna_added/")
PATH_TO_CONVERTED_TOKENIZER = os.path.join(shared_dir, "llama/13B_Vicuna_added/")

In [5]:
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import Callbacks
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings
# from langchain.llms import OpenAI
from langchain.schema import Generation
from langchain.schema import PromptValue, LLMResult
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma


# langchain.document_loaders.DataFrameLoader has a quite a limited functionality
class DataFrameLoader(BaseLoader):
    def __init__(self, data_frame: Any, page_content_columns: List[str]):
        if not isinstance(data_frame, pd.DataFrame):
            raise ValueError(
                f"Expected data_frame to be a pd.DataFrame, got {type(data_frame)}"
            )
        self.data_frame = data_frame
        self.page_content_columns = page_content_columns

    def load(self) -> List[Document]:
        result = []
        for i, row in self.data_frame.iterrows():
            text = ""
            metadata = {}
            for col in self.page_content_columns:
                data = row[col]
                if isinstance(data, list):
                    text += "".join(data) + "\n"
                elif isinstance(data, str):
                    text += data + "\n"
                else:
                    print(f"[IGNORED] [{i}] [{col}] {data}")

            metadata_temp = row.to_dict()
            for col in self.page_content_columns:
                metadata_temp.pop(col)
            # Metadata is a dict where a value can only be str, int, or float. Delete other types.
            for key, value in metadata_temp.items():
                if isinstance(value, (str, int, float)):
                    metadata[key] = value

            result.append(Document(page_content=text, metadata=metadata))
        return result

In [42]:
def get_model(encoder_only = False):
        device = torch.device(
            "cuda") if torch.cuda.is_available() else torch.device("cpu")

        config = AutoConfig.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        config.max_position_embeddings = 2048

        model = AutoModelForCausalLM.from_pretrained(
            PATH_TO_CONVERTED_WEIGHTS,
            config=config,
            trust_remote_code=True,
            # use_cache=not args.no_gradient_checkpointing,
            load_in_8bit=True,
            device_map={"": Accelerator().process_index},
            # device_map="auto"
        )
        
        return model

In [43]:
from langchain.schema import BaseMessage, LLMResult, PromptValue, get_buffer_string

class LlamaWrapperModel(BaseLanguageModel):
    model: Any
    
    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        values['model'] = get_model()
        # values['model'] = 'model'
        return values
    
    def predict(self, text: str, *, stop: Optional[Sequence[str]] = None) -> str:
        pass
    
    def predict_messages(
        self, messages: List[BaseMessage], *, stop: Optional[Sequence[str]] = None
    ) -> BaseMessage:
        pass

    def generate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
                        callbacks: Callbacks = None) -> LLMResult:
        tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
              
        with torch.no_grad():
            prompt = prompts[0].text
            
            print("Tokenizing...")
            s = time.time()
            inputs = tokenizer(prompt, return_tensors="pt")
            e1 = time.time()
            print("Time to tokenize: ", time.strftime(
                '%H:%M:%S', time.gmtime(e1 - s)))
            
            print("Generating...")
            generate_ids = self.model.generate(input_ids=inputs.input_ids.to(
                device), max_length=3000)  # max_length = max_new_tokens + prompt_length
            e2 = time.time()
            print("Time to generate: ", time.strftime(
                '%H:%M:%S', time.gmtime(e2 - e1)))
            
            print("Decoding...")
            text_result = tokenizer.batch_decode(
                generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            e3 = time.time()
            print("Time to decode: ", time.strftime(
                '%H:%M:%S', time.gmtime(e3 - e2)))

        generation = Generation(text=text_result)
        result = LLMResult(generations=[[generation]])
        return result

    async def agenerate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
                               callbacks: Callbacks = None) -> LLMResult:
        pass  # "whatever dude"


In [44]:
# NOTE: the OpenAIEmbeddings embeddings have the dimensionality of 1536

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class DistilbertEmbeddings(Embeddings):
   
    def __init__(self):       
        self.model = AutoModel.from_pretrained('distilbert-base-uncased').to(device)
        self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
       

        # return values
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # Tokenize the messages and generate embeddings
        tokenized = [self.tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True) for text in texts]
        padded = np.array([i + [0]*(512-len(i)) for i in tokenized])
       
        input_ids = torch.tensor(padded)
        embds = []
        with torch.no_grad():
            for i in range(0, len(input_ids)-32, 32):
                batch = input_ids[i:i+32].to(device)
                last_hidden_states = self.model(batch)[0][:,0,:].cpu().numpy().tolist()
                embds.extend(last_hidden_states)
            last_batch = input_ids[(len(input_ids) // 32) * 32:].to(device)
            last_hidden_states = self.model(last_batch)[0][:,0,:].cpu().numpy().tolist()
            embds.extend(last_hidden_states)
            
        return embds

    def embed_query(self, text: str) -> List[float]:
        # Tokenize the messages and generate embeddings
        tokenized = self.tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
        padded = np.array([tokenized+ [0]*(512-len(tokenized))])
        input_ids = torch.tensor(padded)
        with torch.no_grad():
            last_hidden_states = self.model(input_ids.to(device))
            last_hidden_states = last_hidden_states[0][:,0,:].cpu().numpy()
        
        return last_hidden_states[0].tolist()

In [45]:
class InstructorEmbeddings(Embeddings):
   
    def __init__(self):       
        self.model = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-base", device="cuda")
       
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.model(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return self.model(text)

# Test the model

In [9]:
#df = pd.read_pickle(f'{shared_dir}/data/course_info.pkl')

In [10]:
df = pd.read_csv(f'{shared_dir}/data/courses_info_all.csv')

In [11]:
q_bio = df[df['title_en'] == "Bioinformatics Seminar"]['all_course_info'].values[0]

In [24]:
loader = DataFrameLoader(df, ["title_en", "all_course_info"])
# chunk size must be 512 because InstructorEmbeddings max_seq_length = 512
text_splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=0)

#embeddings = DistilbertEmbeddings()
embeddings = InstructorEmbeddings()

load INSTRUCTOR_Transformer
max_seq_length  512


In [25]:
documents = loader.load()
texts = text_splitter.split_documents(documents)

[IGNORED] [2853] [title_en] nan
[IGNORED] [2857] [title_en] nan


In [26]:
df.columns

Index(['title_en', 'all_course_info'], dtype='object')

In [27]:
texts[0].page_content

'Private International Law\nThe name of the course is Private International Law. The purpose of the course is  The purpose of the course is to teach the theoretical principles of private international law (PIL); to explain the key-notions of PIL and the development of its main institutes; to give an understanding of the main fields and terminology of PIL; to teach students to use the instruments of PIL to determine the international jurisdiction of courts and the applicable law to a given dispute, as well as to assess the possibilities of recognizing and enforcing foreign judgments in Estonia. At the end of the course the student is able to:The course covers the main topics of private international law (PIL), its terminology and development. The instruments which govern the determination of international jurisdiction and the applicable law in Estonia, as well as their most relevant norms, will be covered with the aim of teaching students to use them in practice (the Hague conventions, 

In [30]:
len(embeddings.embed_documents([texts[0].page_content])[0])

768

In [31]:
# this will create the chroma embedding database!!!

db = Chroma.from_documents(texts, embeddings)

Using embedded DuckDB without persistence: data will be transient


In [34]:
db.similarity_search_with_score("NLP")

[(Document(page_content='Seminar on Natural Language Processing\nThe name of the course is Seminar on Natural Language Processing. The purpose of the course is  The student will get a quick and painful hands-on acquaintance and experience with one of actual problems of NLP by participating in one of the international competitions on machine translation, dependency parsing, semantic processing, etc.2018-2019 Fall  seminar\'s topic will be about Natural Language Processing and Wikipedia. As we all know, the advancement in NLP relies on good quality data.  In the last years, Wikipedia, the largest collaborative online encyclopedia, is used more and more for major NLP tasks. Wikipedia is not simply a corpus but it has a semantic structure that can be exploited in various ways (e.g. it each Wikipedia page is tagged with a set of categories, homonyms take us to a disambiguation page, Wikipedia contains infoboxes that can be used in supervised learning etc ...) In the seminar, we will explore

In [16]:
#!/gpfs/space/home/zaliznyi/miniconda3/envs/nlp/bin/python3.10 -m pip install bitsandbytes
#!/gpfs/space/home/zaliznyi/miniconda3/envs/nlp/bin/python3.10 -m pip uninstall tokenizers

In [46]:
llm = LlamaWrapperModel()

Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.45s/it]


In [47]:
from langchain.prompts import PromptTemplate

prompt_template = """Your task is to provide students with information about courses offered in University of Tartu. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.


{context}

Question: {question}
Helpful Answer:"""

llm_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
chain_prompt = PromptTemplate(
    input_variables=["page_content"], template="{page_content}."
)

In [48]:
# CANNOT LOAD FROM DISK BECAUSE OF EMBD DIM INITIALIZATION BUG
# db = Chroma(persist_directory=f"{shared_dir}/data/chroma", embedding_function=embeddings)  # load from disk 
retriever = db.as_retriever()
retriever.search_kwargs = {'k': 3}
chain_type_kwargs = {"prompt": llm_prompt}

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, chain_type_kwargs=chain_type_kwargs)
qa.combine_documents_chain.document_prompt = chain_prompt

In [49]:
query = "What is the purpose of Bioinformatics Seminar?"
print(qa(query))

Tokenizing...
Time to tokenize:  00:00:00
Generating...
Time to generate:  00:13:09
Decoding...
Time to decode:  00:00:00
{'query': 'What is the purpose of Bioinformatics Seminar?', 'result': 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nBioinformatics Seminar\nThe name of the course is Bioinformatics Seminar. The purpose of the course is  To get acquainted with modern bioinformatics research publications and presentation of the results therein Upon successful completion of this course, students should be able to:The seminar is organized in the form of a Journal Club. Scientific results are presented and discussed. The topics covered will vary from semester to semester. Please see the seminar website on http://courses.cs.ut.ee for more details.Language of instruction is Estonian. The course is offered by Chair of Data Science. The course is taught in 2023/2024 yea