In [1]:
!ls /gpfs/space/projects/stud_ml_22/NLP

data  experiments  llama  llama_langchain  tartu-nlp-courses-qa


In [2]:
!ls /gpfs/space/projects/stud_ml_22/NLP/llama_langchain

data_exploration.py  main.py  requirements.txt


In [56]:
# https://python.langchain.com/en/latest/getting_started/getting_started.html
# https://python.langchain.com/en/latest/use_cases/question_answering.html
# https://python.langchain.com/en/latest/modules/indexes/getting_started.html
# https://python.langchain.com/en/latest/use_cases/question_answering/semantic-search-over-chat.html

import os
import time
from typing import Any, List, Optional

import pandas as pd
import torch
from accelerate import Accelerator
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import Callbacks
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.llms import OpenAI
from langchain.schema import Generation
from langchain.schema import PromptValue, LLMResult
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer


# langchain.document_loaders.DataFrameLoader has quite limited functionality
class DataFrameLoader(BaseLoader):
    """Turn the rows of a pandas DataFrame into LangChain ``Document`` objects.

    For every row, the text of the selected columns becomes the document's
    ``page_content`` and the remaining columns become its ``metadata``
    (restricted to ``str``, ``int`` and ``float`` values).
    """

    def __init__(self, data_frame: Any, page_content_columns: List[str]):
        """Store the frame and the columns that make up the page content.

        Args:
            data_frame: The frame whose rows are converted to documents.
            page_content_columns: Columns whose text is concatenated, one per
                line and in the given order, into each document's page content.

        Raises:
            ValueError: If ``data_frame`` is not a ``pd.DataFrame``.
        """
        if not isinstance(data_frame, pd.DataFrame):
            raise ValueError(
                f"Expected data_frame to be a pd.DataFrame, got {type(data_frame)}"
            )
        self.data_frame = data_frame
        self.page_content_columns = page_content_columns

    def load(self) -> List[Document]:
        """Build one ``Document`` per row of the frame.

        Returns:
            A list with one document per row, in row order.

        Raises:
            KeyError: If a page-content column does not exist in the frame.
        """
        result = []
        for i, row in self.data_frame.iterrows():
            parts = []
            for col in self.page_content_columns:
                data = row[col]
                if isinstance(data, list):
                    # List cells are not guaranteed to hold only strings
                    # (e.g. lists of dicts), so stringify each item before joining.
                    parts.append("".join(str(item) for item in data))
                elif isinstance(data, str):
                    parts.append(data)
                else:
                    # Anything else (e.g. NaN for a missing value) contributes no text.
                    print(f"[IGNORED] [{i}] [{col}] {data}")
            text = "".join(part + "\n" for part in parts)

            metadata_temp = row.to_dict()
            for col in self.page_content_columns:
                # Default of None keeps a repeated column name from raising KeyError.
                metadata_temp.pop(col, None)
            # Metadata is a dict where a value can only be str, int, or float. Delete other types.
            # NOTE(review): NaN is a float, so missing numeric values pass this filter.
            metadata = {
                key: value
                for key, value in metadata_temp.items()
                if isinstance(value, (str, int, float))
            }

            result.append(Document(page_content=text, metadata=metadata))
        return result


class MyLanguageModel(BaseLanguageModel):
    """Stub language model that answers every prompt with "Hello World!".

    Useful for exercising a chain end to end without loading a real model.
    """

    def generate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
                        callbacks: Callbacks = None) -> LLMResult:
        """Return a fixed generation for each prompt.

        Args:
            prompts: The prompts to "answer"; their content is ignored.
            stop: Ignored.
            callbacks: Ignored.

        Returns:
            An ``LLMResult`` holding one single-item generation list per prompt.
        """
        # One entry per prompt, so the result lines up with the input.
        generations = [[Generation(text="Hello World!")] for _ in prompts]
        return LLMResult(generations=generations)

    async def agenerate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
                               callbacks: Callbacks = None) -> LLMResult:
        """Async variant; delegates to ``generate_prompt`` instead of returning ``None``."""
        return self.generate_prompt(prompts, stop=stop, callbacks=callbacks)


# NOTE: the OpenAIEmbeddings embeddings have the dimensionality of 1536
class MyEmbeddings(Embeddings):
    """Stub embedding model that returns constant 1536-dimensional vectors."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one constant vector per text.

        The vector for the k-th text (1-based) is filled with ``float(k)``, so
        two texts still yield ``[1.0, ...]`` and ``[2.0, ...]``.

        Args:
            texts: The texts to "embed"; only their count is used.

        Returns:
            ``len(texts)`` vectors of 1536 floats each.
        """
        return [[float(position)] * 1536 for position in range(1, len(texts) + 1)]

    def embed_query(self, text: str) -> List[float]:
        """Return a constant 1536-dimensional vector of ones, ignoring ``text``."""
        return [1.0] * 1536


In [57]:
# Root of the shared project directory; the data and model paths in this notebook are built from it.
shared_dir = '/gpfs/space/projects/stud_ml_22/NLP'

In [58]:
# Load the course-information table from the shared data directory.
# NOTE(review): unpickling can execute arbitrary code — only read pickle files from a trusted source.
df = pd.read_pickle(f'{shared_dir}/data/course_info.pkl')
# Preview the first rows.
df.head()

Unnamed: 0,uuid,code,parent_uuid,parent_code,parent_credits,title_en,general_input_languages,general_structural_unit_shares,general_year.en,general_type.code,...,resources_mandatory_materials,resources_recommended_materials,resources_learning_environments,participants_lecturers,participants_assistants,schedule_entries,schedule_weeks.et,registration_info_min_students,registration_info_max_students,registration_info_audience.en
0,a198ed66-1fb5-4f7e-ee43-7fbbf5c09aca,sv-2023-spring-openuniv,b99c0bb1-efd4-9b0a-857a-3dc7114e5c19,OIEO.06.046,6.0,Private International Law,"[{'language_code': 'et', 'language_name': 'Est...","[{'code': 'SVOI04', 'name': 'Department of Pri...",2023/2024,regular,...,,,,[{'person_uuid': 'd7a3f19b-d7c7-fbe5-b41b-e5e3...,,"[{'course_week': 1, 'work_type': {'code': 'lec...",Nädalad,1.0,60.0,
1,6ee943ab-a839-a937-76c0-2e0e4daedb8b,,76162416-d608-f48f-ec5d-5c40ce9b320d,FLFI.00.016,15.0,Doctoral Seminar,"[{'language_code': 'et', 'language_name': 'Est...","[{'code': 'HVFI01', 'name': 'Department of Phi...",2023/2024,regular,...,Presentations.,Ask the supervisor.,,[{'person_uuid': '4b6a00ae-35fd-a00e-d38e-0e2f...,,"[{'work_type': {'code': 'colloquium', 'et': 'k...",24-40,1.0,,
2,7fc4f6cf-f011-91e8-4c42-abbd782a4a2a,,31c327d5-2b61-c764-b418-bda22c577265,SVNC.00.179,4.0,Pedagogical Practicum,"[{'language_code': 'et', 'language_name': 'Est...","[{'code': 'SVNC', 'name': 'Narva College', 'co...",2023/2024,practice,...,Põhikooli- ja gümnaasiumi Riiklik õppekava htt...,,,[{'person_uuid': 'e664d700-4a63-b159-794e-d0be...,,"[{'work_type': {'code': 'practice', 'et': 'pra...",24-43,15.0,46.0,
3,d72f2ef7-d264-eceb-b759-a9a66cc27593,,0e7d0b5d-83ea-f260-7e09-c3d59ea9c250,KKSB.05.092,3.0,Practice in the Work Environment,"[{'language_code': 'et', 'language_name': 'Est...","[{'code': 'MVSF', 'name': 'Institute of Sport ...",2022/2023,regular,...,,,,[{'person_uuid': '1ff846ac-79c1-ef64-3910-5131...,,"[{'time': '2023-06-20', 'work_type': {'code': ...",40-52,1.0,80.0,
4,2a69334b-ebec-b332-d5f2-984869620c04,,f465e112-e552-a3d1-5fa6-e26e661b288b,MTAT.03.242,12.0,Bioinformatics Seminar,"[{'language_code': 'et', 'language_name': 'Est...","[{'code': 'LTAT02', 'name': 'Chair of Data Sci...",2023/2024,regular,...,The readings for every topic could be found on...,The readings for every topic could be found on...,,[{'person_uuid': '643ca845-067b-f270-23fd-dafe...,,"[{'work_type': {'code': 'seminar', 'et': 'semi...",24.-39. õppenädalal,5.0,,


In [52]:
# Models already loaded in this kernel, keyed by checkpoint path, so that the
# checkpoint shards are read from disk only once instead of on every call.
_LOADED_MODELS = {}


def get_model(encoder_only=False):
    """Load (or reuse) the causal language model stored under ``shared_dir``.

    The checkpoint at ``<shared_dir>/llama/7B_Vicuna_added/`` is loaded with
    8-bit weights and placed on the device whose index matches the current
    ``Accelerator`` process. The first call does the loading; later calls
    return the same model object.

    Args:
        encoder_only: Currently unused; kept so existing callers keep working.

    Returns:
        The loaded ``AutoModelForCausalLM`` instance.
    """
    path_to_converted_weights = os.path.join(
        shared_dir, "llama/7B_Vicuna_added/")

    if path_to_converted_weights not in _LOADED_MODELS:
        config = AutoConfig.from_pretrained(path_to_converted_weights)
        config.max_position_embeddings = 1024

        _LOADED_MODELS[path_to_converted_weights] = AutoModelForCausalLM.from_pretrained(
            path_to_converted_weights,
            config=config,
            # NOTE(review): trust_remote_code runs Python shipped with the
            # checkpoint — keep it only for checkpoints you control.
            trust_remote_code=True,
            load_in_8bit=True,
            device_map={"": Accelerator().process_index},
        )

    return _LOADED_MODELS[path_to_converted_weights]

In [53]:
# Load the model and print its module structure as a quick sanity check.
model = get_model()
print(model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:24<00:00, 12.38s/it]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSN




In [40]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from accelerate import Accelerator
import time

import os

class LlamaWrapperModel(BaseLanguageModel):
    """LangChain language-model wrapper around the local model from ``get_model``."""

    def generate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
                        callbacks: Callbacks = None) -> LLMResult:
        """Generate a completion for every prompt.

        Args:
            prompts: Prompts to complete; each is rendered with ``to_string()``.
            stop: Optional stop sequences; each completion is cut at the first
                occurrence of any of them.
            callbacks: Accepted for interface compatibility; not used.

        Returns:
            An ``LLMResult`` with one single-item generation list per prompt.
            Each generation holds only the newly generated text, without the
            prompt echoed in front of it.
        """
        model = get_model()
        PATH_TO_CONVERTED_TOKENIZER = os.path.join(
            shared_dir, "llama/7B_converted/")
        tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        generations = []
        with torch.no_grad():
            for prompt_value in prompts:
                prompt = prompt_value.to_string()

                print("Tokenizing...")
                s = time.time()
                inputs = tokenizer(prompt, return_tensors="pt")
                # Put the ids on the model's own device; there is no global `device`.
                input_ids = inputs.input_ids.to(model.device)
                e1 = time.time()
                print("Time to tokenize: ", time.strftime(
                    '%H:%M:%S', time.gmtime(e1 - s)))

                print("Generating...")
                generate_ids = model.generate(
                    input_ids=input_ids, max_length=5000)  # max_length = max_new_tokens + prompt_length
                e2 = time.time()
                print("Time to generate: ", time.strftime(
                    '%H:%M:%S', time.gmtime(e2 - e1)))

                print("Decoding...")
                # generate() returns prompt + continuation; keep only the continuation.
                completion_ids = generate_ids[:, input_ids.shape[1]:]
                text_result = tokenizer.batch_decode(
                    completion_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
                e3 = time.time()
                print("Time to decode: ", time.strftime(
                    '%H:%M:%S', time.gmtime(e3 - e2)))

                # Honour the stop sequences by cutting at the earliest match.
                for stop_sequence in stop or []:
                    cut = text_result.find(stop_sequence)
                    if cut != -1:
                        text_result = text_result[:cut]

                generations.append([Generation(text=text_result)])

        return LLMResult(generations=generations)

    async def agenerate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
                               callbacks: Callbacks = None) -> LLMResult:
        """Async variant; runs the synchronous generation (blocking) and returns its result."""
        return self.generate_prompt(prompts, stop=stop, callbacks=callbacks)


In [None]:
# NOTE: distilbert-base-uncased embeddings have a dimensionality of 768 (OpenAIEmbeddings: 1536),
# so an index built with one model cannot be queried with the other
class DistilbertEmbeddings(Embeddings):
    """Embed text with the first-token hidden state of a DistilBERT model.

    The tokenizer and model are loaded lazily on first use and then reused,
    instead of being re-created on every call.
    """

    def __init__(self, model_name: str = 'distilbert-base-uncased',
                 batch_size: int = 32, max_length: int = 512):
        """Configure the embedder; nothing is loaded until the first embed call.

        Args:
            model_name: Hugging Face model identifier to load.
            batch_size: Number of texts sent through the model per forward pass.
            max_length: Token limit per text; longer texts are truncated.
        """
        self.model_name = model_name
        self.batch_size = batch_size
        self.max_length = max_length
        self._tokenizer = None
        self._model = None

    def _ensure_loaded(self) -> None:
        """Load the tokenizer and model once."""
        if self._model is None:
            self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self._model = AutoModel.from_pretrained(self.model_name)
            self._model.eval()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts.

        Args:
            texts: The texts to embed.

        Returns:
            One embedding (a plain list of floats) per text, in input order.
            An empty input yields an empty list.
        """
        self._ensure_loaded()
        embeddings: List[List[float]] = []
        for start in range(0, len(texts), self.batch_size):
            batch = list(texts[start:start + self.batch_size])
            # Let the tokenizer pad and truncate so it also produces the
            # attention mask that keeps padding out of the attention computation.
            encoded = self._tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            )
            with torch.no_grad():
                last_hidden_states = self._model(**encoded)[0]
            # The hidden state of the first token serves as the text embedding.
            embeddings.extend(last_hidden_states[:, 0, :].tolist())
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query text with the same procedure as documents."""
        return self.embed_documents([text])[0]

In [50]:
!ls /gpfs/space/projects/stud_ml_22/NLP/data/chroma

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
chroma-collections.parquet  chroma-embeddings.parquet  index


In [42]:
print(f"{shared_dir}/data/chroma")

/gpfs/space/projects/stud_ml_22/NLP/data/chroma


In [43]:
# Build documents from the course table: the listed columns become the text,
# the remaining scalar columns become metadata.
loader = DataFrameLoader(df, ["title_en", "overview_objectives",
                              "overview_learning_outcomes", "overview_description.en"])
# NOTE(review): the loader joins columns with "\n"; confirm the splitter's
# separator actually breaks these documents into ~1000-character chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# embeddings = OpenAIEmbeddings()
embeddings = DistilbertEmbeddings()

# this will create the chroma embedding database!!!
documents = loader.load()
texts = text_splitter.split_documents(documents)
db = Chroma.from_documents(texts, embeddings, persist_directory=f"{shared_dir}/data/chroma_distilbert")
# Write the index to disk now so that it can be reloaded from persist_directory later.
db.persist()

In [44]:
# llm = OpenAI(temperature=0.9)
# llm = MyLanguageModel()
llm = LlamaWrapperModel()

# Reload the index that was built with the current `embeddings` model. The
# directory must match the one the index was written to: an index created with
# a different embedding model has a different vector dimensionality.
db = Chroma(persist_directory=f"{shared_dir}/data/chroma_distilbert", embedding_function=embeddings)  # load from disk
retriever = db.as_retriever()
# "stuff" puts all retrieved documents into a single prompt for the LLM.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)


In [13]:
# Ask the retrieval-QA chain a sample question and print its answer.
query = "What's Jaak Vilo's last name?"
print(qa.run(query))

Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.74s/it]


initialized model: LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_la