In [1]:
# WARNING: destructive. Permanently deletes everything under the chroma_distilbert directory,
# which is the same path later passed as persist_directory when the Chroma store is built.
!rm -rf /gpfs/space/projects/stud_ml_22/NLP/data/chroma_distilbert/*

In [2]:
# List the contents of the llama_langchain project directory.
!ls /gpfs/space/projects/stud_ml_22/NLP/llama_langchain

data_exploration.py  main.py  requirements.txt


In [3]:
import torch
import numpy as np
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModel
from accelerate import Accelerator
import time

from pydantic import BaseModel, Extra, Field, root_validator
from typing import Any, List, Optional, Dict, Sequence


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Report whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

False

In [5]:
# Root of the shared project storage; the model weights, tokenizer, course CSV and
# Chroma persistence paths used in this notebook are all built relative to it.
shared_dir = '/gpfs/space/projects/stud_ml_22/NLP'

In [6]:
# https://python.langchain.com/en/latest/getting_started/getting_started.html
# https://python.langchain.com/en/latest/use_cases/question_answering.html
# https://python.langchain.com/en/latest/modules/indexes/getting_started.html
# https://python.langchain.com/en/latest/use_cases/question_answering/semantic-search-over-chat.html

from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import Callbacks
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings
# from langchain.llms import OpenAI
from langchain.schema import Generation
from langchain.schema import PromptValue, LLMResult
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma


# langchain.document_loaders.DataFrameLoader has quite limited functionality, so a custom loader is defined here.
class DataFrameLoader(BaseLoader):
    """Turn each row of a pandas DataFrame into one LangChain ``Document``.

    The values of ``page_content_columns`` are concatenated (one per line) to
    form the document text; the remaining columns become metadata, restricted
    to ``str``, ``int`` and ``float`` values.
    """

    def __init__(self, data_frame: Any, page_content_columns: List[str]):
        """Store the frame and the names of the columns that supply the text.

        Raises:
            ValueError: if ``data_frame`` is not a ``pd.DataFrame``.
        """
        if not isinstance(data_frame, pd.DataFrame):
            raise ValueError(
                f"Expected data_frame to be a pd.DataFrame, got {type(data_frame)}"
            )
        self.data_frame = data_frame
        self.page_content_columns = page_content_columns

    def load(self) -> List[Document]:
        """Build and return one ``Document`` per DataFrame row."""
        documents = []
        for i, row in self.data_frame.iterrows():
            # Collect the text fragments first and join them once at the end.
            pieces = []
            for col in self.page_content_columns:
                data = row[col]
                if isinstance(data, str):
                    pieces.append(data + "\n")
                elif isinstance(data, list):
                    pieces.append("".join(data) + "\n")
                else:
                    # Anything that is neither a string nor a list (e.g. NaN) is reported and skipped.
                    print(f"[IGNORED] [{i}] [{col}] {data}")

            # Everything that is not part of the text is a metadata candidate.
            candidates = row.to_dict()
            for col in self.page_content_columns:
                candidates.pop(col)

            # Metadata values may only be str, int, or float; other types are dropped.
            kept = {
                key: value
                for key, value in candidates.items()
                if isinstance(value, (str, int, float))
            }

            documents.append(Document(page_content="".join(pieces), metadata=kept))
        return documents


# class MyLanguageModel(BaseLanguageModel):
#     def generate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
#                         callbacks: Callbacks = None) -> LLMResult:
#         generation = Generation(text="Hello World!")
#         result = LLMResult(generations=[[generation]])
#         return result

#     async def agenerate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
#                                callbacks: Callbacks = None) -> LLMResult:
#         pass  # "whatever dude"


# # NOTE: the OpenAIEmbeddings embeddings have the dimensionality of 1536
# class MyEmbeddings(Embeddings):
#     def embed_documents(self, texts: List[str]) -> List[List[float]]:
#         return [[1.0] * 1536, [2.0] * 1536]

#     def embed_query(self, text: str) -> List[float]:
#         return [1.0] * 1536


Could not import azure.core python package.


In [7]:
def get_model(encoder_only = False):
    """Load the converted LLaMA/Vicuna causal language model from shared storage.

    The checkpoint is read from ``<shared_dir>/llama/7B_Vicuna_added/`` with
    ``max_position_embeddings`` overridden to 1024.

    When CUDA is available the model is loaded in 8-bit and placed on the
    device index of the current Accelerate process. 8-bit loading needs a GPU,
    so without CUDA the model is loaded unquantized with the library's default
    (CPU) placement instead of requesting a GPU that does not exist.

    Args:
        encoder_only: Currently unused; kept so existing callers keep working.

    Returns:
        The model returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    weights_path = os.path.join(shared_dir, "llama/7B_Vicuna_added/")

    config = AutoConfig.from_pretrained(weights_path)
    config.max_position_embeddings = 1024

    load_kwargs = {
        "config": config,
        "trust_remote_code": True,
    }
    if torch.cuda.is_available():
        # Quantized loading and the GPU device map only make sense with a GPU.
        load_kwargs["load_in_8bit"] = True
        load_kwargs["device_map"] = {"": Accelerator().process_index}

    return AutoModelForCausalLM.from_pretrained(weights_path, **load_kwargs)

In [8]:
from langchain.schema import BaseMessage, LLMResult, PromptValue, get_buffer_string

class LlamaWrapperModel(BaseLanguageModel):
    """LangChain language-model adapter around the locally stored LLaMA/Vicuna model.

    Only the synchronous text path is implemented (``generate_prompt`` and
    ``predict``). ``stop`` sequences and ``callbacks`` are accepted for
    interface compatibility but are not applied.
    """

    # Causal LM used for generation; loaded with get_model() when not supplied.
    model: Any
    # Tokenizer matching the model; loaded once at construction, not on every call.
    tokenizer: Any

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Fill in ``model`` and ``tokenizer`` when the caller did not provide them."""
        if values.get('model') is None:
            values['model'] = get_model()
        if values.get('tokenizer') is None:
            values['tokenizer'] = AutoTokenizer.from_pretrained(
                os.path.join(shared_dir, "llama/7B_converted/"))
        return values

    def _generate_text(self, prompt: str) -> str:
        """Generate a completion for ``prompt`` and return only the new text.

        The prompt tokens echoed back by ``generate`` are removed before
        decoding, so the returned string is the completion alone.
        """
        with torch.no_grad():
            print("Tokenizing...")
            s = time.time()
            inputs = self.tokenizer(prompt, return_tensors="pt")
            e1 = time.time()
            print("Time to tokenize: ", time.strftime(
                '%H:%M:%S', time.gmtime(e1 - s)))

            print("Generating...")
            # Use the model's own device rather than a notebook-level global.
            input_ids = inputs.input_ids.to(self.model.device)
            generate_ids = self.model.generate(
                input_ids=input_ids, max_length=5000)  # max_length = max_new_tokens + prompt_length
            e2 = time.time()
            print("Time to generate: ", time.strftime(
                '%H:%M:%S', time.gmtime(e2 - e1)))

            print("Decoding...")
            # Causal LMs return prompt + continuation; keep only the continuation.
            new_token_ids = generate_ids[:, input_ids.shape[1]:]
            text_result = self.tokenizer.batch_decode(
                new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            e3 = time.time()
            print("Time to decode: ", time.strftime(
                '%H:%M:%S', time.gmtime(e3 - e2)))

        return text_result

    def predict(self, text: str, *, stop: Optional[Sequence[str]] = None) -> str:
        """Return the model's completion for ``text`` (``stop`` is not applied)."""
        return self._generate_text(text)

    def predict_messages(
        self, messages: List[BaseMessage], *, stop: Optional[Sequence[str]] = None
    ) -> BaseMessage:
        """Chat-style prediction is not supported by this wrapper.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "LlamaWrapperModel does not support predict_messages")

    def generate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
                        callbacks: Callbacks = None) -> LLMResult:
        """Generate one completion per prompt.

        Args:
            prompts: Prompt values; each is converted with ``to_string()``.
            stop: Accepted for interface compatibility; not applied.
            callbacks: Accepted for interface compatibility; not used.

        Returns:
            An ``LLMResult`` whose ``generations`` holds one single-element
            list per input prompt, in input order.
        """
        generations = [
            [Generation(text=self._generate_text(prompt.to_string()))]
            for prompt in prompts
        ]
        return LLMResult(generations=generations)

    async def agenerate_prompt(self, prompts: List[PromptValue], stop: Optional[List[str]] = None,
                               callbacks: Callbacks = None) -> LLMResult:
        """Asynchronous generation is not supported by this wrapper.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "LlamaWrapperModel does not support asynchronous generation")


In [9]:
# NOTE: the OpenAIEmbeddings embeddings have the dimensionality of 1536

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class DistilbertEmbeddings(Embeddings):
    """LangChain embeddings backed by ``distilbert-base-uncased``.

    Each text is represented by the last-layer hidden state at the first token
    position. Inputs are truncated to ``max_length`` tokens, padded only to the
    longest sequence in their batch, and passed with an attention mask so that
    padding does not influence the embedding.
    """

    def __init__(self, batch_size: int = 32, max_length: int = 512):
        """Load model and tokenizer and move the model to the notebook's ``device``.

        Args:
            batch_size: Number of texts sent through the model at once.
            max_length: Maximum number of tokens kept per text.
        """
        self.model = AutoModel.from_pretrained('distilbert-base-uncased').to(device)
        self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
        self.batch_size = batch_size
        self.max_length = max_length

    def _embed_batch(self, texts: List[str]) -> List[List[float]]:
        """Embed a non-empty list of texts in a single forward pass."""
        encoded = self.tokenizer(
            texts,
            add_special_tokens=True,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        with torch.no_grad():
            outputs = self.model(
                input_ids=encoded["input_ids"].to(device),
                attention_mask=encoded["attention_mask"].to(device),
            )
        # outputs[0] is the last hidden state; position 0 is the first token.
        return outputs[0][:, 0, :].cpu().numpy().tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per text, in input order.

        Every text is embedded exactly once, whatever the relation between
        ``len(texts)`` and the batch size; an empty input yields an empty list.
        """
        texts = list(texts)
        embds = []
        for start in range(0, len(texts), self.batch_size):
            embds.extend(self._embed_batch(texts[start:start + self.batch_size]))
        return embds

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a single query string."""
        return self._embed_batch([text])[0]

# Test the model

In [10]:
#df = pd.read_pickle(f'{shared_dir}/data/course_info.pkl')

In [11]:
# Load the course table from shared storage; later cells read its title_en and all_course_info columns.
df = pd.read_csv(f'{shared_dir}/data/courses_info_all.csv')

In [12]:
# Full course description of the first row titled "Bioinformatics Seminar".
is_bio_seminar = df['title_en'] == "Bioinformatics Seminar"
q_bio = df.loc[is_bio_seminar, 'all_course_info'].values[0]

In [13]:
# Only title_en supplies the document text; the other columns are candidates for metadata.
loader = DataFrameLoader(df, ["title_en"])
# Character-based splitter configured for 1000-character chunks without overlap.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Instantiating the embedder loads the distilbert-base-uncased model and tokenizer.
embeddings = DistilbertEmbeddings()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Build one Document per DataFrame row, then split the documents into chunks.
documents = loader.load()
texts = text_splitter.split_documents(documents)

[IGNORED] [2853] [title_en] nan
[IGNORED] [2857] [title_en] nan


In [15]:
# Creates the Chroma embedding database: embeds every chunk with `embeddings` and
# persists the store under <shared_dir>/data/chroma_distilbert.

db = Chroma.from_documents(texts, embeddings, persist_directory=f"{shared_dir}/data/chroma_distilbert")

Using embedded DuckDB with persistence: data will be stored in: /gpfs/space/projects/stud_ml_22/NLP/data/chroma_distilbert


768
2


In [16]:
# db.similarity_search_with_score("Special Seminar in Machine Learning")

In [17]:
# Constructing the wrapper runs its validator, which loads the language model via get_model().
llm = LlamaWrapperModel()

In [32]:
from langchain.prompts import PromptTemplate

# Prompt sent to the LLM: retrieved context first, then the question.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:"""

llm_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
# Per-document template, later assigned as the chain's document_prompt.
# NOTE(review): presumably all_course_info is filled from each document's metadata; confirm.
chain_prompt = PromptTemplate(
    input_variables=["page_content", "all_course_info"], template="{page_content}. {all_course_info}"
)

In [30]:
# NOTE: the vector store is not reloaded from disk here. The author reports an
# embedding-dimension initialization bug when doing so, so the `db` object built earlier is reused.
# db = Chroma(persist_directory=f"{shared_dir}/data/chroma", embedding_function=embeddings)  # load from disk
retriever = db.as_retriever()
chain_type_kwargs = {"prompt": llm_prompt}

# "stuff" chain type, with llm_prompt as the LLM prompt.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, chain_type_kwargs=chain_type_kwargs)
# Override how each retrieved document is rendered before it is inserted into the prompt.
qa.combine_documents_chain.document_prompt = chain_prompt

In [None]:
# Run a single question through the retrieval QA chain and print whatever it returns.
query = "What is the purpose of Bioinformatics Seminar?"
print(qa(query))