# Generate Embeddings for text
Author: Cristian Velandia

Creation Date: 2024-03-03

Create embeddings for the vector DB using Metas Llama cpp quntize embeddings model

In [1]:
from langchain.embeddings import LlamaCppEmbeddings
from llama_cpp import Llama
import pandas as pd
from huggingface_hub import hf_hub_download

  from .autonotebook import tqdm as notebook_tqdm


### Load Dataset

In [2]:
data_path = r"D:\Documents\GitHub\knowledge_pal_assistant\2_outputs\chunks.parquet"
chunks = pd.read_parquet(data_path, engine = "pyarrow")

In [3]:
chunks.head()

Unnamed: 0,id,page_content,metadata,tokens,nostopw_page_content
0,0-0,# AWS::Events::Rule SageMakerPipelineParameter...,{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, AWS, :, :Events, :, :Rule, SageMakerPipeli...",# AWS : :Events : :Rule SageMakerPipelineParam...
1,0-1,"## Syntax<a name=""aws-properties-events-rule-s...",{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, #, Syntax, <, a, name=, '', aws-properties...",# # Syntax < name= '' aws-properties-events-ru...
2,0-2,"## Properties<a name=""aws-properties-events-ru...",{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, #, Properties, <, a, name=, '', aws-proper...",# # Properties < name= '' aws-properties-event...
3,1-0,# Automating Amazon SageMaker with Amazon Even...,{'Header 1': 'Automating Amazon SageMaker with...,"[#, Automating, Amazon, SageMaker, with, Amazo...",# Automating Amazon SageMaker Amazon EventBrid...
4,1-1,"## Training job state change<a name=""eventbrid...",{'Header 1': 'Automating Amazon SageMaker with...,"[#, #, Training, job, state, change, <, a, nam...",# # Training job state change < name= '' event...


In [4]:
chunks.shape

(1046, 5)

In [5]:
chunks["token_length"] = chunks["tokens"].apply(lambda x: len(x))
chunks.reset_index(drop = False, inplace = True)

In [6]:
chunks["token_length"].value_counts().sort_index(ascending = False)

token_length
5169     1
2282     1
2027     1
2007     1
1921     1
        ..
31      17
30      10
29      14
28       1
27       1
Name: count, Length: 373, dtype: int64

### Setup LLama Embeddings model

In [7]:
models_folder = ".\\hf_models"
model_name = "TheBloke/Llama-2-7B-Chat-GGML"
model_filename = "llama-2-7b-chat.ggmlv3.q4_K_M.bin"

In [None]:
#Download the model if necessary
file_path = hf_hub_download(repo_id = model_name, filename = model_filename, local_dir = models_folder,)

In [8]:
llama_model_path = f"{models_folder}\\{model_filename}"
embeddings = LlamaCppEmbeddings(model_path = llama_model_path, n_ctx = 4096, n_gpu_layers = 5) # Instance model with GPU n_gpu_layers = 30 works slower than just processor, n_gpu_layers = 5 optimal 

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 | 


### Generate Embeddings

In [None]:
documents = chunks[chunks["token_length"] <= 4096]["page_content"].to_list()
len(documents)

In [None]:
test_string = documents[11]
embs = embeddings.embed_query(test_string)

In [None]:
len(embs)

In [9]:
save_folder = "D:\\Documents\GitHub\\knowledge_pal_assistant\\2_outputs\\"
corpus_size = chunks.shape[0]
batch_size = 50
counter = 1
dmb_df = pd.DataFrame()

while corpus_size > 0:
    print(corpus_size)

    batch = chunks[(chunks["index"] >= batch_size*(counter-1)) & (chunks["index"] < batch_size*counter)][["id", "page_content"]]
    documents = batch["page_content"].to_list()

    # Embed texts
    aws_embeddings = embeddings.embed_documents(documents)

    # Checkpoint
    batch_ids = batch["id"].tolist()
    dmb_df = pd.concat([dmb_df, pd.DataFrame.from_dict(dict(zip(batch_ids, [[e] for e in aws_embeddings])), orient='index', columns=["vect"])])
    vectorized_chunks = chunks.merge(dmb_df.reset_index(drop=False), how="inner", left_on="id", right_on="index")
    vectorized_chunks.to_parquet(save_folder + "vectors_llama.parquet", index = False, engine = "pyarrow", compression= "brotli")

    # Control Parameters
    corpus_size -= batch.shape[0]
    counter += 1   

1046
996
946
896
846
796
746
696
646
596
546
496
446
396
346
296
246


ValueError: could not broadcast input array from shape (8,) into shape (0,)

In [11]:
vectorized_chunks.shape

(800, 9)