In [None]:
# !pip install transformers accelerate bitsandbytes sagemaker openai langchain milvus openai xformers pymilvus chromadb==0.5.3 pydantic==1.10.8 sentence_transformers tiktoken fitz frontend tools

In [3]:
import os
import requests
import json
import openai
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize
import torch
import fitz
# import boto3
# import sagemaker
import re
import chromadb
# from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from pymilvus import FieldSchema, MilvusClient,CollectionSchema, DataType, utility,Collection
import tiktoken
from transformers import  AutoTokenizer,AutoModel
from sentence_transformers import SentenceTransformer


In [4]:
def extract_post_comments_from_the_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    posts_pattern = r"Title of the post:(.*?)(?=Title of the post:|\Z)"
    posts = re.findall(posts_pattern, text, re.DOTALL)
    posts_dict = {}
    for post_value in tqdm(posts):
        title_to_body_pattern = r"(.*?)(?=Comment of the post: |\Z)"
        title_to_body = re.findall(title_to_body_pattern, post_value, re.DOTALL)
        try:
            title_to_body = title_to_body[0]
        except Exception as e:
            import pdb;pdb.set_trace()
        list_of_comments = []
        comments_to_title_pattern = r"Comment of the post:(.*?)(?=Title of the post:|\Z)"
        comments = re.findall(comments_to_title_pattern, post_value, re.DOTALL)
        if len(comments) > 0:
            individual_comments_pattern = r"New Comment: (.*?)(?=New Comment: |\Z)"
            individual_comments = re.findall(individual_comments_pattern,comments[0], re.DOTALL)
        list_of_comments = [i for i in individual_comments if len(i) > 0]
        if title_to_body in posts_dict.keys():
            import pdb;pdb.set_trace()
        posts_dict[title_to_body] = list_of_comments
    return posts_dict
with open("post_comment_mapping_final.json","r") as f:
    post_comment_mapping  =json.load(f)
post_id_to_posts_text_mapping = {}
post_ids_to_comment_ids_mapping = {}
comments_ids_to_comment_text_mappings = {}
post_counter = 0
comment_counter = 0
for item,value in post_comment_mapping.items():
    post_id_to_posts_text_mapping[post_counter] = item
    comment_ids_list = []
    for comment_value in value:
        comments_ids_to_comment_text_mappings[comment_counter] = comment_value
        comment_ids_list.append(comment_counter)
        comment_counter = comment_counter + 1
    post_ids_to_comment_ids_mapping[post_counter] = comment_ids_list
    post_counter = post_counter + 1
with open("post_id_to_posts_text_mapping.json", "w") as f:
    json.dump(post_id_to_posts_text_mapping,f)
with open("post_ids_to_comment_ids_mapping.json", "w") as f:
    json.dump(post_ids_to_comment_ids_mapping,f)
with open("comments_ids_to_comment_text_mappings.json", "w") as f:
    json.dump(comments_ids_to_comment_text_mappings,f)

In [5]:
def embed_data_using_open_ai_model(tokenizer,text):
    tokens = tokenizer.encode(text)
    if len(tokens) > 8191:
        tokens = tokens[:8191]
    truncated_text = tokenizer.decode(tokens)
    # openai.api_key =
    response = openai.embeddings.create(
        model="text-embedding-ada-002",
        input=truncated_text,
    )
    embedding = response.data[0].embedding
    return embedding

def embedding_main(embedding_model_type,text,query_embedding_bool):
    if embedding_model_type!="gpt":
        vector_dim = 1024
        vector_linear_directory = f"2_Dense_{vector_dim}"
        model_dir = "dunzhang/stella_en_400M_v5"
        model = AutoModel.from_pretrained(model_dir, trust_remote_code=True,use_memory_efficient_attention=False,unpad_inputs=False).cpu().eval()
        tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        vector_linear = torch.nn.Linear(in_features=model.config.hidden_size, out_features=vector_dim)
        vector_linear_dict = {
            k.replace("linear.", ""): v for k, v in
            torch.load("pytorch_model.bin",map_location=torch.device('cpu')).items()
        }
        vector_linear.load_state_dict(vector_linear_dict)
        vector_linear.cpu()
        embedding = embed_data_using_stella_model(tokenizer,model,text,query_embedding_bool,vector_linear)
    else:
        model_name='text-embedding-ada-002'
        tokenizer = tiktoken.encoding_for_model(model_name)
        embedding = embed_data_using_open_ai_model(tokenizer,text)
        # embedding_size = 1536
        # print (embedding.shape)
    return embedding



In [6]:
def store_data_into_vector_store_chromadb(embedding_method,post_embeddings_list,post_ids_list,comment_embeddings_list,comment_ids_list):
    chroma_client = chromadb.PersistentClient(f'./chroma_sagemaker_{str(embedding_method)}')
    existing_collections = chroma_client.list_collections()
    names_collections = [i.name for i in existing_collections]
    if ("posts" in names_collections):
        posts_collection = chroma_client.get_collection(name="posts")
        chroma_client.delete_collection(name='posts')
    if ("comments" in names_collections):
        comments_collection = chroma_client.get_collection(name="comments")
        chroma_client.delete_collection(name='comments')
    posts_collection = chroma_client.create_collection(name="posts")
    comments_collection = chroma_client.create_collection(name="comments")
    posts_collection.add(embeddings=post_embeddings_list, ids=post_ids_list)
    comments_collection.add(embeddings=comment_embeddings_list, ids=comment_ids_list)
    # data_to_export = {
    # "post_ids": post_ids_list,
    # "post_embeddings": post_embeddings_list,
    # "comment_ids": comment_ids_list,
    # "comment_embeddings": comment_embeddings_list
    # }
    # with open(f'{embedding_method}_chromadb_embeddings_export.json', 'w') as f:
    #     json.dump(data_to_export, f)

def store_data_into_vector_store_milvus(embedding_type,post_embeddings_list,post_ids_list,comment_embeddings_list,comment_ids_list,VECTOR_INDEX_METHOD,EMBEDDING_SIZE):
    # connections.connect("default", host="localhost", port="19530")
    if embedding_type != "gpt":
        embedding_type = "stella"
    client = MilvusClient(f"./{embedding_type}_milvus_demo_{VECTOR_INDEX_METHOD}.db")
    posts_schema = client.create_schema(auto_id=False,enable_dynamic_field=False)
    posts_schema.add_field(field_name="post_id_field", datatype=DataType.INT64, is_primary=True)
    posts_schema.add_field(field_name="post_vector_field", datatype=DataType.FLOAT_VECTOR, dim=EMBEDDING_SIZE)
    comments_schema = client.create_schema(auto_id=False,enable_dynamic_field=False)
    comments_schema.add_field(field_name="comment_id_field", datatype=DataType.INT64, is_primary=True)
    comments_schema.add_field(field_name="comment_vector_field", datatype=DataType.FLOAT_VECTOR, dim=EMBEDDING_SIZE)
    post_collection = client.create_collection(collection_name="posts_collection", schema=posts_schema)
    comment_collection = client.create_collection(collection_name="comments_collection", schema=comments_schema)
    posts_index_params = client.prepare_index_params(collection_name="posts_collection")
    comments_index_params = client.prepare_index_params(collection_name="comments_collection")
    posts_index_params.add_index(field_name='post_vector_field', metric_type="COSINE",index_type=str(VECTOR_INDEX_METHOD),params={"M": 16, "efConstruction": 200})
    if VECTOR_INDEX_METHOD == "HNSW":
        # index_params = {"metric_type": "COSINE", "index_type": VECTOR_INDEX_METHOD, "params": {"M": 16, "efConstruction": 200}}
        posts_index_params.add_index(field_name='post_vector_field', metric_type="COSINE",index_type=str(VECTOR_INDEX_METHOD),params={"M": 16, "efConstruction": 200})
        comments_index_params.add_index(field_name='comment_vector_field', metric_type="COSINE",index_type=str(VECTOR_INDEX_METHOD),params={"M": 16, "efConstruction": 200})
    elif VECTOR_INDEX_METHOD == "IVF_FLAT":
        # index_params = {"metric_type": "COSINE", "index_type": VECTOR_INDEX_METHOD, "params": {"nlist": 128}}
        posts_index_params.add_index(field_name='post_vector_field',metric_type="COSINE",index_type=str(VECTOR_INDEX_METHOD),params={"nlist": 128})
        comments_index_params.add_index(field_name='comment_vector_field',metric_type="COSINE",index_type=str(VECTOR_INDEX_METHOD),params={"nlist": 128})
    elif VECTOR_INDEX_METHOD == "ANNOY":
        # index_params = {"metric_type": "COSINE", "index_type": VECTOR_INDEX_METHOD, "params": {"n_trees": 50}}
        posts_index_params.add_index(field_name='post_vector_field',metric_type="COSINE",index_type=str(VECTOR_INDEX_METHOD),params={"n_trees": 50})
        comments_index_params.add_index(field_name='comment_vector_field',metric_type="COSINE",index_type=str(VECTOR_INDEX_METHOD),params={"n_trees": 50})
    elif VECTOR_INDEX_METHOD =="BIN_FLAT":
        posts_index_params.add_index(field_name='post_vector_field', metric_type="COSINE", index_type=str(VECTOR_INDEX_METHOD), params={})
        comments_index_params.add_index(field_name='comment_vector_field', metric_type="COSINE", index_type=str(VECTOR_INDEX_METHOD), params={})
    elif VECTOR_INDEX_METHOD =="FLAT":
        posts_index_params.add_index(field_name='post_vector_field', metric_type="COSINE", index_type=str(VECTOR_INDEX_METHOD), params={})
        comments_index_params.add_index(field_name='comment_vector_field', metric_type="COSINE", index_type=str(VECTOR_INDEX_METHOD), params={})
    else:
        import pdb;pdb.set_trace()
    client.create_index(collection_name="posts_collection", index_params=posts_index_params)
    client.create_index(collection_name="comments_collection", index_params=comments_index_params)
    client.load_collection(collection_name="posts_collection")
    client.load_collection(collection_name="comments_collection")
    print (len(post_ids_list))
    print (len(post_embeddings_list))
    post_ids_list = [int(i) for i in post_ids_list]
    comment_ids_list = [int(i) for i in comment_ids_list]

    print (type(post_ids_list[0]))
    print (type(post_embeddings_list[0]))
    # Check if each embedding is correctly formatted
    for embedding in post_embeddings_list:
        if len(embedding) != EMBEDDING_SIZE:
            print(f"Embedding of length {len(embedding)} found, expected {EMBEDDING_SIZE}")
            raise ValueError("Incorrect embedding size")

    # Ensure all elements are floats
    for embedding in post_embeddings_list:
        if not all(isinstance(x, float) for x in embedding):
            raise ValueError("Embedding contains non-float values")
    data_posts_insertion = []
    data_comments_insertion = []
    for index_value in range(len(post_ids_list)):
        internal_dict = {}
        internal_dict['post_id_field'] = post_ids_list[index_value]
        internal_dict['post_vector_field'] = post_embeddings_list[index_value]
        data_posts_insertion.append(internal_dict)
    for index_value in range(len(comment_ids_list)):
        internal_dict = {}
        internal_dict['comment_id_field'] = comment_ids_list[index_value]
        internal_dict['comment_vector_field'] = comment_embeddings_list[index_value]
        data_comments_insertion.append(internal_dict)
    client.insert("posts_collection",data=data_posts_insertion)
    client.insert("comments_collection",data=data_comments_insertion)

    data_to_export = {
    "post_ids": post_ids_list,
    "post_embeddings": post_embeddings_list,
    "comment_ids": comment_ids_list,
    "comment_embeddings": comment_embeddings_list
    }
    with open(f'{embedding_type}_milvus_embeddings_export_{VECTOR_INDEX_METHOD}.json', 'w') as f:
        json.dump(data_to_export, f)

def store_data_into_postgres():
    pass
def vector_store_main(embedding_method,embedding_size,post_embeddings_list,post_ids_list,comment_embeddings_list,comment_ids_list,vector_store,
                      vector_index_method):
    if vector_store=="chromadb":
        store_data_into_vector_store_chromadb(embedding_method,post_embeddings_list,post_ids_list,comment_embeddings_list,comment_ids_list)
    elif vector_store=="postgres":
        store_data_into_postgres()
    elif vector_store=="milvus":
        store_data_into_vector_store_milvus(embedding_method,post_embeddings_list,post_ids_list,comment_embeddings_list,comment_ids_list,vector_index_method,
                                            embedding_size)
    else:
        import pdb;pdb.set_trace()



In [None]:
!pip install vllm

In [10]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_id = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
number_gpus = 1
max_model_len = 8192

sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=256)

tokenizer = AutoTokenizer.from_pretrained(model_id)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

prompts = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

llm = LLM(model=model_id, tensor_parallel_size=number_gpus, max_model_len=max_model_len)

outputs = llm.generate(prompts, sampling_params)

generated_text = outputs[0].outputs[0].text
print(generated_text)


ModuleNotFoundError: No module named 'vllm'

In [9]:
model = SentenceTransformer('sentence-transformers/sentence-t5-large')
x = embed_data_using_stella_model(model,"hello how are you")
print (x.shape)



(768,)


In [8]:
def embed_data_using_stella_model(model,text):
    embeddings = model.encode(text)
    return embeddings
    # query_vectors = None
    # if query_embedding_bool:
    #     with torch.no_grad():
    #         input_data = tokenizer(text, padding="longest", truncation=True, max_length=1024, return_tensors="pt")
    #         input_data = {k: v.cpu() for k, v in input_data.items()}
    #         attention_mask = input_data["attention_mask"]
    #         last_hidden_state = model(**input_data)[0]
    #         last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
    #         query_vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    #         query_vectors = normalize(vector_linear(query_vectors).cpu().numpy())
    #     if query_vectors is None:
    #         import pdb;pdb.set_trace()
    #     return query_vectors
    # else:
    #     docs_vectors = None
    #     with torch.no_grad():
    #         # print (len(text))
    #         # print ("docs vector")
    #         input_data = tokenizer(text, padding="longest", truncation=True, max_length=1024, return_tensors="pt")
    #         input_data = {k: v.cpu() for k, v in input_data.items()}
    #         attention_mask = input_data["attention_mask"]
    #         # print (input_data)
    #         last_hidden_state = model(**input_data)[0]
    #         last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
    #         docs_vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    #         docs_vectors = normalize(vector_linear(docs_vectors).cpu().numpy())
    #         # import pdb;pdb.set_trace()
    #     # if docs_vectors is None:
    #     #     import pdb;pdb.set_trace()
    #     return docs_vectors

In [21]:
post_id_to_posts_text_mapping = {k: post_id_to_posts_text_mapping[k] for k in list(post_id_to_posts_text_mapping)[:10]}
comments_ids_to_comment_text_mappings = {k: comments_ids_to_comment_text_mappings[k] for k in list(comments_ids_to_comment_text_mappings)[:10]}
embedding_types = ["sentence-t5-large'"]
# stella_en_400M_v5_gguf
vector_stores = ['chromadb','milvus']
for et in embedding_types:
    print ("et",et)
    post_id_post_embeddings_mapping = {}
    comment_id_comment_embeddings_mapping = {}
    for item,value in post_id_to_posts_text_mapping.items():
        post_embedding = embedding_main(et,value,False)
        post_id_post_embeddings_mapping[item] = post_embedding
    for item,value in comments_ids_to_comment_text_mappings.items():
        comment_embedding = embedding_main(et,value,False)
        comment_id_comment_embeddings_mapping[item] = comment_embedding
    post_embeddings_list = []
    post_embeddings_list = post_embeddings_list +  [v for k,v in post_id_post_embeddings_mapping.items()]
    post_ids_list = list(post_id_post_embeddings_mapping.keys())
    post_ids_list = [str(i) for i in post_ids_list]
    comment_embeddings_list = []
    comment_embeddings_list = comment_embeddings_list + [v for k,v in comment_id_comment_embeddings_mapping.items()]
    comment_ids_list = list(comment_id_comment_embeddings_mapping.keys())
    comment_ids_list = [str(i) for i in comment_ids_list]

    if et == "gpt":
        embedding_size = 1536
    else:
        embedding_size = 1024
    post_ids_list = [str(i) for i in post_ids_list]
    comment_ids_list = [str(i) for i in comment_ids_list]
    vector_store_main(et,embedding_size,post_embeddings_list,post_ids_list,comment_embeddings_list,
                  comment_ids_list,'chromadb',None)
    vector_store_main(et,embedding_size,post_embeddings_list,post_ids_list,comment_embeddings_list,
                  comment_ids_list,'milvus',"FLAT")
    vector_store_main(et,embedding_size,post_embeddings_list,post_ids_list,comment_embeddings_list,
                  comment_ids_list,'milvus',"HNSW")
    # vector_store_main(et,embedding_size,post_embeddings_list,post_ids_list,comment_embeddings_list,
    #               comment_ids_list,'milvus',"BIN_FLAT")






Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


et stella_en_400M_v5_gguf


  torch.load("pytorch_model.bin",map_location=torch.device('cpu')).items()
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  torch.load("pytorch_model.bin",map_location=torch.device('cpu')).items()


10
10
<class 'int'>
<class 'numpy.ndarray'>


ValueError: Embedding contains non-float values

In [34]:
import numpy
for out_index in range(len(post_embeddings_list)):
    for in_index in range(len(post_embeddings_list)):
        if isinstance((post_embeddings_list[out_index][in_index]),numpy.float32):
            print (type(post_embeddings_list[out_index][in_index]))

<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.float32'>
<class 'numpy.fl

In [None]:
!set COMMANDLINE_ARGS=--xformers --reinstall-xformers
set XFORMERS_PACKAGE=xformers==0.0.17

In [None]:
!pip install -U bitsandbytes

In [None]:
# !mkdir my_model_directory
# !curl -o my_model_directory/config.json https://huggingface.co/anuna-mbrown/stella_en_400M_v5_gguf/tree/main/stella_en_400M_v5.gguf/main/config.json
# !curl -o my_model_directory/pytorch_model.bin https://huggingface.co/anuna-mbrown/stella_en_400M_v5_gguf/tree/main/stella_en_400M_v5.gguf/main/pytorch_model.bin
# !curl -o my_model_directory/tokenizer.json https://huggingface.co/anuna-mbrown/stella_en_400M_v5_gguf/tree/main/stella_en_400M_v5.gguf/main/tokenizer.json
# !curl -o my_model_directory/special_tokens_map.json https://huggingface.co/anuna-mbrown/stella_en_400M_v5_gguf/tree/main/stella_en_400M_v5.gguf/main/special_tokens_map.json
# !curl -o my_model_directory/model.safetensors https://huggingface.co/anuna-mbrown/stella_en_400M_v5_gguf/tree/main/stella_en_400M_v5.gguf/main/model.safetensors
# !curl -o my_model_directory/vocab.txt https://huggingface.co/anuna-mbrown/stella_en_400M_v5_gguf/tree/main/stella_en_400M_v5.gguf/main/vocab.txt
# # curl -o my_model_directory/vocab.txt https://huggingface.co/anuna-mbrown/stella_en_400M_v5_gguf/tree/main/stella_en_400M_v5.gguf/main/vocab.txt