In [1]:
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import numpy as np
import faiss
import json
import os
import gc
import torch
# Run a garbage-collection pass and release cached CUDA memory before loading the model.
gc.collect()
torch.cuda.empty_cache()

# Module-level tokenizer and encoder; process_embedding_batch reads both as globals.
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-base')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-base')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Text chunk function
def retrieve_file_text_chunks(file_path):
    """Load and return the JSON content stored at ``file_path``.

    The file is opened as UTF-8 text and parsed with ``json.load``; whatever
    object the JSON document encodes is returned unchanged.
    """
    with open(file_path, "r", encoding="utf-8") as chunk_file:
        return json.load(chunk_file)

# Embedding Functions
def embedding_engine_helper_function(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Average the hidden states along dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are filled with 0.0 before
    summing, and the sum is divided by the per-row count of mask entries.
    Assumes ``last_hidden_states`` is laid out as (batch, tokens, hidden) and
    ``attention_mask`` as (batch, tokens) — TODO confirm against the caller.
    """
    # True wherever the mask is 0; the trailing axis lets it broadcast over the hidden dimension.
    padding_positions = ~attention_mask.unsqueeze(-1).bool()
    masked_states = last_hidden_states.masked_fill(padding_positions, 0.0)

    summed_states = masked_states.sum(dim=1)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return summed_states / token_counts

def process_embedding_batch(batch):
    """Embed one batch of strings with the module-level tokenizer and model.

    Each string is prefixed with "query: ", tokenized (truncated to 512
    tokens, padded to the longest item in the batch), passed through the
    model, mean-pooled with ``embedding_engine_helper_function`` and
    L2-normalized along dim 1.

    Args:
        batch: iterable of strings to embed.

    Returns:
        A numpy array with one L2-normalized embedding row per input string.
    """
    # NOTE(review): the E5 model card recommends the "passage: " prefix for
    # documents that are being indexed for asymmetric retrieval. "query: " is
    # kept so new vectors stay comparable with indexes already written to
    # disk — confirm before changing it.
    input_text = ["query: " + s for s in batch]
    batch_dict = tokenizer(input_text, max_length=512, padding=True, truncation=True, return_tensors='pt')

    # Inference only: without no_grad the forward pass builds an autograd
    # graph and keeps intermediate activations alive for the whole batch.
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = embedding_engine_helper_function(outputs.last_hidden_state, batch_dict['attention_mask'])
        embeddings_norm = F.normalize(embeddings, p=2, dim=1)

    # .cpu() is a no-op for CPU tensors and keeps the numpy conversion valid
    # if the tensors ever live on another device.
    return embeddings_norm.cpu().numpy()

def embedding_engine(input, batch_size=50):
    """Embed a sequence of strings in fixed-size batches.

    Args:
        input: sequence of strings; must support ``len()`` and slicing.
        batch_size: number of strings passed to ``process_embedding_batch``
            per call. Defaults to 50.

    Returns:
        A 2-D numpy array with the batch results concatenated along axis 0.
        For empty input an empty ``(0, 768)`` float32 array is returned
        instead of failing inside ``np.concatenate``.

    Raises:
        ValueError: if ``batch_size`` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    if len(input) == 0:
        # np.concatenate([]) raises; 768 matches the dimension of the faiss
        # indexes this notebook builds from these embeddings.
        return np.empty((0, 768), dtype=np.float32)

    embeddings = []
    for start in range(0, len(input), batch_size):
        embeddings.append(process_embedding_batch(input[start:start + batch_size]))

        # Collect garbage and release cached CUDA memory after every batch.
        gc.collect()
        torch.cuda.empty_cache()

    return np.concatenate(embeddings, axis=0)

# Process documents functions
def generate_index_for_documents(base_path):
    """Build one faiss index file per document listed in ``schema.json``.

    For every schema entry the chunk file
    ``{base_path}/pdfs/{path_id}/{path_id}.json`` is loaded, embedded with
    ``embedding_engine`` and written to the sibling ``.index`` file. Entries
    whose ``.index`` file already exists are skipped.
    """
    with open(f"{base_path}/schema.json", "r", encoding="utf-8") as schema_handle:
        entries = json.load(schema_handle)

    for entry in entries:
        doc_id = entry["path_id"]
        chunks_path = f"{base_path}/pdfs/{doc_id}/{doc_id}.json"
        index_path = f"{base_path}/pdfs/{doc_id}/{doc_id}.index"

        # Guard clause: nothing to do when the index is already on disk.
        if os.path.exists(index_path):
            print(f"❌Index already exists for: {doc_id}")
            continue

        print(f"Starting file: {entry['title']}")

        chunks = retrieve_file_text_chunks(chunks_path)
        print(f"Chunks split: {len(chunks)}")

        vectors = embedding_engine(chunks)

        print("Adding vectors to index.")
        flat_index = faiss.IndexFlatIP(768)
        flat_index.add(np.array(vectors).astype('float32'))
        faiss.write_index(flat_index, index_path)
        print(f"✅Processed Source: {entry['title']}")

        # Drop the per-document objects before moving to the next entry.
        del vectors, flat_index
        gc.collect()

def combine_indexes_for_subsections(base_path, section_name):
    """Merge the per-document faiss indexes of a section into one index.

    Reads ``{base_path}/schema.json``, loads each document's
    ``{base_path}/pdfs/{path_id}/{path_id}.index``, appends its vectors to a
    single inner-product index and records on each schema entry the inclusive
    ``start``/``end`` row range its vectors occupy in the combined index.

    Writes ``{base_path}/{section_name}.index`` and
    ``{base_path}/schema_with_indices.json``. If the combined index already
    exists, nothing is read or written and the function returns immediately.

    Args:
        base_path: directory holding ``schema.json`` and the ``pdfs`` tree.
        section_name: file stem for the combined index.
    """
    main_index_path = f"{base_path}/{section_name}.index"

    # Check once, up front. Doing this per document would skip every entry yet
    # still reach the writes at the end, replacing the existing combined index
    # and schema_with_indices.json with empty content.
    if os.path.exists(main_index_path):
        print(f"❌Index already exists for: {section_name}")
        return

    with open(f"{base_path}/schema.json", "r", encoding="utf-8") as file:
        schema_file = json.load(file)

    main_index = faiss.IndexFlatIP(768)

    new_schema_file = []
    for schema in schema_file:
        path_id = schema["path_id"]
        index_path = f"{base_path}/pdfs/{path_id}/{path_id}.index"

        sub_index = faiss.read_index(index_path)
        vectors = sub_index.reconstruct_n(0, sub_index.ntotal)

        # Row range is computed before the add, while ntotal still points at
        # the first free row of the combined index.
        schema["start"] = main_index.ntotal
        schema["end"] = main_index.ntotal + sub_index.ntotal - 1
        new_schema_file.append(schema)

        main_index.add(vectors)

        print(f"Vectors: {sub_index.ntotal}")
        print(f"✅Processed Source: {schema['title']}")

    faiss.write_index(main_index, main_index_path)
    # Explicit encoding keeps the output consistent with how schema.json is read.
    with open(f"{base_path}/schema_with_indices.json", 'w', encoding="utf-8") as f:
        json.dump(new_schema_file, f, indent=4)


In [4]:
# Build the per-document indexes for each section, federal sources first, then state.
for section_path in (
    "federal/leyes_federales",
    "federal/reglamentos_federales",
    "state/leyes_estatales",
    "state/reglamentos_estatales",
    "state/reglamentos_municipales/monterrey",
):
    generate_index_for_documents(section_path)

❌Index already exists for: 4c6edad2-5b2d-45eb-b1db-b9d2a35a3614
❌Index already exists for: ce8f0109-7277-44e7-8cee-9dc5ee7dff5c
❌Index already exists for: 95f1de2f-100d-4ca2-9458-49f03c2095f8
❌Index already exists for: f7d1b1d7-e6e8-49ca-9b30-8fa8fbfad6a3
❌Index already exists for: 08561a2f-225a-45c0-950e-95f12eaae905
❌Index already exists for: a7f8054e-f34e-4f0a-ba25-8522d316b512
❌Index already exists for: 20bea2d4-76f6-4312-8000-59fae0bf82ea
❌Index already exists for: d59cbc71-8e88-4482-9a24-7933e1052841
❌Index already exists for: 61af3b55-b43c-445d-a5ce-0bfb249a71aa
❌Index already exists for: f725f488-4f24-4762-96a6-780c4c8b4573
❌Index already exists for: 899c59da-1d5b-49f3-8156-7977b58e252e
❌Index already exists for: 6538bc3d-435f-4d5f-8f06-b3dcbc6da3cf
❌Index already exists for: a5e0447c-3116-4b8f-9cfe-39462445da91
❌Index already exists for: 75d84b6b-1d6e-4d7a-b603-74f85bc574b2
❌Index already exists for: a03f3afc-9fa9-4271-bcf2-0a53f8b92b83
❌Index already exists for: f67b53fa-c820

In [5]:
# Merge the per-document indexes of each section into one combined index per section.
for section_path, section_name in (
    ("federal/leyes_federales", "leyes_federales"),
    ("federal/reglamentos_federales", "reglamentos_federales"),
    ("state/leyes_estatales", "leyes_estatales"),
    ("state/reglamentos_estatales", "reglamentos_estatales"),
    ("state/reglamentos_municipales/monterrey", "monterrey"),
):
    combine_indexes_for_subsections(section_path, section_name)

Vectors: 60
✅Processed Source: Ley del Instituto Nacional de las Mujeres
Vectors: 43
✅Processed Source: Ley Federal de Austeridad Republicana
Vectors: 150
✅Processed Source: Ley General de Protección Civil
Vectors: 275
✅Processed Source: Ley para Regular las Agrupaciones Financieras
Vectors: 147
✅Processed Source: Ley Federal de Armas de Fuego y Explosivos
Vectors: 268
✅Processed Source: Ley Federal de Transparencia y Acceso a la Información Pública
Vectors: 50
✅Processed Source: Ley de Fomento para la Lectura y el Libro
Vectors: 250
✅Processed Source: Ley de Desarrollo Rural Sustentable
Vectors: 45
✅Processed Source: Ley Federal para el Control de Precursores Químicos, Productos Químicos Esenciales y...
Vectors: 150
✅Processed Source: Ley Federal para la Administración y Enajenación de Bienes del Sector Público
Vectors: 60
✅Processed Source: Ley Sobre Delitos de Imprenta
Vectors: 204
✅Processed Source: Ley Federal de Competencia Económica
Vectors: 40
✅Processed Source: Ley Federal de 

In [None]:
def delete_json_files(folder):
    """Recursively delete ``.index`` and ``.npy`` files found under ``folder``.

    Files whose name contains "schema" are kept. Despite the function name,
    files with other extensions (including ``.json``) are never removed. The
    path of every deleted file is printed.
    """
    for current_dir, _subdirs, filenames in os.walk(folder):
        for filename in filenames:
            if "schema" in filename:
                continue
            if not filename.endswith(('.index', '.npy')):
                continue

            target = os.path.join(current_dir, filename)
            os.remove(target)
            print(f'Removed file: {target}')

# delete_json_files("federal/leyes_federales")
# delete_json_files("federal/reglamentos_federales")

# delete_json_files("state/leyes_estatales")
# delete_json_files("state/reglamentos_estatales")
# delete_json_files("state/reglamentos_municipales/monterrey")