## Index Build and Update Script

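The original version of this project only had a script for the initial index build, with no way to add new documents afterwards.  This notebook handles both cases: it builds the index from scratch when none exists, and it updates an existing index with any newly added documents.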

In [1]:
import faiss
import json

import nltk
nltk.download('punkt')  # Download the punkt tokenizer for sentence tokenization

import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import uuid

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hugh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#################################
##########  VARIABLES  ##########
#################################

# Name of the sentence-transformers model used to embed sentences.
index_transformer_model_name = 'sentence-transformers/all-mpnet-base-v2'
# Instantiate the encoder once here; add_document_to_index() reads it as a global.
index_transformer_model = SentenceTransformer(index_transformer_model_name)

### Load the Existing Document Lookups

In [3]:

# All paths are resolved relative to the directory the notebook is run from.
data_dir = os.getcwd()
# Folder that is scanned for source documents.
docs_dir = os.path.join(data_dir, 'data')
# CSV that persists the doc_guid <-> doc_filename lookup table.
document_guid_lookup_csv = os.path.join(data_dir, 'document_guid_lookup.csv')
# In-memory copy of the lookup table; filled in by get_document_guid_lookup().
document_guid_lookup_df = None

def get_document_guid_lookup():
    """Return the document GUID lookup table, loading or creating it on first use.

    The table is cached in the module-level ``document_guid_lookup_df``.  When
    that global is still ``None`` the CSV at ``document_guid_lookup_csv`` is
    read if it exists; otherwise an empty table with the columns ``doc_guid``
    and ``doc_filename`` is created and immediately written to that path.

    Returns:
        pandas.DataFrame: the cached lookup table.
    """
    global document_guid_lookup_df

    # Already loaded earlier in this session: reuse the cached frame.
    if document_guid_lookup_df is not None:
        return document_guid_lookup_df

    if os.path.exists(document_guid_lookup_csv):
        # A lookup file is already on disk, so read it back.
        document_guid_lookup_df = pd.read_csv(document_guid_lookup_csv)
        print(f"CSV file already exists at: {document_guid_lookup_csv}")
        print("Loaded existing DataFrame:")
    else:
        # First run: start from an empty table and persist it straight away.
        document_guid_lookup_df = pd.DataFrame(columns=['doc_guid', 'doc_filename'])
        document_guid_lookup_df.to_csv(document_guid_lookup_csv, index=False)
        print(f"CSV file created at: {document_guid_lookup_csv}")

    return document_guid_lookup_df

# Load the lookup table (or create it on first run) and preview the first rows.
document_guid_lookup_df = get_document_guid_lookup()
document_guid_lookup_df.head()

CSV file already exists at: D:\JupyterPrograms\0-CHAT_GPT\EXPERIMENTS\Sentence Transformers and FAISS\document_guid_lookup.csv
Loaded existing DataFrame:


Unnamed: 0,doc_guid,doc_filename


### Load an existing index or build from scratch

In [4]:
# File names for the persisted FAISS index and its sentence mapping
# (relative paths, so they resolve against the current working directory).
index_name = 'sentences.index'
mapping_name = 'sentence_to_index_mapping.json'
# Both are populated by load_existing_index() further down in this cell.
index = None
sentence_to_index_mapping = None

def add_document_to_index(guid, file_name):
    """Embed every sentence of one document and append it to the FAISS index.

    Reads ``file_name`` from ``docs_dir``, splits it into sentences with NLTK,
    encodes them with ``index_transformer_model`` and adds the vectors to the
    global ``index`` in a single batch.  For each sentence an entry is added to
    the global ``sentence_to_index_mapping``, keyed by the row position the
    vector received in the index.

    Args:
        guid: Identifier of the document; stored under ``'document_index'``.
        file_name: Name of the file inside ``docs_dir``.

    Returns:
        None.  A missing file or a document without sentences is reported and
        skipped; nothing is added in that case.
    """
    file_path = os.path.join(docs_dir, file_name)
    if not os.path.isfile(file_path):
        print(f"Skipping {file_name}: no such file at {file_path}")
        return

    with open(file_path, 'r', encoding='utf-8') as file:
        document = file.read()

    sentences = nltk.sent_tokenize(document)
    if not sentences:
        # Encoding an empty list does not yield a (0, dim) matrix, so bail out
        # before handing anything to FAISS.
        print(f"Skipping {file_name}: no sentences found")
        return

    # FAISS needs a contiguous float32 NumPy matrix.  Asking for NumPy output
    # (instead of a torch tensor) also keeps this working when the model runs
    # on a GPU, where a tensor cannot be converted implicitly.
    sentence_embeddings = index_transformer_model.encode(sentences, convert_to_numpy=True)
    sentence_embeddings = np.ascontiguousarray(sentence_embeddings, dtype=np.float32)

    # Row id the first new vector will receive; FAISS assigns ids sequentially.
    first_row_id = index.ntotal
    index.add(sentence_embeddings)

    for sentence_idx, sentence_text in enumerate(sentences):
        # Track the mapping between the embedding's index row and its sentence
        sentence_to_index_mapping[first_row_id + sentence_idx] = {
            'document_index': guid,
            'sentence_index': sentence_idx,
            'sentence_text': sentence_text  # Save the actual sentence
        }

    print("Length of the index:", index.ntotal)
    

def load_existing_index():
    """Load the FAISS index and sentence mapping from disk, or build them.

    When both ``index_name`` and ``mapping_name`` exist they are read into the
    globals ``index`` and ``sentence_to_index_mapping``.  Otherwise a new flat
    L2 index is created, every document listed in ``document_guid_lookup_df``
    is added through ``add_document_to_index``, and both files are written.

    Returns:
        None.  Results are stored in the module-level globals.
    """
    global index
    global sentence_to_index_mapping
    if os.path.exists(index_name) and os.path.exists(mapping_name):
        index = faiss.read_index(index_name)
        print(f"Loaded index from {index_name}")

        with open(mapping_name, 'r', encoding='utf-8') as mapping_file:
            # JSON object keys are always strings.  Convert them back to ints
            # so a loaded mapping has the same key type as a freshly built one.
            sentence_to_index_mapping = {
                int(row_id): entry
                for row_id, entry in json.load(mapping_file).items()
            }
        print(f"Loaded sentence_to_index_mapping from {mapping_name}")
    else:
        print('Index does not exist.  Attempting to build it.')
        # Take the vector width from the model so the index still matches if
        # the model is swapped; fall back to 768 if the model cannot report it.
        embedding_dim = index_transformer_model.get_sentence_embedding_dimension() or 768
        index = faiss.IndexFlatL2(embedding_dim)  # Create an index
        # Maintain a mapping between sentence embeddings' index and their original sentences
        sentence_to_index_mapping = {}

        # Load the docs into the index
        for _, row in document_guid_lookup_df.iterrows():
            add_document_to_index(row['doc_guid'], row['doc_filename'])

        # Save the index and mapping
        faiss.write_index(index, index_name)
        with open(mapping_name, 'w', encoding='utf-8') as mapping_file:
            json.dump(sentence_to_index_mapping, mapping_file)

    
load_existing_index()
print("Final length of the index:", index.ntotal)
# Report the size only: the mapping holds the text of every indexed sentence,
# so printing it in full floods the cell output once documents are loaded.
print('sentence_to_index_mapping entries:', len(sentence_to_index_mapping))

Loaded index from sentences.index
Loaded sentence_to_index_mapping from sentence_to_index_mapping.json
Final length of the index: 0
sentence_to_index_mapping {}


### Update an existing index

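Updating the index involves three steps: checking the documents folder for files that are not yet registered, adding those files to `document_guid_lookup_df`, and loading the sentences of each new document into both the FAISS index and the `sentence_to_index_mapping`.  Finally, all three files (the lookup CSV, the index, and the mapping) are saved.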

In [5]:

def update_index_and_mappings(new_entries):
    """Index a batch of newly registered documents and persist the results.

    Args:
        new_entries: Iterable of dicts, each with the keys ``'doc_guid'`` and
            ``'doc_filename'``.

    Each entry is passed to ``add_document_to_index``.  Afterwards the FAISS
    index is written to ``index_name`` and the sentence mapping is dumped as
    JSON to ``mapping_name``.
    """
    for entry in new_entries:
        doc_guid, doc_filename = entry['doc_guid'], entry['doc_filename']
        print('Updating', doc_guid, doc_filename)
        add_document_to_index(doc_guid, doc_filename)

    # Write both artefacts back to disk once the whole batch has been added.
    faiss.write_index(index, index_name)
    with open(mapping_name, 'w') as out_file:
        json.dump(sentence_to_index_mapping, out_file)

def update_document_guid_lookup_df():
    """Register new files found in ``docs_dir``, index them and save the lookup.

    Every regular file in ``docs_dir`` whose name is not yet listed in the
    ``doc_filename`` column of ``document_guid_lookup_df`` gets a fresh UUID.
    The new documents are indexed through ``update_index_and_mappings`` (which
    also saves the FAISS index and the sentence mapping), then the lookup table
    is extended and written to ``document_guid_lookup_csv``.

    Persisting the CSV matters: without it the next run sees the same files as
    new again, assigns them different GUIDs and indexes their sentences twice.

    Returns:
        None.  The global ``document_guid_lookup_df`` is replaced by the
        extended table when new documents were found.
    """
    global document_guid_lookup_df
    if document_guid_lookup_df is None:
        # Load the existing one or create a new one
        get_document_guid_lookup()

    existing_files = set(document_guid_lookup_df['doc_filename'].tolist())

    # os.listdir() returns names in arbitrary order; sort so documents are
    # always indexed in the same order.
    new_entries = [
        {'doc_guid': str(uuid.uuid4()), 'doc_filename': file_name}
        for file_name in sorted(os.listdir(docs_dir))
        if file_name not in existing_files
        and os.path.isfile(os.path.join(docs_dir, file_name))
    ]

    if not new_entries:
        print('No new documents were found.  No DataFrame update required.')
        return

    print('There are new entries:', new_entries)

    # Index first.  If this raises, the lookup table is left untouched, so the
    # same files are picked up as new on the next attempt.
    update_index_and_mappings(new_entries)

    # Only now record the documents as known and save the table to disk.
    new_df = pd.DataFrame(new_entries)
    document_guid_lookup_df = pd.concat([document_guid_lookup_df, new_df], ignore_index=True)
    document_guid_lookup_df.to_csv(document_guid_lookup_csv, index=False)
    print("Documents checked and DataFrame updated successfully.")

# Scan docs_dir for files missing from the lookup table and index any that are found.
update_document_guid_lookup_df()

There are new entries: [{'doc_guid': 'f6126e61-5598-4393-82a3-9f2e47e78d27', 'doc_filename': 'AI_1.txt'}, {'doc_guid': 'dae8f392-ab50-4f9f-9388-19958fe90a5f', 'doc_filename': 'AI_2.txt'}, {'doc_guid': '921de1d3-0f4e-41ca-a294-bef7ed930d3d', 'doc_filename': 'AI_3.txt'}, {'doc_guid': '2b7f293c-6b45-447d-9978-eb90f67a8501', 'doc_filename': 'AI_4.txt'}, {'doc_guid': '1e7cb242-9c00-40fa-b999-dda1f084514a', 'doc_filename': 'AI_5.txt'}, {'doc_guid': 'c7674a32-1077-4b17-b766-46c9a8bce70e', 'doc_filename': 'AI_6.txt'}, {'doc_guid': 'e4239869-6536-4771-bc3b-b0ec3ab567ce', 'doc_filename': 'ClimateChange_1.txt'}, {'doc_guid': '13b52eb1-6c72-4faa-977e-dadc05c7b598', 'doc_filename': 'CulturalDiversityAndTraditions_1.txt'}, {'doc_guid': 'fc9b5a49-ed22-449c-9984-61b243c46646', 'doc_filename': 'FinancialMarkets_1.txt'}, {'doc_guid': '2f9e345e-a413-4bcf-bb90-2f717d9850a6', 'doc_filename': 'HistoryAndHistoricalEvents_1.txt'}, {'doc_guid': '6ec3579c-3800-41a4-9820-2df1b0cd2726', 'doc_filename': 'Terrorism