In [None]:
!pip install faiss-cpu sentence-transformers tqdm

In [None]:
import glob
import json
import os
import pickle
import logging
from tqdm import tqdm
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Configure logging for error tracking
logging.basicConfig(
    filename="faiss_errors.log",
    filemode="a",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.ERROR
)

print("All packages imported and logging configured!")


In [None]:
# Load JSONL files and prepare data

def load_jsonl(file_path):
    """Yield one decoded JSON value per non-blank line of a JSONL file.

    Blank or whitespace-only lines are skipped silently. A line that is not
    valid JSON is reported through ``logging.error`` together with its
    1-based line number in the file and then skipped, so one bad record does
    not abort the rest of the file.

    Args:
        file_path: Path of the JSONL file to read; it is opened as UTF-8 text.

    Yields:
        The value decoded from each valid line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            # Empty separator lines are not records; do not log them as errors.
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                # The decoder's own message always says "line 1" because it
                # only sees a single line, so report the real file line too.
                logging.error(
                    f"JSON decode error in file {file_path} (line {line_no}): {e}"
                )
                continue
            # Yield outside the try so an exception thrown into the generator
            # by the consumer is never mistaken for a decode error.
            yield record

# Directory containing the JSONL files. Set the JSONL_DATASET_DIR environment
# variable to point somewhere else; the default is the original local path.
jsonl_directory = os.environ.get(
    "JSONL_DATASET_DIR", "/Users/ammar/Desktop/Dissertation/Dataset"
)
# glob returns matches in arbitrary order; sort them so the position of every
# chunk (and therefore every row of an index built from all_texts) is the same
# on each run.
jsonl_files = sorted(glob.glob(os.path.join(jsonl_directory, "*.jsonl")))
print(f"Found {len(jsonl_files)} JSONL files in {jsonl_directory}")

# Upper bound, in UTF-8 bytes, on the size of a stored text chunk.
MAX_TEXT_BYTES = 4096


def truncate_utf8(text, max_bytes=MAX_TEXT_BYTES):
    """Return ``text`` cut down to at most ``max_bytes`` UTF-8 bytes.

    Args:
        text: The string to limit.
        max_bytes: Maximum size of the UTF-8 encoding of the result.

    Returns:
        ``text`` itself when its encoding already fits; otherwise the longest
        prefix whose encoding fits. A multi-byte character split by the cut
        is dropped rather than decoded into garbage.
    """
    encoded = text.encode("utf-8")
    if len(encoded) <= max_bytes:
        return text
    return encoded[:max_bytes].decode("utf-8", errors="ignore")


# Parallel lists: entry k of each list describes the same chunk.
all_texts = []
doc_ids = []
chunk_nums = []

# Process each JSONL file to extract texts and metadata.
for file in jsonl_files:
    print(f"Processing file: {file}")
    for data in load_jsonl(file):
        # A valid JSON line need not be an object (it may be a list, number,
        # string, ...); such records have no "text" key to read.
        if not isinstance(data, dict):
            logging.error(
                f"Skipping non-object JSON record in file {file}: "
                f"{type(data).__name__}"
            )
            continue
        text = data.get("text")  # Assumes the key for text is "text"
        if not text:
            continue
        # Only strings can be encoded and embedded; skip anything else
        # instead of crashing on .encode().
        if not isinstance(text, str):
            logging.error(
                f"Skipping record with non-string text in file {file}: "
                f"{type(text).__name__}"
            )
            continue
        limited_text = truncate_utf8(text)
        if limited_text != text:
            print(f"Truncating text in file {file}")
        all_texts.append(limited_text)
        doc_ids.append(data.get("doc_id", ""))
        chunk_nums.append(data.get("chunk_num", 0))

print(f"Loaded {len(all_texts)} text chunks from {len(jsonl_files)} files.")


In [None]:
# Build FAISS Index and Save Files

# Load the SentenceTransformer model for generating embeddings.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Embedding model loaded.")

# Define FAISS index parameters.
dim = 384  # Dimension for all-MiniLM-L6-v2 embeddings
index = faiss.IndexFlatL2(dim)

# Process texts in batches to compute embeddings and add them to the FAISS index.
batch_size = 1000
num_texts = len(all_texts)
print("Starting batch embedding and FAISS index construction...")

# Positions in all_texts whose vectors were actually added to the index, in
# insertion order. Row k of the index corresponds to indexed_positions[k].
indexed_positions = []

for i in tqdm(range(0, num_texts, batch_size), desc="Processing batches"):
    batch_texts = all_texts[i:i+batch_size]
    try:
        batch_embeddings = embedding_model.encode(batch_texts, show_progress_bar=False)
        index.add(np.array(batch_embeddings).astype("float32"))
    except Exception as e:
        # Best effort: log the failed batch and carry on with the next one.
        # Its texts are NOT recorded as indexed, so they are left out of the
        # saved metadata below.
        logging.error(f"Error processing batch starting at index {i}: {e}")
        continue
    indexed_positions.extend(range(i, i + len(batch_texts)))

print(f"FAISS index built with {index.ntotal} embeddings.")

# The index is only usable if row k can be mapped back to its text. Refuse to
# write anything if the bookkeeping and the index disagree.
if index.ntotal != len(indexed_positions):
    raise RuntimeError(
        f"FAISS index holds {index.ntotal} vectors but {len(indexed_positions)} "
        "texts were recorded as indexed; refusing to save misaligned metadata."
    )

if len(indexed_positions) == num_texts:
    # Every batch succeeded: the metadata lists already line up with the index.
    indexed_texts = all_texts
    indexed_doc_ids = doc_ids
    indexed_chunk_nums = chunk_nums
else:
    # Some batches failed. Keeping their entries in the metadata would shift
    # every later row of the index onto the wrong text, so drop them.
    skipped_count = num_texts - len(indexed_positions)
    print(
        f"WARNING: {skipped_count} of {num_texts} texts could not be embedded "
        "(see faiss_errors.log); they are excluded from the saved metadata. "
        "Use indexed_texts / indexed_doc_ids / indexed_chunk_nums with this index."
    )
    indexed_texts = [all_texts[p] for p in indexed_positions]
    indexed_doc_ids = [doc_ids[p] for p in indexed_positions]
    indexed_chunk_nums = [chunk_nums[p] for p in indexed_positions]

# Save the FAISS index and metadata files.
# Entry k of each pickled list describes row k of the saved index.
faiss.write_index(index, "my_index.idx")
with open("all_texts.pkl", "wb") as f:
    pickle.dump(indexed_texts, f)
with open("doc_ids.pkl", "wb") as f:
    pickle.dump(indexed_doc_ids, f)
with open("chunk_nums.pkl", "wb") as f:
    pickle.dump(indexed_chunk_nums, f)

print("FAISS index and metadata files saved!")