# In [None]:
# Import necessary libraries
import os, re, torch, hashlib, uuid, json, redis
import numpy as np
from transformers import AutoTokenizer, AutoModel
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter  # Import other text splitters as needed
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.redis import Redis
from langchain.schema import Document  # Import Document abstraction
from dotenv import load_dotenv

# Populate the process environment from a local .env file before any setting
# below is read.
load_dotenv()

# Source directories to ingest, each mapped to the corpus label that is
# attached to every chunk taken from that directory.
corpora = {
    "/home/jamessmithies/Dropbox/Technical/projects/aiinfra/vector_sources/1901/au/hofreps/txt": "1901-au",
    "/home/jamessmithies/Dropbox/Technical/projects/aiinfra/vector_sources/1901/nz/hofreps/txt": "1901-nz",
    "/home/jamessmithies/Dropbox/Technical/projects/aiinfra/vector_sources/1901/uk/hofcoms/txt": "1901-uk"
}

# Model, Redis connection and index name; each is None when its variable is unset.
EMBEDDING_MODEL = os.environ.get('EMBEDDING_MODEL')
REDIS_URL = os.environ.get('REDIS_URL')
INDEX_NAME = os.environ.get('INDEX_NAME')

# Chunking settings; the second argument is the fallback used when the
# variable is absent from the environment.
TEXT_SPLITTER_TYPE = os.environ.get('TEXT_SPLITTER_TYPE', 'RecursiveCharacterTextSplitter')
CHUNK_SIZE = int(os.environ.get('CHUNK_SIZE', 500))
CHUNK_OVERLAP = int(os.environ.get('CHUNK_OVERLAP', 75))

# Build the text splitter selected by name
def get_text_splitter(splitter_type, chunk_size, chunk_overlap):
    """Return a text splitter of the requested kind.

    Args:
        splitter_type: Either 'RecursiveCharacterTextSplitter' or
            'CharacterTextSplitter'.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Raises:
        ValueError: If ``splitter_type`` is neither of the two supported names.
    """
    recursive_name = 'RecursiveCharacterTextSplitter'
    plain_name = 'CharacterTextSplitter'

    # Reject unknown names up front, before touching either splitter class.
    if splitter_type not in (recursive_name, plain_name):
        raise ValueError(f"Unsupported text splitter type: {splitter_type}")

    if splitter_type == recursive_name:
        splitter_cls = RecursiveCharacterTextSplitter
    else:
        splitter_cls = CharacterTextSplitter
    return splitter_cls(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Connect to Redis, either locally (if installed) or Redis Cloud (details in .env).
# The client is built from REDIS_URL. It is not referenced again in the visible
# part of this file; the vector store further down is given REDIS_URL directly.
redis_client = redis.Redis.from_url(REDIS_URL)

# Load the tokenizer and model named by EMBEDDING_MODEL. compute_embedding
# reads both of these as module-level globals.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
model = AutoModel.from_pretrained(EMBEDDING_MODEL)

# Embed one piece of text with the module-level tokenizer and model
def compute_embedding(text):
    """Return a mean-pooled embedding for ``text``, or None on failure.

    The text is tokenized with truncation at ``max_length=512``, passed
    through the model with gradient tracking disabled, and the model's
    ``last_hidden_state`` is averaged over ``dim=1``, squeezed, and converted
    to a NumPy array.

    Any exception raised along the way is reported with ``print`` and
    swallowed, so callers must check the result for None.
    """
    try:
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        vector = hidden_states.mean(dim=1).squeeze().numpy()
    except Exception as e:
        # Best-effort: report the failing text (first 50 characters) and move on.
        print(f"Error computing embedding for text: {text[:50]}... - {str(e)}")
        return None
    return vector

# Pull a written-out date from a source filename
def extract_date_from_filename(filename):
    """Return the first date-like substring of ``filename``.

    The pattern looks for ``<word>, <1-2 digit day>[st|nd|rd|th] <word>,
    <4 digit year>``, e.g. ``Tuesday, 21st May, 1901``.

    Returns:
        The matched text, or the literal string "Unknown Date" when nothing
        matches. Progress is echoed with debug prints either way.
    """
    print(f"Extracting date from filename: {filename}")  # Debug print

    date_pattern = re.compile(r'(\w+, \d{1,2}(?:st|nd|rd|th)? \w+, \d{4})')
    found = date_pattern.search(filename)
    if found is None:
        print("No date found, returning 'Unknown Date'")  # Debug print
        return "Unknown Date"

    date = found.group(0)
    print(f"Extracted date: {date}")  # Debug print
    return date

# Extract URL from text
def extract_url(text):
    """Find every ``<url>...</url>`` tag in ``text``.

    Only http:// and https:// URLs without embedded whitespace are recognised.

    Returns:
        A list of ``(offset, url)`` tuples in order of appearance, where
        ``offset`` is the index of the opening ``<url>`` in ``text`` and
        ``url`` is the tag's content. Empty when no tag is present.
    """
    # The URL body is matched lazily so that each tag ends at its own </url>.
    # A greedy match runs on to the last </url> in the same whitespace-free
    # run and merges neighbouring tags into a single bogus URL.
    return [
        (match.start(), match.group(1))
        for match in re.finditer(r'<url>(https?://\S+?)</url>', text)
    ]

# Locate <page>N</page> markers in text
def extract_page_number(text):
    """Find every ``<page>N</page>`` marker in ``text``.

    Returns:
        A list of ``(offset, digits)`` tuples in order of appearance, where
        ``offset`` is the index of the opening ``<page>`` and ``digits`` is
        the page number as the string captured from the tag (not an int).
    """
    page_tag = re.compile(r'<page>(\d+)</page>')
    return [(hit.start(), hit[1]) for hit in page_tag.finditer(text)]

# Compose the storage key for one chunk
def generate_unique_key(base_key, chunk_idx, corpus_metadata):
    """Return ``'<base_key>:<corpus_metadata>:<chunk_idx>'``.

    The key is only as unique as the ``chunk_idx`` values the caller supplies
    for a given ``base_key`` and ``corpus_metadata``.
    """
    return "{}:{}:{}".format(base_key, corpus_metadata, chunk_idx)

# Ingest one corpus directory: chunk, embed and index every .txt file in it.

# One <url>...</url> tag. The URL body is lazy so a tag ends at its own </url>
# and text sitting between two tags is never swallowed when tags are stripped.
_URL_TAG_RE = re.compile(r'<url>https?://\S+?</url>')
# Splits on <page>N</page> markers while keeping the markers in the result.
_PAGE_SPLIT_RE = re.compile(r'(<page>\d+</page>)')
# A string that is exactly one <page>N</page> marker; captures N.
_PAGE_TAG_RE = re.compile(r'<page>(\d+)</page>')


def _first_tagged_url(text):
    """Return the content of the first <url> tag in ``text``, or None."""
    tagged = extract_url(text)
    return tagged[0][1] if tagged else None


def _iter_page_sections(content):
    """Yield ``(page, url, text)`` for each stretch of text after a <page> marker.

    ``page`` is the int from the most recent marker. ``url`` is the most
    recently seen <url> tag, carried forward across pages until a later
    section supplies a new one. Text before the first marker is not yielded,
    but a URL tagged there still becomes the initial ``url``.
    """
    current_page = None
    current_url = None
    for part in _PAGE_SPLIT_RE.split(content):
        marker = _PAGE_TAG_RE.fullmatch(part)
        if marker:
            # fullmatch (rather than startswith) so that free text which merely
            # begins with "<page>" is treated as text instead of crashing.
            current_page = int(marker.group(1))
            continue

        # Read the URL from this section's own tags. Searching the section for
        # URLs collected from the whole document matches on substrings, so
        # ".../1" would be found inside ".../12".
        section_url = _first_tagged_url(part)
        if section_url is not None:
            current_url = section_url

        if current_page is not None:
            yield current_page, current_url, part


def _build_chunk_metadata(source, date_info, url, page, chunk_idx, corpus):
    """Return the metadata dict stored alongside one chunk."""
    # NOTE(review): these bounds are derived from the configured chunk size and
    # overlap, not from the chunk's real position, and "from" advances by
    # CHUNK_SIZE - CHUNK_OVERLAP per chunk while "to" advances by CHUNK_SIZE.
    # Kept unchanged so stored metadata stays the same; confirm what consumers
    # expect before correcting.
    window_start = chunk_idx * (CHUNK_SIZE - CHUNK_OVERLAP) + 1
    window_end = (chunk_idx + 1) * CHUNK_SIZE
    return {
        "source": source,
        "date": date_info,
        "url": url,
        "page": page,
        "loc": json.dumps({"lines": {"from": window_start, "to": window_end}}),
        "corpus": corpus,
    }


def process_corpus(directory, metadata):
    """Chunk, embed and index every ``*.txt`` file found in ``directory``.

    For each loaded document the date is taken from its source path and the
    <url>/<page> tags are read from its content. A document without <page>
    markers is handled as one section (page None, URL = first tagged URL in
    the document). Otherwise it is split into per-page sections. Each section
    has its <url> tags stripped, is split with the configured text splitter,
    and every chunk that embeds successfully is queued with its metadata.
    The queued chunks are written to the vector store in a single call.

    Args:
        directory: Folder whose ``*.txt`` files are loaded.
        metadata: Corpus label; stored on every chunk under "corpus" and made
            part of every storage key.

    Returns:
        None. Output goes to the module-level ``vector_store``; progress is
        printed to stdout.
    """
    # Load documents using Langchain's DirectoryLoader
    loader = DirectoryLoader(directory, glob="*.txt")
    documents = loader.load()

    # Initialize the text splitter
    text_splitter = get_text_splitter(TEXT_SPLITTER_TYPE, CHUNK_SIZE, CHUNK_OVERLAP)

    # Counts stored chunks across the whole corpus so each one gets its own key.
    chunk_counter = 0
    keys = []
    texts = []
    metadatas = []
    vectors = []

    for doc in documents:
        source = doc.metadata['source']

        # Extract metadata from the entire document content
        date_info = extract_date_from_filename(source)
        url_info = extract_url(doc.page_content)
        page_info = extract_page_number(doc.page_content)

        # Log the extracted metadata
        print(f"Extracted URL: {url_info} from document: {source}")
        print(f"Extracted Page Numbers: {page_info} from document: {source}")

        if page_info:
            sections = _iter_page_sections(doc.page_content)
        else:
            # No page markers: the whole document is one section that uses
            # the first URL tagged anywhere in it.
            top_url = url_info[0][1] if url_info else None
            sections = [(None, top_url, doc.page_content)]

        for page, url, section in sections:
            # Remove URL tags so they do not end up in the embedded text
            clean_section = _URL_TAG_RE.sub('', section)

            # chunk_idx restarts for every section; chunk_counter does not.
            for chunk_idx, chunk in enumerate(text_splitter.split_text(clean_section)):
                embedding = compute_embedding(chunk)
                if embedding is None:
                    # compute_embedding already reported the failure; skip the chunk.
                    continue

                keys.append(generate_unique_key(f"doc:{INDEX_NAME}", chunk_counter, metadata))
                chunk_counter += 1

                texts.append(chunk)
                metadatas.append(
                    _build_chunk_metadata(source, date_info, url, page, chunk_idx, metadata)
                )
                vectors.append(embedding.tolist())

    if texts:
        # Hand the generated keys to the store so each entry is written under
        # its deterministic key; previously the key was computed and discarded.
        # NOTE(review): the vectors come from compute_embedding rather than
        # from the embedding object the store was created with - confirm the
        # two produce comparable vectors for query-time search.
        vector_store.add_texts(texts, metadatas=metadatas, embeddings=vectors, keys=keys)
    else:
        # Empty directory, or every chunk failed to embed: skip the call
        # rather than hand the store empty lists.
        print(f"No chunks produced for corpus: {metadata}; nothing written to the vector store")

    print(f"Finished processing corpus: {metadata}")

# Initialize HuggingFaceEmbeddings for EMBEDDING_MODEL. This object is handed
# to the vector store below. The vectors written by process_corpus are computed
# separately by compute_embedding and passed to add_texts explicitly.
# NOTE(review): both paths load the same model name independently - confirm
# they produce comparable vectors before relying on similarity search.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Initialize Redis vector store for INDEX_NAME at REDIS_URL.
# process_corpus reads `vector_store` as a module-level global, so it must
# exist before the loop below runs.
vector_store = Redis(
    redis_url=REDIS_URL,
    embedding=embeddings,
    index_name=INDEX_NAME,
)

# Process each corpus: `directory` is the source folder and `metadata` is the
# corpus label it maps to in `corpora`.
for directory, metadata in corpora.items():
    process_corpus(directory, metadata)