# In [None]:
# Import necessary libraries
import os  # For interacting with the operating system (e.g., file paths)
import pandas as pd  # For working with data in table format
import numpy as np  # For numerical operations
from transformers import DistilBertTokenizer, DistilBertModel  # For processing and embedding text
import torch  # For machine learning operations
import PyPDF2  # For reading PDF files
import logging  # For logging messages (errors, info, etc.)
from tqdm import tqdm  # For showing progress bars
import warnings  # For handling warning messages

# Configure logging once at import time: INFO level and above, each record
# formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Silence UserWarning messages attributed to pandas' masked-array module and to
# PyPDF2; warnings of other categories or from other modules are still shown.
warnings.filterwarnings("ignore", category=UserWarning, module="pandas.core.arrays.masked")
warnings.filterwarnings("ignore", category=UserWarning, module="PyPDF2")

# Load the pretrained 'distilbert-base-uncased' tokenizer and encoder used by
# every embedding function below.
# NOTE(review): this runs at import time; from_pretrained presumably downloads
# or reads cached weights here — confirm this is acceptable for importers.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Inputs are moved to this same device before each forward pass

def read_file(file_path):
    """
    Read the text content of a CSV, PDF, or TXT file.

    The reader is selected from the file extension (case-insensitive).

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's text as one string, or None when the extension is not
        supported or reading fails (the failure is logged, not raised).
        For CSV files the data cells (the header row is excluded) are joined
        with single spaces in row-major order; empty cells are skipped.
    """
    # Get the file extension (e.g., .csv, .pdf, .txt)
    file_type = os.path.splitext(file_path)[1].lower()
    try:
        if file_type == '.csv':
            # Read every cell as raw text with NA parsing disabled. Otherwise
            # blank cells become the literal word "nan" and integer columns
            # with gaps are re-spelled as floats ("2" -> "2.0"), both of which
            # pollute the text that is later embedded.
            df = pd.read_csv(file_path, dtype=str, keep_default_na=False).fillna('')
            return ' '.join(cell for cell in df.to_numpy().ravel() if cell)
        elif file_type == '.pdf':
            # Delegate to read_pdf, which handles per-page extraction errors
            return read_pdf(file_path)
        elif file_type == '.txt':
            # Plain text: return the decoded contents unchanged
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        else:
            # Not a supported file type: log a warning and return None
            logger.warning(f"Unsupported file type: {file_path}")
            return None
    except Exception as e:
        # Best-effort reader: one unreadable file must not abort a whole run,
        # so any failure is logged and reported to the caller as None.
        logger.error(f"Error reading file {file_path}: {str(e)}")
        return None

def read_pdf(file_path):
    """
    Extract the text of a PDF file, page by page.

    Pages whose extraction raises are logged with a warning and left out;
    the remaining pages are still returned. Each extracted page is followed
    by a newline.

    Returns:
        The concatenated page text, or None when nothing but whitespace was
        extracted or the file could not be read (the error is logged).
    """
    try:
        with open(file_path, 'rb') as pdf_handle:
            pdf_pages = PyPDF2.PdfReader(pdf_handle).pages
            pieces = []
            for pdf_page in pdf_pages:
                try:
                    piece = pdf_page.extract_text() + "\n"
                except Exception as page_error:
                    # A single bad page should not discard the whole document
                    logger.warning(f"Error extracting text from page in {file_path}: {str(page_error)}")
                else:
                    pieces.append(piece)
            combined = "".join(pieces)
            if not combined.strip():
                return None
            return combined
    except PyPDF2.errors.PdfReadError as pdf_error:
        # The file is not a readable PDF
        logger.error(f"PyPDF2 error reading {file_path}: {str(pdf_error)}")
        return None
    except Exception as other_error:
        # Anything else (missing file, permissions, ...) is also non-fatal
        logger.error(f"Unexpected error reading PDF {file_path}: {str(other_error)}")
        return None

def chunk_text(text, chunk_size=1000):
    """
    Split text into chunks of at most ``chunk_size`` words.

    Words are whitespace-separated tokens; runs of whitespace (including
    newlines) are collapsed to single spaces in the output.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # Without this check a negative size silently returns [] (dropping all the
    # text) and zero fails with an unrelated-looking range() error.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

def get_embedding(text):
    """
    Embed one text with DistilBERT.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the last hidden state is averaged
    over the token axis. The result is returned as a NumPy array on the CPU.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inputs must live on the same device as the model
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.cpu().numpy()

def batch_get_embeddings(texts, batch_size=32):
    """
    Generate embeddings for many texts, one model forward pass per batch.

    Each batch is tokenized together (padded to its longest member, truncated
    to 512 tokens) and encoded in a single call. Token vectors are averaged
    with the attention mask as weights so padding does not contribute; for
    each text this is the same mean-over-tokens pooling that embedding it
    alone would give, up to floating-point rounding.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A list with one 1-D NumPy array per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    texts = list(texts)  # the tokenizer needs real lists, and we slice below
    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings", unit="batch"):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
        # Zero out padded positions, then divide by the count of real tokens.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        # Iterating the 2-D array yields one 1-D row per text.
        embeddings.extend(pooled.cpu().numpy())
    return embeddings

def process_documents(input_dir, output_csv, chunk_size=1000, batch_size=32):
    """
    Process documents and create the vector database CSV.

    Walks ``input_dir`` recursively, reads every supported file, splits its
    text into word chunks, embeds each chunk, and writes one row per chunk to
    ``output_csv`` with the columns ``chunk_id``, ``document_id`` (file name
    without extension), ``document_file`` (path), ``chunk_text`` and
    ``vector_embedding`` (comma-separated floats).

    Args:
        input_dir: Directory to scan for documents.
        output_csv: Path of the CSV file to write.
        chunk_size: Maximum words per chunk. Note that the embedding step
            truncates its input to 512 tokens, so words beyond that limit in
            a chunk do not influence its embedding.
        batch_size: Number of chunks passed to the embedding step at a time.

    Returns:
        None. Nothing is written when the directory is missing or no file
        yields any text; both cases are logged.
    """
    # A missing directory would otherwise look exactly like an empty one.
    if not os.path.isdir(input_dir):
        logger.error(f"Input directory not found: {input_dir}")
        return

    all_chunks = []
    processed_files = 0
    skipped_files = 0
    output_path = os.path.abspath(output_csv)

    # Walk through all files in the input directory
    for root, dirs, files in os.walk(input_dir):
        # os.walk order is filesystem-dependent; sorting (dirs in place, which
        # steers the descent) makes chunk_id assignment reproducible.
        dirs.sort()
        for file in tqdm(sorted(files), desc="Processing files"):
            file_path = os.path.join(root, file)
            # Do not feed a previously written database back into itself.
            if os.path.abspath(file_path) == output_path:
                continue
            text = read_file(file_path)
            chunks = chunk_text(text, chunk_size) if text else []
            if not chunks:
                # Unreadable, unsupported, empty or whitespace-only file
                skipped_files += 1
                continue
            document_id = os.path.splitext(file)[0]
            for chunk in chunks:
                all_chunks.append({
                    'chunk_id': len(all_chunks),
                    'document_id': document_id,
                    'document_file': file_path,
                    'chunk_text': chunk
                })
            processed_files += 1

    if not all_chunks:
        logger.info("No documents processed successfully.")
        return

    logger.info(f"Generating embeddings for {len(all_chunks)} chunks")
    df = pd.DataFrame(all_chunks)
    df['vector_embedding'] = batch_get_embeddings(df['chunk_text'].tolist(), batch_size=batch_size)

    logger.info(f"Saving to {output_csv}")
    # Convert embeddings to string for CSV storage
    df['vector_embedding'] = df['vector_embedding'].apply(lambda x: ','.join(map(str, x)))
    df.to_csv(output_csv, index=False)
    logger.info(f"Processed {processed_files} files successfully, skipped {skipped_files} files, created {len(df)} chunks")

def search_similar(query, df, top_k=5):
    """
    Return the ``top_k`` chunks most similar to ``query``.

    The query is embedded and scored against every row's stored embedding
    with a raw (unnormalized) dot product; higher means more similar.

    Args:
        query: Text to search for.
        df: DataFrame with a ``vector_embedding`` column holding either the
            comma-separated string form written to the CSV or already-parsed
            numeric sequences.
        top_k: Number of rows to return.

    Returns:
        A new DataFrame with the best-scoring rows in descending order of the
        added ``similarity`` column, and ``vector_embedding`` parsed to NumPy
        arrays. The caller's ``df`` is left unmodified.
    """
    query_embedding = get_embedding(query)

    def _to_vector(value):
        # Accept both the CSV string form and vectors parsed by an earlier step.
        if isinstance(value, str):
            return np.fromstring(value, sep=',')
        return np.asarray(value, dtype=float)

    # Work on a copy: overwriting the caller's column in place would make a
    # second search on the same frame fail when parsing arrays as strings.
    ranked = df.copy()
    ranked['vector_embedding'] = ranked['vector_embedding'].apply(_to_vector)
    ranked['similarity'] = ranked['vector_embedding'].apply(lambda vec: np.dot(vec, query_embedding))
    return ranked.sort_values('similarity', ascending=False).head(top_k)

if __name__ == "__main__":
    # Build the vector database: read every supported file under the relative
    # "documents" directory and write the chunks plus embeddings to a CSV file.
    input_dir = "documents"
    output_csv = "vector_final_db.csv"
    process_documents(input_dir, output_csv)
    # Example query against the saved database (disabled; uncomment to try it):
    # df = pd.read_csv(output_csv)
    # query = "deep learning in computer vision"
    # results = search_similar(query, df)
    # print(results[['chunk_text', 'document_file', 'similarity']])