## Imports

In [4]:
import os
import fitz  # PyMuPDF
from docx import Document

## Proof of concept

In [13]:
def read_text_from_file(file_path):
    """
    Reads text from a file with supported formats (.docx, .pdf, .txt).

    The extension check is case-insensitive. Text files are decoded as UTF-8
    first and re-read as Latin-1 when UTF-8 decoding fails.

    Parameters:
    file_path (str): The path to the file to be read.

    Returns:
    str: The text extracted from the file.

    Raises:
    ValueError: If the file extension is not one of the supported formats.
    """
    # Compare against a lower-cased copy so '.PDF' or '.Txt' are recognised too.
    lowered_path = file_path.lower()

    if lowered_path.endswith('.docx'):
        # NOTE(review): docx2txt is not imported in the visible part of this file.
        return docx2txt.process(file_path)

    if lowered_path.endswith('.pdf'):
        # NOTE(review): PyPDF2 is not imported in the visible part of this file.
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # `or ''` keeps the join safe should a page yield None instead of a string.
            return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

    if lowered_path.endswith('.txt'):
        try:
            with open(file_path, 'r', encoding='utf-8') as txt_file:
                return txt_file.read()
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a character, so this second read cannot
            # fail on decoding; the message records that the fallback was used.
            with open(file_path, 'r', encoding='latin-1') as txt_file:
                text = txt_file.read()
            print(f"Error: Could not decode text from {file_path}. Non UTF-8 character.")
            return text

    # Previously this path fell through to `return text` with `text` unbound.
    raise ValueError(f"Unsupported file type: {file_path}")

def chunk_text(text, chunk_size=4000):
    """
    Lazily yields pieces of `text` holding at most `chunk_size` words each.

    Parameters:
    text (str): The text to split; words are separated on whitespace.
    chunk_size (int): Maximum number of words per yielded piece.

    Yields:
    str: Consecutive groups of words re-joined with single spaces.
    """
    words = text.split()
    starts = range(0, len(words), chunk_size)
    yield from (' '.join(words[begin:begin + chunk_size]) for begin in starts)

def vectorize_text_with_openai(text_chunks):
    """
    Requests one "text-embedding-ada-002" embedding per text chunk.

    Parameters:
    text_chunks (iterable of str): The chunks to embed, one request each.

    Returns:
    list: The embedding taken from each response, in input order.
    """
    def embed(chunk):
        # Each chunk is sent on its own as a plain string input.
        reply = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=chunk,
        )
        # Only the first entry of the response's 'data' list is used.
        return reply['data'][0]['embedding']

    return [embed(chunk) for chunk in text_chunks]

def aggregate_vectors(vectors):
    """
    Averages a collection of vectors element-wise into a single vector.

    Parameters:
    vectors: The vectors to combine; they are averaged along the first axis.

    Returns:
    The element-wise mean as computed by np.mean(..., axis=0).
    """
    mean_vector = np.mean(vectors, axis=0)
    return mean_vector

def store_vector_in_chromadb(vector, group_id, text):
    """
    Stores one document and its embedding in the "policy_files" Chroma collection.

    The collection is fetched if it exists and created otherwise. This is still
    a placeholder setup; adapt the client settings to your ChromaDB deployment.

    Parameters:
    vector: The document embedding (a sequence of floats or a NumPy array).
    group_id (str): Used as the record id and as the "source" metadata value.
    text (str): The document text stored alongside the embedding.
    """
    client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet", persist_directory="db/"))

    # NOTE(review): "cosine" names a distance metric, while `embedding_function`
    # is presumably meant to receive a callable; confirm against the Chroma
    # documentation for the installed version before relying on this setting.
    try:
        collection = client.get_collection(name="policy_files", embedding_function="cosine")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are not
        # swallowed; any ordinary lookup failure falls back to creating it.
        collection = client.create_collection(name="policy_files", embedding_function="cosine")

    # Hand Chroma a plain Python list even when a NumPy array was passed in.
    embedding = vector.tolist() if hasattr(vector, "tolist") else vector

    collection.add(
        documents=[text],
        embeddings=[embedding],
        metadatas=[{"source": group_id}],
        ids=[group_id]
    )

def process_files(folder_path):
    """
    Embeds every file in a folder and stores one aggregated vector per file.

    Each regular file is read, split into chunks, embedded chunk by chunk,
    averaged into a single vector and stored under its file name.
    Sub-directories and files that yield no text are skipped.

    Parameters:
    folder_path (str): The folder whose files should be processed.
    """
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # os.listdir also returns sub-directories, which cannot be read as files.
        if not os.path.isfile(file_path):
            continue

        text = read_text_from_file(file_path)
        text_chunks = list(chunk_text(text))

        # With no chunks there is nothing to embed, and averaging an empty
        # list of vectors would not produce a usable embedding.
        if not text_chunks:
            continue

        vectors = vectorize_text_with_openai(text_chunks)
        aggregated_vector = aggregate_vectors(vectors)
        group_id = filename  # Using filename as group ID, modify as needed
        store_vector_in_chromadb(aggregated_vector, group_id, text)
        

In [1]:
# Example usage: run the proof-of-concept pipeline over every entry in the
# folder below. The path is a hard-coded Windows location.
folder_path = 'D:\\My Projects\\Policy Chatbot\\rag-model\\files'
process_files(folder_path)

NameError: name 'process_files' is not defined

## Here is the real code (Microservices)

### Dynamic file reader

In [11]:
def read_text_from_file(file_path):
    """
    Reads text from a file and returns it split into chunks.

    The reader is picked from the file extension (.pdf, .docx or .txt),
    compared case-insensitively.

    Parameters:
    - file_path (str): The path to the file.

    Returns:
    - list: A list of text chunks.

    Raises:
    - ValueError: If the file extension is not supported.
    """
    # Lower-case once so '.PDF' and '.pdf' select the same reader.
    lowered_path = file_path.lower()

    if lowered_path.endswith('.pdf'):
        return list(read_pdf(file_path))
    if lowered_path.endswith('.docx'):
        return list(read_docx(file_path))
    if lowered_path.endswith('.txt'):
        return list(read_txt(file_path))

    raise ValueError("Unsupported file type")

def read_pdf(file_path):
    """
    Extracts the text of every page of a PDF and returns it in chunks.

    Parameters:
    - file_path (str): The path to the PDF file.

    Returns:
    - list: A list of text chunks.
    """
    # Collect the per-page text while the document is open, then join once.
    with fitz.open(file_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return split_into_chunks("".join(page_texts))

def read_docx(file_path):
    """
    Extracts the paragraph text of a DOCX file and returns it in chunks.

    Parameters:
    - file_path (str): The path to the DOCX file.

    Returns:
    - list: A list of text chunks.
    """
    paragraphs = Document(file_path).paragraphs
    # One line per paragraph, in document order.
    lines = [paragraph.text for paragraph in paragraphs]
    return split_into_chunks('\n'.join(lines))

def read_txt(file_path):
    """
    Reads text from a TXT file and splits it into chunks.

    The file is decoded as UTF-8 (a leading byte-order mark is dropped). If
    that fails it is re-read as Latin-1, so the result does not depend on the
    platform's default encoding.

    Parameters:
    - file_path (str): The path to the TXT file.

    Returns:
    - list: A list of text chunks.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            full_text = file.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this read cannot fail
        # on decoding.
        with open(file_path, 'r', encoding='latin-1') as file:
            full_text = file.read()
    return split_into_chunks(full_text)

def split_into_chunks(text, token_limit=8000):
    """
    Splits the given text into chunks of at most token_limit characters.

    Words (whitespace-separated) are never broken apart, and the words of a
    chunk are joined with single spaces. Despite the parameter name, the
    limit is measured in characters of the joined chunk, not in model tokens.
    A single word longer than the limit becomes a chunk of its own.

    Parameters:
    - text (str): The text to be split.
    - token_limit (int): The maximum number of characters per chunk.

    Returns:
    - list: A list of text chunks.
    """
    chunks = []
    current_chunk = []
    current_length = 0  # length of ' '.join(current_chunk), tracked incrementally

    for word in text.split():
        # Length the chunk would have with this word appended (plus one space
        # separator when the chunk already holds words).
        if current_chunk:
            candidate_length = current_length + 1 + len(word)
        else:
            candidate_length = len(word)

        # Only flush a non-empty chunk; otherwise an oversized first word
        # would produce an empty leading chunk.
        if current_chunk and candidate_length > token_limit:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length = candidate_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


In [19]:
# Folder that holds the data files
folder_path = "D:\\My Projects\\Policy Chatbot\\rag-model\\files"

# For every entry in the folder: read it, split it into chunks, then print the
# file name, the number of chunks, and each chunk followed by a blank line.
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    text_chunks = read_text_from_file(file_path)
    chunck_size = len(text_chunks)
    print(filename, chunck_size, sep="\n")
    for chunks in text_chunks:
        print(f"{chunks}\n")

Abstract.docx
1
Abstract Recommendation applications are becoming evermore demanding with the abundance of information is available online. In this generation of saving time, recommendation systems help develop viable suggestion for a user based on their characteristics made available through the vast collection of data that has been acquired in the past decade, and evermore in coming years. Skincare or cosmetics is an ever-expanding industry that has become more relevant recently given the amount of research made in the domain for people to increase their longevity and to get the desired skin/face that is perfect and inspiring. Recommendation systems can be divided in to 2 main categories, these being the Content Based Filtering, and Collaborative Based Filtering. This project not only contributes in the form of preparing a dataset, but also presents a complete solution in the form of a mobile application which in rare cases work on both perspectives of a recommendation system. This a

sample-docx-file-for-testing.docx
4
Curabitur bibendum ante urna, sed blandit libero egestas id. Pellentesque rhoncus elit in lacus ultrices fringilla. Nam ac metus eu turpis mattis rutrum. Mauris mattis sem ex, facilisis molestie sapien luctus non. Vestibulum tincidunt urna at odio suscipit, vel congue felis cursus. Etiam tellus magna, egestas ac suscipit in, laoreet quis felis. Proin non orci id dui tincidunt egestas. Vestibulum eleifend, ligula a scelerisque vehicula, risus justo ultricies ligula, et interdum lorem ex eget ex. Duis dignissim lacus vitae velit laoreet, vitae placerat velit aliquet. Etiam eget mollis nulla, ac vehicula mi. Etiam non sollicitudin velit, imperdiet commodo mi. Fusce quis tellus tellus. Donec dictum euismod risus non tempus. Duis quis pellentesque nunc. Praesent elementum condimentum mollis. Phasellus dapibus quam a hendrerit placerat. Sed ultrices blandit nulla sed sodales. Nunc quis volutpat eros. Etiam bibendum eu tellus consequat blandit. Curabitur la