In [None]:
!pip install sentence-transformers
!pip install openai==0.28
!pip install pdfminer.six
!pip install httpx
!pip install supabase

In [None]:
# Make necessary imports
import PyPDF2
import nltk
import re
import os
import httpx
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from supabase import create_client, Client
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import openai


## Configurable parameters

In [1]:
# Connection settings and credentials.
# Each value is taken from an environment variable when one is set, so real keys do
# not have to be typed into (or saved with) the notebook. The literal fallbacks keep
# the edit-in-place workflow working when no environment variable is defined.
SUPABASE_URL = os.environ.get("SUPABASE_URL", "https://tmwcifbhcnkaiqsldmlz.supabase.co")
SUPABASE_SERVICE_API_KEY = os.environ.get("SUPABASE_SERVICE_API_KEY", "please use your key here")
# ** NOTE ** : Also update supabase creds in "query_supabase_with_embedding" function

openai_api_key = os.environ.get("OPENAI_API_KEY", 'please use your key here')


In [None]:
# Folder that is scanned for the source PDF files.
dir_path = "books/"

# Supabase table that receives the chunk embeddings.
supabase_table_name = "books_pretrained_embeddings"

# Supabase SQL function used for the cosine-similarity match.
match_function_name = "match_books_pretrained_embeddings"

# Name of the pretrained model on Hugging Face.
model_name = "sentence-transformers/all-mpnet-base-v2"

In [None]:
# Instantiate the pretrained Sentence Transformers encoder selected by `model_name`.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(model_name)

# Embed a throwaway string and show how many floats the resulting vector has.
# The cell's last expression is what the notebook displays; the SQL below needs this size.
probe_vector = model.encode("hello").tolist()
len(probe_vector)

## Create the table and match function in Supabase by running the following SQL code (Note: use / update the table and match function names so they match the ones provided above)
## Note: update the vector size based on the vector length value shown above

In [None]:
-- Enable pgvector so the `vector` column type and the `<=>` operator are available.
CREATE EXTENSION IF NOT EXISTS vector;

-- Create table (IF NOT EXISTS makes re-running this script harmless).
CREATE TABLE IF NOT EXISTS books_pretrained_embeddings (
  source TEXT,              -- not populated by the ingestion cell of this notebook
  vector_data vector(768),  -- Array of 768 floats for vector data
  text TEXT,                -- the text chunk that was embedded
  text_book TEXT,           -- file name of the PDF the chunk came from
  page_number TEXT          -- page index of the chunk, stored as a string
);

-- Match function: returns up to `match_count` rows whose similarity to
-- `query_vector` is above `threshold`, most similar first.
-- `<=>` is pgvector's cosine-distance operator, so similarity = 1 - distance.
CREATE OR REPLACE FUNCTION match_books_pretrained_embeddings(
  query_vector VECTOR,
  threshold FLOAT,
  match_count INT
)
RETURNS TABLE (
  source TEXT,
  text TEXT,
  text_book TEXT,
  page_number TEXT,
  cosine_similarity_score FLOAT
)
LANGUAGE sql STABLE
AS $$
  SELECT
    emb.source,
    emb.text,
    emb.text_book,
    emb.page_number,
    1 - (emb.vector_data <=> query_vector) AS cosine_similarity_score
  FROM books_pretrained_embeddings AS emb
  WHERE 1 - (emb.vector_data <=> query_vector) > threshold
  ORDER BY cosine_similarity_score DESC
  LIMIT match_count;
$$;

## Prep for knowledge / source text / groundtruth embedding

In [None]:
# Load the pretrained Sentence Transformers model
model = SentenceTransformer(model_name)

# `nltk.sent_tokenize` / `word_tokenize` (used by the helpers in this cell) need the
# Punkt tokenizer data, which does not ship with the nltk package itself. Fetch it
# up front so a fresh kernel does not stop with a LookupError on the first PDF page.
# Newer NLTK releases look for "punkt_tab", older ones for "punkt"; both are requested
# because `nltk.download` is expected to return False (not raise) for a resource it
# cannot provide.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

# function to list files in a given directory
def list_pdf_files(pdf_path):
    """Return the names of the PDF files located directly inside ``pdf_path``.

    Parameters:
    pdf_path (str): Directory to scan (sub-directories are not searched).

    Returns:
    list: File names (not full paths) whose extension is ``.pdf`` in any letter
        case, sorted so that the processing order is reproducible.
    """
    pdf_files = [
        entry
        for entry in os.listdir(pdf_path)
        # Match the extension case-insensitively ("REPORT.PDF") and skip
        # directories that merely happen to be named like a PDF.
        if entry.lower().endswith('.pdf') and os.path.isfile(os.path.join(pdf_path, entry))
    ]
    # os.listdir returns entries in arbitrary order; sort for deterministic runs.
    return sorted(pdf_files)

# Tokenize and preprocess the sentence
def preprocess_sentence(sentence):
    """Lower-case ``sentence`` and split it into tokens with NLTK's ``word_tokenize``."""
    lowered = sentence.lower()
    return word_tokenize(lowered)

"""Chunks text into sizes of about provided characters, ending at sentence boundaries.
    Parameters:
    text (str): The text to be chunked.
    chunk_size (int): Target size for each chunk in characters.
    overlap (int): Number of characters from the end of one chunk to overlap with the beginning of the next chunk.

    Returns:
    list: A list of text chunks.
    """
def chunk_text(text, chunk_size=1200, overlap=100):
    """Split ``text`` into chunks of roughly ``chunk_size`` characters on sentence boundaries.

    Sentences (as found by ``nltk.sent_tokenize``) are accumulated until adding the
    next one would push the chunk past ``chunk_size``. The chunk is then closed and
    the next chunk starts with the last ``overlap`` characters of the closed one,
    followed by that sentence, so neighbouring chunks share some context.

    Parameters:
    text (str): The text to be chunked.
    chunk_size (int): Target size for each chunk in characters. A single sentence
        longer than this is kept whole, so a chunk can exceed the target.
    overlap (int): Number of characters from the end of one chunk that are repeated
        at the beginning of the next chunk. Values <= 0 disable the overlap.

    Returns:
    list: A list of text chunks (empty when ``text`` is empty or only whitespace).
    """
    if not text or not text.strip():
        return []

    chunks = []
    current_chunk = ""

    for sentence in nltk.sent_tokenize(text):
        if current_chunk and len(current_chunk) + len(sentence) > chunk_size:
            # The sentence does not fit: close the running chunk ...
            chunks.append(current_chunk)
            # ... and seed the next one with the tail of the chunk just closed.
            # The explicit `overlap > 0` test matters: `s[-0:]` is the whole string.
            tail = current_chunk[-overlap:] if overlap > 0 else ""
            current_chunk = f"{tail} {sentence}" if tail else sentence
        else:
            # Still within the limit (or the chunk is empty): keep accumulating.
            current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence

    # Flush whatever is left after the last sentence.
    if current_chunk:
        chunks.append(current_chunk)

    return chunks


"""
    Extracts text from each page of a PDF file and generates embeddings for the extracted text.

    Parameters:
    pdf_path (str): The path to the PDF file relative to the directory path.
    dir_path (str): The directory path where the PDF file is located.

    Yields:
    tuple: A tuple containing the text chunk, its embedding, and the page number.
    """
def extract_text_by_page(pdf_path, dir_path):
    """Extract the text of each PDF page with PDFMiner and embed it chunk by chunk.

    Parameters:
    pdf_path (str): File name of the PDF, relative to ``dir_path``.
    dir_path (str): Directory that contains the PDF, with or without a trailing
        path separator.

    Yields:
    tuple: ``(chunk, vector, page_number)`` - the text chunk, its embedding as a
        list, and the zero-based index of the page the chunk was taken from.
    """
    # os.path.join adds the separator only when it is missing, so 'books' and
    # 'books/' both resolve to the same file.
    file_path = os.path.join(dir_path, pdf_path)
    print(file_path)

    for page_number, page_layout in enumerate(extract_pages(file_path)):
        # Concatenate the text of the text-bearing layout elements; other
        # element types on the page are skipped.
        page_text = "".join(
            element.get_text()
            for element in page_layout
            if isinstance(element, LTTextContainer)
        )

        # Split the page into chunks and embed each one separately.
        for chunk in chunk_text(page_text):
            # `model` is the notebook-level SentenceTransformer instance.
            vector = model.encode(chunk).tolist()
            yield chunk, vector, page_number


## Add data to the Supabase table. Run this only once.

In [None]:
# Supabase setup
headers = {"apikey": SUPABASE_SERVICE_API_KEY, "Content-Type": "application/json"}

# The REST endpoint depends only on configuration, so build it once instead of per chunk.
endpoint = f"{SUPABASE_URL}/rest/v1/{supabase_table_name}"

pdf_files = list_pdf_files(dir_path)

# Process each PDF and send data to Supabase.
# One httpx.Client is shared by all inserts so the connection is reused instead of
# being re-established for every chunk; the `with` block closes it at the end.
with httpx.Client(headers=headers) as client:
    for pdf_path in pdf_files:
        for chunk, vector, page_number in extract_text_by_page(pdf_path, dir_path):

            # One row per text chunk; page_number is sent as a string.
            data_to_insert = {
                "page_number": str(page_number),
                "vector_data": vector,
                "text": chunk,
                "text_book": pdf_path
            }

            response = client.post(endpoint, json=data_to_insert)
            if 200 <= response.status_code < 300:
                print(f"Stored data for page {page_number} in '{pdf_path}': {response.status_code}, {response.text}")
            else:
                # Do not report a rejected insert as stored.
                print(f"FAILED to store data for page {page_number} in '{pdf_path}': {response.status_code}, {response.text}")

## Evaluation

In [None]:
import httpx
from sentence_transformers import SentenceTransformer

"""
    Queries the Supabase database using an embedding generated from the query document.
"""
def query_supabase_with_embedding(query_document, model, threshold=0.25, match_count=5):
    """Embed ``query_document`` and fetch the most similar stored chunks from Supabase.

    The Supabase URL, API key and match-function name are taken from the
    notebook-level configuration (``SUPABASE_URL``, ``SUPABASE_SERVICE_API_KEY``,
    ``match_function_name``) so they only have to be set in one place.

    Parameters:
    query_document (str): Text to search for.
    model: Object whose ``encode(text)`` result has a ``tolist()`` method
        (the notebook passes a SentenceTransformer).
    threshold (float): Value sent as the ``threshold`` argument of the match function.
    match_count (int): Value sent as the ``match_count`` argument of the match function.

    Returns:
    list: One string per matched row (``str()`` of the row returned by the RPC), or
        ``["error in RAG"]`` when the request does not return HTTP 200.
    """
    # Generate a vector for the query document
    query_vector = model.encode(query_document).tolist()

    headers = {
        "apikey": SUPABASE_SERVICE_API_KEY,
        "Content-Type": "application/json"
    }

    # Arguments of the SQL match function, keyed by parameter name.
    payload = {
        "query_vector": query_vector,
        "threshold": threshold,
        "match_count": match_count
    }

    # RPC endpoint for executing the function
    endpoint = f"{SUPABASE_URL}/rest/v1/rpc/{match_function_name}"

    response = httpx.post(endpoint, headers=headers, json=payload)
    if response.status_code == 200:
        return [str(row) for row in response.json()]

    # NOTE(review): callers receive a sentinel list rather than an exception, so a
    # failed lookup is easy to mistake for retrieved context - check for it upstream.
    print(f"Error: {response.status_code}, {response.text}")
    return ["error in RAG"]

# Initialize the SentenceTransformer model
# This rebinds the notebook-level `model` name (earlier cells assign it the same way);
# the evaluation loop passes it to query_supabase_with_embedding to embed each query.
model = SentenceTransformer(model_name)


In [None]:
import re
import json

"""
    Extracts JSON or SQL code snippets from the provided text.
"""
def fetch_json_subparts(text):
    """Extract and parse a JSON value embedded in ``text``.

    Two candidates are tried in order:
    1. the span from the first ``{`` to the last ``}`` in the text;
    2. the contents of a fenced block opened with three backticks and ``json``.

    Parameters:
    text (str): Text that may contain JSON, e.g. an LLM reply.

    Returns:
    The parsed JSON value of the first candidate that decodes successfully, or the
    string ``"error"`` when no candidate exists or none of them is valid JSON.
    """
    candidates = []

    # Greedy match: captures everything between the outermost curly braces.
    brace_match = re.search(r'({.*})', text, re.DOTALL)
    if brace_match:
        candidates.append(brace_match.group(1))

    # Non-greedy match: captures the body of the first json-tagged code fence.
    fence_match = re.search(r'```json(.*?)```', text, re.DOTALL)
    if fence_match:
        candidates.append(fence_match.group(1))

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            # Report the failure and fall through to the next candidate, if any.
            print(f"ERROR in JSON ! {e}")

    if not candidates:
        print("ERROR in JSON ! no JSON found in text")
    return "error"

In [None]:
"""
    Continues a conversation with GPT-4 using the provided prompt.
"""
def continue_conversation(prompt, gpt_model="gpt-4-0314"):
    """Send a chat history to the OpenAI chat-completion API and return the reply text.

    Parameters:
    prompt (list): Chat messages, each a dict with ``role`` and ``content`` keys.
    gpt_model (str): Identifier of the chat model to call.

    Returns:
    str: Content of the first returned choice with surrounding whitespace removed.
    """
    response = openai.ChatCompletion.create(
        model=gpt_model,
        messages=prompt,
        # Both sampling controls are set to 0 - presumably to keep the
        # fact-check verdicts as repeatable as possible.
        temperature=0,
        top_p=0,
    )
    return response['choices'][0]['message']['content'].strip()


"""
    Check the factual accuracy of a query using OpenAI's GPT-4 with context from retrieved documents.

    :param query: The query or statement to be checked.
    :param documents: A list of documents that provide context for the query.
    :param openai_api_key: Your OpenAI API key.
    :return: The model's response regarding the factual accuracy of the query.
    """
def check_fact_with_context(query, documents, openai_api_key):
    """Ask the chat model whether ``query`` is supported by the retrieved ``documents``.

    :param query: The query or statement to be checked.
    :param documents: A list of strings that provide the context (source text) for the query.
    :param openai_api_key: Your OpenAI API key; it is assigned to ``openai.api_key``.
    :return: The model's raw reply as returned by ``continue_conversation``.
    """
    openai.api_key = openai_api_key

    # Join the retrieved documents with a real newline between them.
    context = ' \n '.join(documents)

    # The prompt is assembled line by line so that no source-code indentation or
    # stray string syntax ends up in the text sent to the model.
    # NOTE: the output key "classfication" is intentionally kept with its existing
    # spelling so the shape of the model's JSON answer does not change.
    prompt = "\n".join([
        "# Your job is to work as a fact checker: carefully read the given Source Text as "
        "ground-truth information and treat the Query as the claim or text that needs to be validated.",
        "## You must not use information outside of the given Source Text. Do a step-by-step "
        "analysis in your mind and **strictly** provide the output in the following JSON format.",
        "## Carefully evaluate the information, even for minute details.",
        "## Rule: You are not allowed to use accurate or inaccurate as the classification "
        "if you don't know the reference where the source fact lies.",
        "",
        f"## Query / info to validate: '{query}'.",
        f"## Source Text: '{context}'.",
        "",
        "# Expected JSON output format:",
        "```json",
        "{",
        '    classfication : "label" -  3 options for labels "accurate", "inaccurate" and '
        '"information not found in source". This can be classified only after reason',
        '    reason : "any supported reasoning based on Source Text, else say N/A"',
        '    reference: "each document from Source Text will have text_book and page_number info, '
        'so provide that as a string. If it is label 3 (information not found) then provide N/A"',
        "}",
        "```",
    ])

    history = [
        {"role": "system", "content": "You are Fact check evaluator!"},
        {"role": "user", "content": prompt},
    ]

    return continue_conversation(history)

In [None]:

## Test the fact-checking process with examples.
## Each entry is a two-item list: [statement to check, expected label].
test_examples = [
    [
        "Deductive reasoning is a form of illogical thinking that uses unrelated observations "
        "to arrive at a specific conclusion. This type of reasoning is common in descriptive science.",
        "inaccurate",
    ],
]

In [None]:
# Run every test example through retrieval and fact checking, then print the
# parsed verdict next to the expected label.
for example_no, example in enumerate(test_examples, start=1):
    query, expected_label = example[0], example[1]

    # Retrieve supporting passages, then have the chat model judge the claim against them.
    documents = query_supabase_with_embedding(query, model)
    result = check_fact_with_context(query, documents, openai_api_key)

    print("* "*40)
    print("example No :", example_no)
    print(query)
    print(fetch_json_subparts(result))
    print("expected output : " , expected_label)