In [None]:
!git clone https://github.com/Arun-Raghav-S/Advanced_RAG.git

In [1]:
cd Advanced_RAG

/kaggle/working/Advanced_RAG


In [None]:
%pip install -r requirements.txt 

In [5]:
import os
import nest_asyncio
from dotenv import load_dotenv
load_dotenv()


nest_asyncio.apply()

In [6]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
a = user_secrets.get_secret("AZURE_OPENAI_API_KEY")
b = user_secrets.get_secret("AZURE_OPENAI_ENDPOINT")
c = user_secrets.get_secret("OPENAI_API_VERSION")
d = user_secrets.get_secret("PAT_KEY")

In [7]:
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import SimpleDirectoryReader
# Initialize an embedding model from Hugging Face using the "BAAI/bge-small-en" model.
embedding_model = AzureOpenAIEmbedding(
    api_key=a,
    model="text-embedding-3-large",
    deployment_name="text-embedding3",
     azure_endpoint=b,
    api_version=c
    
)
llm = AzureOpenAI(
    model="gpt-35-turbo-16k",
    deployment_name="GPT35-turboA",
    api_key=a,
    azure_endpoint=b,
    api_version=c,
)


In [8]:
source_docs=SimpleDirectoryReader('test_data').load_data()

# Extracting tables,text and images

In [None]:
!mkdir images

In [None]:
import tempfile
def clear_temp_files():
    temp_dir = tempfile.gettempdir()
    for file in os.listdir(temp_dir):
        file_path = os.path.join(temp_dir, file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(f"Error clearing temp files: {e}")

clear_temp_files()

In [None]:
!apt install -y poppler-utils

In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json
def process_pdfs_in_folder(folder_path, output_dir):
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            print(f"Processing {pdf_path}...")

            pdf_name = os.path.splitext(filename)[0]
            pdf_output_dir = os.path.join(output_dir, pdf_name)

            # Create subdirectories for the PDF's images and JSON
            images_dir = os.path.join(pdf_output_dir, "images")
            json_dir = os.path.join(pdf_output_dir, "json")

            if not os.path.exists(images_dir):
                os.makedirs(images_dir)
            if not os.path.exists(json_dir):
                os.makedirs(json_dir)

            raw_pdf_elements = partition_pdf(
                filename=pdf_path,
                extract_images_in_pdf=True,
                infer_table_structure=True,
                chunking_strategy="by_title",
                max_characters=4000,
                new_after_n_chars=3800,
                combine_text_under_n_chars=2000,
                extract_image_block_output_dir=images_dir,
                extract_image_block_to_payload=False,
                strategy="hi_res"
            )

            json_filename = f"{pdf_name}.json"
            json_path = os.path.join(json_dir, json_filename)

            elements_to_json(raw_pdf_elements, filename=json_path)
            print(f"Saved data to {json_path}")

In [None]:
  source_folder = "test_data"  # Path to your source_docs folder
  output_folder = "extracted_jsons"  # Path to the folder where JSON files will be saved

  process_pdfs_in_folder(source_folder, output_folder)

In [36]:
import shutil
import os

def create_zip_from_folder(folder_path, zip_filename):
    # Create a zip file from the folder
    shutil.make_archive(zip_filename, 'zip', folder_path)
    print(f"Created zip file: {zip_filename}.zip")

extracted_folder = "chroma_db"  # Path to the folder to be zipped
zip_filename = "chroma_db_archive"  # Name of the resulting zip file (without extension)

create_zip_from_folder(extracted_folder, zip_filename)


Created zip file: chroma_db_archive.zip


# Preprocessing

## Tables and Text

### Preprocessing Helpers

**Display Table and Text**

In [None]:
from io import StringIO
import pandas as pd
import json
import re
from bs4 import BeautifulSoup
def clean_html_content(html_content):
    """Extract only the well-formed HTML table parts from the provided HTML content."""
    # This regex attempts to isolate <table>...</table> blocks
    if '<table' not in html_content:
    # Find all pieces of table rows or cells
        pieces = re.findall(r'<tr.*?>.*?</tr>', html_content, flags=re.DOTALL)
        if pieces:
            # Reconstruct HTML with <table> tags properly placed
            start = html_content.find(pieces[0])
            end = html_content.rfind(pieces[-1]) + len(pieces[-1])
            html_content = (html_content[:start] +
                            '<table>' +
                            html_content[start:end] +
                            '</table>' +
                            html_content[end:])
    return ''.join(re.findall(r'<table.*?>.*?</table>', html_content, flags=re.DOTALL))

def preprocess_json_file(input_filepath):
    with open(input_filepath, 'r') as file:
        data = json.load(file)

    preprocessed_elements = []
    for entry in data:
        if entry['type'] == 'CompositeElement':
            preprocessed_elements.append({
                'type': 'text',
                'content': entry['text']
            })
        if entry['type'] == 'Table':
            html_content = entry['metadata']['text_as_html']
            html_content=str(BeautifulSoup(html_content, 'html.parser'))
            cleaned_html=clean_html_content(html_content)
            soup = BeautifulSoup(cleaned_html, 'html.parser')
            table_html = str(soup)
            df = pd.read_html(StringIO(table_html))[0]
            # Check if the DataFrame has unique columns, if not, assign unique names
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = ['_'.join(map(str, col)).strip() for col in df.columns.values]
            elif df.columns.astype(str)[0].isdigit(): 
                df.columns = [f'Column_{i+1}' for i in range(len(df.columns))]
            table_json = json.loads(df.to_json(orient='records'))
            preprocessed_elements.append({
                'type': 'table',
                'content': table_json
            })

    return preprocessed_elements
def preprocess_all_json_files(base_folder_path):
    preprocessed_data = {}
    for pdf_name in os.listdir(base_folder_path):
        pdf_folder_path = os.path.join(base_folder_path, pdf_name, 'json')
        if os.path.isdir(pdf_folder_path):
            for file in os.listdir(pdf_folder_path):
                if file.endswith('.json'):
                    input_filepath = os.path.join(pdf_folder_path, file)
                    print(f"Processing {input_filepath}...")
                    preprocessed_data[pdf_name] = preprocess_json_file(input_filepath)
    return preprocessed_data

def combine_elements(preprocessed_data):
    combined_data = {}
    for pdf_name, elements in preprocessed_data.items():
        combined_text = ""
        for element in elements:
            if element['type'] == 'text':
                combined_text += element['content'] + "\n\n"
            elif element['type'] == 'table':
                combined_text += "Table:\n" + element['content'] + "\n\n"
        combined_data[pdf_name] = combined_text
    return combined_data

In [None]:
base_folder = "extracted_jsons"  # Path to your base folder containing PDF directories
preprocessed_data = preprocess_all_json_files(base_folder)
with open('processed_tableandtext.json', 'w', encoding='utf-8') as outfile:
    json.dump(preprocessed_data, outfile, indent=4, ensure_ascii=False)
# combined_data = combine_elements(preprocessed_data)

# # Optionally, save combined data to a file for later use
# with open('combined_data.json', 'w') as outfile:
#     json.dump(combined_data, outfile, indent=4)

### Table Text summaries

In [None]:
import json
import openai

def format_table(table_data):
    # Create a string representation of the table
    headers = table_data[0].keys()
    header_line = " | ".join(headers)
    lines = [header_line, "-"*len(header_line)]
    for item in table_data:
        row = " | ".join(str(item[h]) for h in headers)
        lines.append(row)
    return "\n".join(lines)

def summarize_content(content, content_type):
    # Define the prompt based on the content type
    if content_type == "text":
        prompt = "Summarize the following text: \n\n" + content
    elif content_type == "table":
        table_string = format_table(content)
        prompt = "Summarize the key information from this table: \n\n" + table_string
    else:
        return "No summary available."

    # Call the OpenAI API to generate the summary
    response = llm.complete(
    prompt
    )
    return response.text

def generate_summaries(file_path):
    # Load the JSON data
    with open(file_path, 'r') as file:
        data = json.load(file)

    summary_data = {}
    # Iterate through each document
    for pdf_name, contents in data.items():
        print("Processing pdf :",pdf_name)
        print()
        summary_data[pdf_name] = {}
        # Summarize each content type
        for content_dict in contents:  # Adjusted to iterate over the list
            content_type = content_dict['type']
            content = content_dict['content']
            summary = summarize_content(content, content_type)
            summary_data[pdf_name][content_type + "_summary"] = summary

    return summary_data

In [None]:
# Usage
summaries = generate_summaries('processed_tableandtext.json')
print(summaries)

### Summarising with  langchain

In [None]:
from langchain_openai import AzureChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
model = AzureChatOpenAI(
        deployment_name="GPT35-turboA",
        api_version="2024-02-01",
        temperature=0
      )

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

prompt_text = """
  You are responsible for concisely summarizing table or text chunk:
  Keep the summary short and crisp and extract key features

  {element}
"""
prompt = ChatPromptTemplate.from_template(prompt_text)
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [None]:
import time
def process_pdf_contents(pdf_name, contents, summarize_chain):
    summary_results = {}
    print("Processing pdf:", pdf_name)

    try:
        tables = [c['content'] for c in contents if c['type'] == 'table']
        texts = [c['content'] for c in contents if c['type'] == 'text']

        # Process tables
        if tables:
            table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
            summary_results['table_summaries'] = table_summaries
            print("Table Summaries:")
            print(table_summaries)

        # Process texts
        if texts:
            text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
            summary_results['text_summaries'] = text_summaries
            print('Text Summaries:')
            print(text_summaries)

    except Exception as e:
        print(f"Error processing {pdf_name}: {e}")
        time.sleep(10)  # Sleep to respect rate limits or handle transient issues

    return summary_results

def load_and_summarize(file_path):
    # Load existing data if available
    if os.path.exists('intermediate_summary_results.json'):
        with open('intermediate_summary_results.json', 'r') as infile:
            all_results = json.load(infile)
    else:
        all_results = {}

    with open(file_path, 'r') as file:
        data = json.load(file)

    for pdf_name, contents in data.items():
        if pdf_name not in all_results:
            result = process_pdf_contents(pdf_name, contents, summarize_chain)
            all_results[pdf_name] = result
            # Save intermediate results to avoid losing progress
            with open('intermediate_summary_results.json', 'w') as outfile:
                json.dump(all_results, outfile, indent=4)

    return all_results

In [None]:
res=load_and_summarize('processed_tableandtext.json')
print(res)

In [None]:
with open('summarized_data.json', 'w') as file:
        json.dump(res, file, indent=4)

## Processing Images

### Preprocessing Helpers

**Verifying images**

In [None]:
from PIL import Image
import os
def verify_images(directory):
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        try:
            with Image.open(path) as img:
                print(f"{filename} is valid.")
        except IOError:
            print(f"Error opening {filename}; it may be corrupted or in an incorrect format.")

# Verify images before processing
verify_images('images')

**Display images**

In [None]:
import os
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd

# Directory containing the extracted images
images_path = "images"

# Function to display images
def plot_images(images_folder):
    # Check if the directory exists
    if not os.path.exists(images_folder):
        print(f"The directory {images_folder} does not exist.")
        return

    # List all image files
    image_files = [f for f in os.listdir(images_folder) if f.endswith(('.png', '.jpg', '.jpeg', '.ppm'))]
    total_files = len(image_files)

    if total_files == 0:
        print("No images found in the directory.")
        return

    print(f"Found {total_files} images.")
    
    # Plot each image
    fig, axs = plt.subplots(1, total_files, figsize=(15, 5))
    for ax, image_file in zip(axs, image_files):
        image_path = os.path.join(images_folder, image_file)
        image = Image.open(image_path)
        ax.imshow(image)
        ax.axis('off')
        ax.set_title(image_file)
    plt.show()


**Clear files**

In [None]:
def clear_file_if_not_empty(file_path):
    try:
        # Check if the file exists
        with open(file_path, 'r+') as file:
            contents = file.read()
            # Check if the file is not empty
            if contents:
                # Move the cursor to the beginning of the file
                file.seek(0)
                # Clear the file
                file.truncate()
                print("File was not empty and has been cleared.")
            else:
                print("File is already empty.")
    except FileNotFoundError:
        print(f"No file found at {file_path}.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Specify the path to your file
file_path = 'captions_output.txt'

# Call the function to check and clear the file
clear_file_if_not_empty(file_path)


### Processing with MILVLG

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
torch.set_default_device("cuda")

In [None]:

torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(
    "MILVLG/imp-v1-3b",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("MILVLG/imp-v1-3b", trust_remote_code=True)

In [None]:
def process_and_caption_image(image_path):
    try:
        # Ensure image is loaded correctly
        image = Image.open(image_path).convert('RGB')
        text = (
            "Please analyze the image thoroughly. The image may contain various forms of data representation such as charts, graphs, tables, etc. "
            "For charts and graphs, identify and describe the type, axes, labels, data points, trends, and any significant peaks, troughs, or patterns. "
            "Highlight any anomalies or outliers and discuss their possible implications. Extract and report all key numerical values. "
            "For other types of images, describe all visible elements and their relationships in detail. Provide a clear and precise summary of the key features,"
            "interpret the data where applicable, and make note of any notable observations or ambiguities." 
            "It is crucial to extract all key numerical values if present in the image."

        )
        input_ids = tokenizer(text, return_tensors='pt').input_ids
        image_tensor = model.image_preprocess(image)

        

        # Generate the answer
        with torch.no_grad():
            output_ids = model.generate(
                input_ids,
                max_new_tokens=100,
                images=image_tensor,
                use_cache=True
            )[0]
        
        return [tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()]
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return []

In [None]:
import os
import json
def process_images_in_folder(base_folder_path):
    structured_data = {}

    for pdf_name in os.listdir(base_folder_path):
        pdf_folder_path = os.path.join(base_folder_path, pdf_name, 'images')
        if os.path.isdir(pdf_folder_path):
            image_files = [f for f in os.listdir(pdf_folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            image_captions = {}
            print(f"Processing pdf : {pdf_name} \n")
            for filename in image_files:
                image_path = os.path.join(pdf_folder_path, filename)
                
                captions = process_and_caption_image(image_path)
                image_captions[filename] = captions

            structured_data[pdf_name] = image_captions

    return structured_data

base_folder = "extracted_jsons"  # Path to your base folder containing PDF directories
question = "Describe the image in detail and extract key features from it"
structured_image_data = process_images_in_folder(base_folder)


In [None]:
# Optionally, save structured image data to a JSON file for later use
with open('structured_image_data.json', 'w') as outfile:
    json.dump(structured_image_data, outfile, indent=4)

print(f"Image captions have been saved to structured_image_data.json")


In [None]:
structured_image_data

# Retriever Langchain

In [None]:
import uuid
import json

from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma

embeddings_model = AzureOpenAIEmbeddings(
        azure_deployment="text-embedding3",
        api_version="2024-02-01"
    )
# Load the JSON file containing summaries
def load_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Initialize the retriever
id_key = "doc_id"
retriever = MultiVectorRetriever(
    vectorstore=Chroma(collection_name="summaries", embedding_function=embeddings_model),
    docstore=InMemoryStore(),
    id_key=id_key,
)



In [None]:
retriever.vectorstore.clear()
retriever.docstore.clear()

In [None]:

def store_summaries(original_data, summary_data):
    for pdf_name, contents in original_data.items():
        print(f"Processing PDF: {pdf_name}")
        
        # Extract original text and table contents
        text_contents = [c['content'] for c in contents if c['type'] == 'text']
        table_contents = [c['content'] for c in contents if c['type'] == 'table']
#         print(text_contents[1])
          

        # Store and print text summaries along with their corresponding original texts
        if 'text_summaries' in summary_data[pdf_name]:
            text_ids = [str(uuid.uuid4()) for _ in summary_data[pdf_name]['text_summaries']]
            print("text Ids:",text_ids)
            for i, summary in enumerate(summary_data[pdf_name]['text_summaries']):
                
                summary_texts = Document(page_content=summary, metadata={id_key: text_ids[i]})
                retriever.vectorstore.add_documents([summary_texts])
                retriever.docstore.mset([(text_ids[i], text_contents[i])])
        
        # Store and print table summaries along with their corresponding original tables
        if 'table_summaries' in summary_data[pdf_name]:
            table_ids = [str(uuid.uuid4()) for _ in summary_data[pdf_name]['table_summaries']]
            for i, summary in enumerate(summary_data[pdf_name]['table_summaries']):                
                summary_tables = Document(page_content=summary, metadata={id_key: table_ids[i]})
                retriever.vectorstore.add_documents([summary_tables])
                retriever.docstore.mset([(table_ids[i], table_contents[i])])

# Proceed with data loading and summarizing
original_data = load_data('processed_tableandtext.json')
summary_data = load_data('summarized_data.json')
store_summaries(original_data, summary_data)


In [None]:
import json
import os

import base64

def image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def load_and_process_images(pdf_name, image_data, base_dir):
    """Load image files, convert them to base64, and collect summaries."""
    image_paths = [os.path.join(base_dir, pdf_name, 'images', image_name) for image_name in image_data.keys()]
    image_summaries = [summary[0] for summary in image_data.values()]
    image_contents = [image_to_base64(path) for path in image_paths]

    # Print each image path, a snippet of its base64 content, and its summary for verification
    for path, content, summary in zip(image_paths, image_contents, image_summaries):
        print(f"Image Path: {path}")
        print(f"Base64 Snippet: {content[:60]}...")  # Print only the first 60 characters of the base64 string
        print(f"Summary: {summary}")
        print("\n-------------------\n")

    return image_paths, image_summaries, image_contents

def store_image_data(image_paths, image_summaries, image_contents):
    doc_ids = [str(uuid.uuid4()) for _ in image_paths]
    summary_images = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for doc_id, summary in zip(doc_ids, image_summaries)
    ]
    # Add summaries to the vector store
    retriever.vectorstore.add_documents(summary_images)
    # Store original images in the document store
    retriever.docstore.mset(list(zip(doc_ids, image_contents)))

# Example usage
base_dir = 'extracted_jsons'
structured_image_file = 'structured_image_data.json'
image_data_full = json.load(open(structured_image_file))

# Process and store each PDF's images
for pdf_name, images in image_data_full.items():
    image_paths, image_summaries, image_contents = load_and_process_images(pdf_name, images, base_dir)
    


In [None]:
import base64
from PIL import Image
from IPython.display import HTML, display
import io
from langchain.schema import Document, HumanMessage

def plt_img_base64(img_base64):
    """Display an image from a base64 encoded string."""
    display(HTML(f'<img src="data:image/jpeg;base64,{img_base64}" />'))

def is_image_data(b64data):
    """Check if the base64 data is an image by looking at the start of the data."""
    image_signatures = {
        b"\xFF\xD8\xFF": "jpg",
        b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A": "png",
        b"\x47\x49\x46\x38": "gif",
        b"\x52\x49\x46\x46": "webp",
    }
    try:
        header = base64.b64decode(b64data)[:8]  # Decode and get the first 8 bytes
        for sig, format in image_signatures.items():
            if header.startswith(sig):
                return True
        return False
    except Exception:
        return False

def split_image_text_types(docs):
    """Split base64-encoded images and texts."""
    b64_images = []
    texts = []
    for doc in docs:
        # Check if the document is of type Document and extract page_content if so
        if isinstance(doc, Document):
            doc = doc.page_content

        if is_image_data(doc):
            b64_images.append(doc)
        else:
            texts.append(doc)
    return {"images": b64_images, "texts": texts}

def img_prompt_func(data_dict):
    """Generate a text prompt incorporating image descriptions or placeholders."""
    messages = []

    # Process images: Since GPT-3 cannot directly handle images, use descriptions or placeholders
    if "images" in data_dict["context"]:
        for image in data_dict["context"]["images"]:
            description = f"[Image: A detailed description or a URL pointing to the image data.]"  # Placeholder text
            messages.append(description)

    # Process texts: Handle complex structures
    if "texts" in data_dict["context"]:
        formatted_texts = []
        for text_block in data_dict["context"]["texts"]:
            if isinstance(text_block, list):
                for text in text_block:
                    # Check if the item is a dictionary and format it
                    if isinstance(text, dict):
                        text_description = ' | '.join(f"{key}: {value}" for key, value in text.items() if value is not None)
                        formatted_texts.append(text_description)
                    elif isinstance(text, str):
                        # Directly append the string if it's not a dictionary
                        formatted_texts.append(text)
            elif isinstance(text_block, str):
                # Handle the case where text_block is directly a string
                formatted_texts.append(text_block)

        formatted_text_string = "\n".join(formatted_texts)
        messages.append(formatted_text_string)

    # Combine all messages into a single string with the question
    prompt = (
        f"You are a research analyst. You should provide precise answers to each question\n"
        f"Question: {data_dict['question']}\n\n"
        "Details:\n"
        + "\n".join(messages)
    )
    return prompt

In [None]:
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

# RAG pipeline
chain = (
    {
        "context": retriever | RunnableLambda(split_image_text_types),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(img_prompt_func)
    | model
    | StrOutputParser()
)

In [None]:
query='Who are the authors of "GNN_DEEPRL"?'

In [None]:
chain.invoke(query)

In [None]:
docs = retriever.get_relevant_documents(query)
len(docs)

In [None]:
docs

# Retriever llama index

In [13]:
import json
from tqdm import tqdm  # 
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.vector_stores.chroma import ChromaVectorStore

# Load the JSON data from files
with open('processed_tableandtext.json', 'r') as file:
    table_and_text_data = json.load(file)

with open('summarized_data.json', 'r') as file:
    summarized_data = json.load(file)

with open('structured_image_data.json', 'r') as file:
    structured_image_data = json.load(file)

In [14]:
def normalize_title(title):
    return title.lower().strip().replace('.pdf', '')

# Normalize the keys in the data dictionaries
normalized_table_and_text_data = {normalize_title(key): value for key, value in table_and_text_data.items()}
normalized_summarized_data = {normalize_title(key): value for key, value in summarized_data.items()}
normalized_structured_image_data = {normalize_title(key): value for key, value in structured_image_data.items()}

In [23]:
source_docs = SimpleDirectoryReader('test_data').load_data()

# Create a node parser with desired settings
baseline_parser = SimpleNodeParser.from_defaults(
    chunk_overlap=200,
    chunk_size=1024
)

# Extract nodes from the documents with progress monitoring
print("Parsing documents into nodes...")
baseline_nodes = []
for doc in tqdm(source_docs, desc="Parsing documents"):
    nodes = baseline_parser.get_nodes_from_documents([doc])
    for node in nodes:
        # Initialize extra_info if it doesn't exist
        if not hasattr(node, 'extra_info'):
            node.extra_info = {'document_title': doc.metadata.get('title', 'Unknown')}
    baseline_nodes.extend(nodes)
print("Parsing complete.")


  m = regex.search(name + tok)


Parsing documents into nodes...


Parsing documents: 100%|██████████| 179/179 [00:00<00:00, 229.00it/s]

Parsing complete.





In [25]:
from tqdm import tqdm  # Import tqdm for progress monitoring

def normalize_title(title):
    return title.lower().strip().replace('.pdf', '')

def extend_nodes_with_summaries(nodes, normalized_table_and_text_data, normalized_summarized_data, normalized_structured_image_data):
    extended_nodes = []
    for node in tqdm(nodes, desc="Extending nodes"):
        # Normalize title for comparison
        title = normalize_title(node.metadata['file_name'])  # Assuming the title is in metadata
        print(f"Processing node for title: {title}")

        # Handling table content
        if title in normalized_table_and_text_data:
            table_content = [item['content'] for item in normalized_table_and_text_data[title] if item['type'] == 'table']
            node.metadata['table_content'] = table_content
            print(f"Added table content for title: {title}")
        else:
            print(f"Title {title} not found in normalized_table_and_text_data")

        # Handling summarized data for tables and texts
        if title in normalized_summarized_data:
            table_summaries = normalized_summarized_data[title].get('table_summaries', [])
            text_summaries = normalized_summarized_data[title].get('text_summaries', [])
            node.metadata['table_summaries'] = table_summaries
            node.metadata['text_summaries'] = text_summaries
            if table_summaries:
                print(f"Added table summaries for title: {title}")
            if text_summaries:
                print(f"Added text summaries for title: {title}")
        else:
            print(f"Title {title} not found in normalized_summarized_data")

        # Handling summarized data for images
        if title in normalized_structured_image_data:
            image_summaries = normalized_structured_image_data[title]
            node.metadata['image_summaries'] = image_summaries
            print(f"Added image summaries for title: {title}")
        else:
            print(f"Title {title} not found in normalized_structured_image_data")

        extended_nodes.append(node)
    
    return extended_nodes


In [26]:
extended_baseline_nodes = extend_nodes_with_summaries(baseline_nodes, normalized_table_and_text_data, normalized_summarized_data, normalized_structured_image_data)

Extending nodes: 100%|██████████| 299/299 [00:00<00:00, 21105.28it/s]

Processing node for title: deepfake
Added table content for title: deepfake
Added table summaries for title: deepfake
Added text summaries for title: deepfake
Added image summaries for title: deepfake
Processing node for title: deepfake
Added table content for title: deepfake
Added table summaries for title: deepfake
Added text summaries for title: deepfake
Added image summaries for title: deepfake
Processing node for title: deepfake
Added table content for title: deepfake
Added table summaries for title: deepfake
Added text summaries for title: deepfake
Added image summaries for title: deepfake
Processing node for title: deepfake
Added table content for title: deepfake
Added table summaries for title: deepfake
Added text summaries for title: deepfake
Added image summaries for title: deepfake
Processing node for title: deepfake
Added table content for title: deepfake
Added table summaries for title: deepfake
Added text summaries for title: deepfake
Added image summaries for title: deep




In [None]:
for node in extended_baseline_nodes[:3]:  # Check the first 3 nodes for demonstration
    print(f"Title: {node.metadata['file_name']}")
    print(f"Table Content: {node.metadata['table_summaries']}")
    print(f"Table Summaries: {node.metadata['table_summaries']}")
    print(f"Text Summaries: {node.metadata['text_summaries']}")
    print(f"Image Summaries: {node.metadata['image_summaries']}")
    print("\n")

In [38]:
from llama_index.core import Settings
Settings.llm=llm
Settings.embed_model=embedding_model

In [34]:
import time
import chromadb
from llama_index.core import StorageContext

# Retry mechanism for handling rate limits


def create_and_persist_index():
    # Create the Chroma client and collection using a persistent client
    print("Initializing Chroma persistent client and collection...")
    db = chromadb.PersistentClient(path="./chroma_db")  # Define the storage path
    chroma_collection = db.get_or_create_collection("baseline_indexNew")
    
    print(f"Collection '{chroma_collection.name}' ready for use.")

    # Create the Chroma vector store
    print("Initializing ChromaVectorStore...")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    print("ChromaVectorStore initialized with collection_name 'baseline_indexNew'")

    # Create the index
    print("Creating VectorStoreIndex...")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    baseline_index = VectorStoreIndex.from_documents(
        documents=source_docs,
        storage_context=storage_context,
        embed_model=embedding_model
    )
    print("VectorStoreIndex created")

    # Persist the index
    print("Persisting VectorStoreIndex...")
    baseline_index.storage_context.persist()
    print("VectorStoreIndex persisted successfully.")
    return baseline_index

baseline_index = retry_with_backoff(create_and_persist_index)
print("Index creation and persistence complete.")


Creating and persisting index with retries...
Initializing Chroma persistent client and collection...
Collection 'baseline_indexNew' ready for use.
Initializing ChromaVectorStore...
ChromaVectorStore initialized with collection_name 'baseline_indexNew'
Creating VectorStoreIndex...
VectorStoreIndex created
Persisting VectorStoreIndex...
VectorStoreIndex persisted successfully.
Index creation and persistence complete.


In [37]:
print("Converting index to query engine...")
baseline_query_engine = baseline_index.as_query_engine(similarity_top_k=3)
print("Query engine ready.")

# Function to retrieve data based on a query
def retrieve_data(query):
    results = baseline_query_engine.query(query)
    return results

# Example query


Converting index to query engine...
Query engine ready.


In [None]:
query = 'what is multimodal condition in talking face generation?'
retrieved_data = retrieve_data(query)
print(retrieved_data)

In [None]:
def format_output(results):
    for result in results:
        node = result.node
        print("Document ID:", node.id_)
        print("File Name:", node.metadata['file_name'])
        print("File Type:", node.metadata['file_type'])
        print("Creation Date:", node.metadata['creation_date'])
        print("Page Label:", node.metadata['page_label'])
        print("File Path:", node.metadata['file_path'])
        print()
        print("Text Extract:")
        print(node.text[:300])  # Displaying first 300 characters for brevity
        print("-" * 80)

In [None]:
x=baseline_query_engine.retrieve(query)

In [None]:
x[0]

# Custom Retriever

In [25]:
import time
from tqdm import tqdm
import chromadb
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import Node, NodeRelationship, RelatedNodeInfo
from llama_index.vector_stores.chroma import ChromaVectorStore
import time
import chromadb
import multiprocessing
from tqdm import tqdm
import logging
from functools import partial
def normalize_title(title):
    return title.lower().strip().replace('.pdf', '')

def process_document(doc, normalized_table_and_text_data, normalized_summarized_data, normalized_structured_image_data):
    all_nodes = []
    main_node = Node(
        text=doc.text,
        metadata={
            "file_name": doc.metadata.get("file_name"),
            "doc_id": doc.id_,
            "is_document": True
        }
    )
    all_nodes.append(main_node)

    title = normalize_title(doc.metadata.get("file_name", ""))
    print("PROCESSING FILE:",title)

    # Create nodes for table content
    if title in normalized_table_and_text_data:
        for item in normalized_table_and_text_data[title]:
            if item['type'] == 'table':
                table_node = Node(
                    text=f"Table Content: {item['content']}",
                    metadata={
                        "doc_id": doc.id_,
                        "type": "table_content"
                    }
                )
                all_nodes.append(table_node)
                main_node.relationships[NodeRelationship.CHILD] = main_node.relationships.get(NodeRelationship.CHILD, []) + [
                    RelatedNodeInfo(node_id=table_node.id_)
                ]

    # Create nodes for table summaries
    if title in normalized_summarized_data:
        for summary in normalized_summarized_data[title].get('table_summaries', []):
            table_summary_node = Node(
                text=f"Table Summary: {summary}",
                metadata={
                    "doc_id": doc.id_,
                    "type": "table_summary"
                }
            )
            all_nodes.append(table_summary_node)
            main_node.relationships[NodeRelationship.CHILD] = main_node.relationships.get(NodeRelationship.CHILD, []) + [
                RelatedNodeInfo(node_id=table_summary_node.id_)
            ]

    # Create nodes for text summaries
    if title in normalized_summarized_data:
        for summary in normalized_summarized_data[title].get('text_summaries', []):
            text_summary_node = Node(
                text=f"Text Summary: {summary}",
                metadata={
                    "doc_id": doc.id_,
                    "type": "text_summary"
                }
            )
            all_nodes.append(text_summary_node)
            main_node.relationships[NodeRelationship.CHILD] = main_node.relationships.get(NodeRelationship.CHILD, []) + [
                RelatedNodeInfo(node_id=text_summary_node.id_)
            ]

    # Create nodes for image summaries
    if title in normalized_structured_image_data:
        for summary in normalized_structured_image_data[title]:
            image_summary_node = Node(
                text=f"Image Summary: {summary}",
                metadata={
                    "doc_id": doc.id_,
                    "type": "image_summary"
                }
            )
            all_nodes.append(image_summary_node)
            main_node.relationships[NodeRelationship.CHILD] = main_node.relationships.get(NodeRelationship.CHILD, []) + [
                RelatedNodeInfo(node_id=image_summary_node.id_)
            ]

    return all_nodes

In [26]:
def create_and_persist_index_for_document(doc, vector_store, retry_delay=60, max_retries=5):
    for attempt in range(max_retries):
        try:
            nodes = process_document(doc, normalized_table_and_text_data, normalized_summarized_data, normalized_structured_image_data)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            index = VectorStoreIndex(nodes, storage_context=storage_context)
            return index
        except Exception as e:
            if "rate limit" in str(e).lower():
                if attempt < max_retries - 1:
                    logging.warning(f"Rate limit error. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
                    time.sleep(retry_delay)
                else:
                    logging.error(f"Max retries exceeded for document: {doc.metadata.get('file_name')}")
                    raise
            else:
                logging.error(f"Unexpected error: {e}")
                raise

In [34]:
def add_document_to_index(doc, index, retry_delay=60, max_retries=5):
    for attempt in range(max_retries):
        try:
            nodes = process_document(doc, normalized_table_and_text_data, normalized_summarized_data, normalized_structured_image_data)
            index.insert_nodes(nodes)
            return True
        except Exception as e:
            if "rate limit" in str(e).lower():
                if attempt < max_retries - 1:
                    logging.warning(f"Rate limit error. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
                    time.sleep(retry_delay)
                else:
                    logging.error(f"Max retries exceeded for document: {doc.metadata.get('file_name')}")
                    return False
            else:
                logging.error(f"Unexpected error: {e}")
                return False

def create_full_index(source_docs):
    logging.info("Creating Chroma client and collection")
    chroma_client = chromadb.EphemeralClient()
    chroma_collection = chroma_client.get_or_create_collection("advanced_rag_index")
    
    logging.info("Initializing vector store and storage context")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    logging.info("Creating empty index")
    index = VectorStoreIndex([], storage_context=storage_context.property_graph_store,embed_model=embedding_model)
    
    for doc in tqdm(source_docs, desc="Processing documents"):
        success = add_document_to_index(doc, index)
        if success:
            logging.info(f"Successfully processed document: {doc.metadata.get('file_name')}")
        else:
            logging.error(f"Failed to process document: {doc.metadata.get('file_name')}")
        time.sleep(5)  # Add a delay between documents to avoid rate limiting

    logging.info("Persisting final index")
    index.storage_context.persist()
    logging.info("Full index creation and persistence complete")

    return index


In [35]:
index = create_full_index(source_docs)
if index:
    print("Full index created successfully")
else:
    print("Failed to create full index")

Processing documents:   0%|          | 0/179 [00:00<?, ?it/s]

PROCESSING FILE: deepfake


Processing documents:   1%|          | 1/179 [00:17<52:54, 17.83s/it]

PROCESSING FILE: deepfake


Processing documents:   1%|          | 2/179 [00:36<53:32, 18.15s/it]

PROCESSING FILE: deepfake


Processing documents:   2%|▏         | 3/179 [01:20<1:27:38, 29.88s/it]

PROCESSING FILE: deepfake


Processing documents:   2%|▏         | 4/179 [01:38<1:14:09, 25.43s/it]

PROCESSING FILE: deepfake


Processing documents:   3%|▎         | 5/179 [01:57<1:06:33, 22.95s/it]

PROCESSING FILE: deepfake


Processing documents:   3%|▎         | 6/179 [02:15<1:01:33, 21.35s/it]

PROCESSING FILE: deepfake


Processing documents:   4%|▍         | 7/179 [02:33<58:25, 20.38s/it]  

PROCESSING FILE: deepfake


Processing documents:   4%|▍         | 8/179 [02:51<56:01, 19.66s/it]

PROCESSING FILE: deepfake


Processing documents:   5%|▌         | 9/179 [03:10<54:30, 19.24s/it]

PROCESSING FILE: deepfake


Processing documents:   6%|▌         | 10/179 [03:28<53:19, 18.93s/it]

PROCESSING FILE: deepfake


Processing documents:   6%|▌         | 11/179 [03:46<52:25, 18.72s/it]

PROCESSING FILE: deepfake


Processing documents:   7%|▋         | 12/179 [04:05<51:56, 18.66s/it]

PROCESSING FILE: deepfake


Processing documents:   7%|▋         | 13/179 [04:23<51:20, 18.56s/it]

PROCESSING FILE: deepfake


Processing documents:   8%|▊         | 14/179 [04:41<50:46, 18.47s/it]

PROCESSING FILE: deepfake


Processing documents:   8%|▊         | 15/179 [04:59<50:12, 18.37s/it]

PROCESSING FILE: deepfake


Processing documents:   9%|▉         | 16/179 [05:18<49:40, 18.29s/it]

PROCESSING FILE: deepfake


Processing documents:   9%|▉         | 17/179 [05:36<49:29, 18.33s/it]

PROCESSING FILE: deepfake


Processing documents:  10%|█         | 18/179 [05:54<49:18, 18.38s/it]

PROCESSING FILE: deepfake


Processing documents:  11%|█         | 19/179 [06:13<48:47, 18.30s/it]

PROCESSING FILE: deepfake


Processing documents:  11%|█         | 20/179 [06:31<48:44, 18.40s/it]

PROCESSING FILE: deepfake


Processing documents:  12%|█▏        | 21/179 [06:50<48:25, 18.39s/it]

PROCESSING FILE: deepfake


Processing documents:  12%|█▏        | 22/179 [07:08<48:13, 18.43s/it]

PROCESSING FILE: deepfake


Processing documents:  13%|█▎        | 23/179 [07:27<47:52, 18.41s/it]

PROCESSING FILE: deepfake


Processing documents:  13%|█▎        | 24/179 [07:45<47:32, 18.40s/it]

PROCESSING FILE: deepfake


Processing documents:  14%|█▍        | 25/179 [08:03<47:19, 18.44s/it]

PROCESSING FILE: deepfake


Processing documents:  15%|█▍        | 26/179 [08:22<47:01, 18.44s/it]

PROCESSING FILE: deepfake


Processing documents:  15%|█▌        | 27/179 [08:40<46:38, 18.41s/it]

PROCESSING FILE: deepfake


Processing documents:  16%|█▌        | 28/179 [08:58<46:13, 18.36s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  16%|█▌        | 29/179 [09:08<39:02, 15.62s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  17%|█▋        | 30/179 [09:17<33:51, 13.64s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  17%|█▋        | 31/179 [09:26<30:21, 12.31s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  18%|█▊        | 32/179 [09:35<27:51, 11.37s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  18%|█▊        | 33/179 [09:44<26:05, 10.72s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  19%|█▉        | 34/179 [09:53<24:46, 10.25s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  20%|█▉        | 35/179 [10:02<23:45,  9.90s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  20%|██        | 36/179 [10:12<23:03,  9.68s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  21%|██        | 37/179 [10:21<22:25,  9.48s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  21%|██        | 38/179 [10:30<22:04,  9.40s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  22%|██▏       | 39/179 [10:39<21:50,  9.36s/it]

PROCESSING FILE: fmri_vision_reconstruction


Processing documents:  22%|██▏       | 40/179 [10:48<21:32,  9.30s/it]

PROCESSING FILE: llama


Processing documents:  23%|██▎       | 41/179 [11:03<25:06, 10.92s/it]

PROCESSING FILE: llama


Processing documents:  23%|██▎       | 42/179 [11:18<27:41, 12.13s/it]

PROCESSING FILE: llama


Processing documents:  24%|██▍       | 43/179 [11:33<29:42, 13.11s/it]

PROCESSING FILE: llama


Processing documents:  25%|██▍       | 44/179 [11:49<30:55, 13.74s/it]

PROCESSING FILE: llama


Processing documents:  25%|██▌       | 45/179 [12:04<31:45, 14.22s/it]

PROCESSING FILE: llama


Processing documents:  26%|██▌       | 46/179 [12:19<32:07, 14.49s/it]

PROCESSING FILE: llama


Processing documents:  26%|██▋       | 47/179 [12:34<32:18, 14.69s/it]

PROCESSING FILE: llama


Processing documents:  27%|██▋       | 48/179 [12:49<32:22, 14.83s/it]

PROCESSING FILE: llama


Processing documents:  27%|██▋       | 49/179 [13:04<32:15, 14.89s/it]

PROCESSING FILE: llama


Processing documents:  28%|██▊       | 50/179 [13:19<32:05, 14.93s/it]

PROCESSING FILE: llama


Processing documents:  28%|██▊       | 51/179 [13:35<32:04, 15.04s/it]

PROCESSING FILE: llama


Processing documents:  29%|██▉       | 52/179 [13:50<31:47, 15.02s/it]

PROCESSING FILE: llama


Processing documents:  30%|██▉       | 53/179 [14:05<31:35, 15.04s/it]

PROCESSING FILE: llama


Processing documents:  30%|███       | 54/179 [14:20<31:25, 15.09s/it]

PROCESSING FILE: llama


Processing documents:  31%|███       | 55/179 [14:35<31:13, 15.11s/it]

PROCESSING FILE: llama


Processing documents:  31%|███▏      | 56/179 [14:50<31:05, 15.16s/it]

PROCESSING FILE: llama


Processing documents:  32%|███▏      | 57/179 [15:05<30:45, 15.12s/it]

PROCESSING FILE: llama


Processing documents:  32%|███▏      | 58/179 [15:20<30:26, 15.10s/it]

PROCESSING FILE: llama


Processing documents:  33%|███▎      | 59/179 [15:36<30:21, 15.18s/it]

PROCESSING FILE: llama


Processing documents:  34%|███▎      | 60/179 [15:51<30:07, 15.19s/it]

PROCESSING FILE: llama


Processing documents:  34%|███▍      | 61/179 [16:06<29:47, 15.15s/it]

PROCESSING FILE: llama


Processing documents:  35%|███▍      | 62/179 [16:21<29:27, 15.10s/it]

PROCESSING FILE: llama


Processing documents:  35%|███▌      | 63/179 [16:36<29:10, 15.09s/it]

PROCESSING FILE: llama


Processing documents:  36%|███▌      | 64/179 [16:51<28:54, 15.08s/it]

PROCESSING FILE: llama


Processing documents:  36%|███▋      | 65/179 [17:06<28:34, 15.04s/it]

PROCESSING FILE: llama


Processing documents:  37%|███▋      | 66/179 [17:21<28:25, 15.09s/it]

PROCESSING FILE: llama


Processing documents:  37%|███▋      | 67/179 [17:37<28:13, 15.12s/it]

PROCESSING FILE: qwen


Processing documents:  38%|███▊      | 68/179 [18:00<32:19, 17.47s/it]

PROCESSING FILE: qwen


Processing documents:  39%|███▊      | 69/179 [18:22<35:02, 19.11s/it]

PROCESSING FILE: qwen


Processing documents:  39%|███▉      | 70/179 [18:46<36:58, 20.36s/it]

PROCESSING FILE: qwen


Processing documents:  40%|███▉      | 71/179 [19:09<37:59, 21.11s/it]

PROCESSING FILE: qwen


Processing documents:  40%|████      | 72/179 [19:31<38:34, 21.63s/it]

PROCESSING FILE: qwen


Processing documents:  41%|████      | 73/179 [19:58<41:00, 23.22s/it]

PROCESSING FILE: qwen


Processing documents:  41%|████▏     | 74/179 [20:23<41:14, 23.56s/it]

PROCESSING FILE: qwen


Processing documents:  42%|████▏     | 75/179 [20:46<40:38, 23.45s/it]

PROCESSING FILE: qwen


Processing documents:  42%|████▏     | 76/179 [21:09<40:13, 23.43s/it]

PROCESSING FILE: qwen


Processing documents:  43%|████▎     | 77/179 [21:33<39:46, 23.40s/it]

PROCESSING FILE: qwen


Processing documents:  44%|████▎     | 78/179 [21:56<39:21, 23.38s/it]

PROCESSING FILE: qwen


Processing documents:  44%|████▍     | 79/179 [22:19<38:51, 23.32s/it]

PROCESSING FILE: qwen


Processing documents:  45%|████▍     | 80/179 [22:42<38:19, 23.22s/it]

PROCESSING FILE: qwen


Processing documents:  45%|████▌     | 81/179 [23:05<37:59, 23.26s/it]

PROCESSING FILE: qwen


Processing documents:  46%|████▌     | 82/179 [23:28<37:25, 23.15s/it]

PROCESSING FILE: qwen


Processing documents:  46%|████▋     | 83/179 [23:51<36:56, 23.08s/it]

PROCESSING FILE: qwen


Processing documents:  47%|████▋     | 84/179 [24:14<36:34, 23.10s/it]

PROCESSING FILE: qwen


Processing documents:  47%|████▋     | 85/179 [24:37<36:07, 23.06s/it]

PROCESSING FILE: qwen


Processing documents:  48%|████▊     | 86/179 [25:01<35:50, 23.13s/it]

PROCESSING FILE: qwen


Processing documents:  49%|████▊     | 87/179 [25:24<35:24, 23.09s/it]

PROCESSING FILE: qwen


Processing documents:  49%|████▉     | 88/179 [25:47<34:55, 23.03s/it]

PROCESSING FILE: qwen


Processing documents:  50%|████▉     | 89/179 [26:10<34:38, 23.10s/it]

PROCESSING FILE: qwen


Processing documents:  50%|█████     | 90/179 [26:33<34:21, 23.17s/it]

PROCESSING FILE: qwen


Processing documents:  51%|█████     | 91/179 [26:56<33:58, 23.17s/it]

PROCESSING FILE: qwen


Processing documents:  51%|█████▏    | 92/179 [27:20<33:42, 23.24s/it]

PROCESSING FILE: qwen


Processing documents:  52%|█████▏    | 93/179 [27:43<33:18, 23.23s/it]

PROCESSING FILE: qwen


Processing documents:  53%|█████▎    | 94/179 [28:06<32:49, 23.17s/it]

PROCESSING FILE: qwen


Processing documents:  53%|█████▎    | 95/179 [28:29<32:27, 23.19s/it]

PROCESSING FILE: qwen


Processing documents:  54%|█████▎    | 96/179 [28:53<32:09, 23.24s/it]

PROCESSING FILE: qwen


Processing documents:  54%|█████▍    | 97/179 [29:16<31:43, 23.22s/it]

PROCESSING FILE: qwen


Processing documents:  55%|█████▍    | 98/179 [29:39<31:17, 23.18s/it]

PROCESSING FILE: qwen


Processing documents:  55%|█████▌    | 99/179 [30:02<30:57, 23.22s/it]

PROCESSING FILE: qwen


Processing documents:  56%|█████▌    | 100/179 [30:29<32:08, 24.41s/it]

PROCESSING FILE: qwen


Processing documents:  56%|█████▋    | 101/179 [30:52<31:14, 24.03s/it]

PROCESSING FILE: qwen


Processing documents:  57%|█████▋    | 102/179 [31:15<30:24, 23.70s/it]

PROCESSING FILE: qwen


Processing documents:  58%|█████▊    | 103/179 [31:39<30:04, 23.75s/it]

PROCESSING FILE: qwen


Processing documents:  58%|█████▊    | 104/179 [32:02<29:25, 23.55s/it]

PROCESSING FILE: qwen


Processing documents:  59%|█████▊    | 105/179 [32:26<28:56, 23.46s/it]

PROCESSING FILE: qwen


Processing documents:  59%|█████▉    | 106/179 [32:51<29:15, 24.05s/it]

PROCESSING FILE: qwen


Processing documents:  60%|█████▉    | 107/179 [33:14<28:30, 23.76s/it]

PROCESSING FILE: qwen


Processing documents:  60%|██████    | 108/179 [33:37<27:55, 23.60s/it]

PROCESSING FILE: qwen


Processing documents:  61%|██████    | 109/179 [34:01<27:42, 23.74s/it]

PROCESSING FILE: qwen


Processing documents:  61%|██████▏   | 110/179 [34:25<27:09, 23.61s/it]

PROCESSING FILE: qwen


Processing documents:  62%|██████▏   | 111/179 [34:48<26:38, 23.51s/it]

PROCESSING FILE: qwen


Processing documents:  63%|██████▎   | 112/179 [35:11<26:09, 23.43s/it]

PROCESSING FILE: qwen


Processing documents:  63%|██████▎   | 113/179 [35:35<25:44, 23.40s/it]

PROCESSING FILE: qwen


Processing documents:  64%|██████▎   | 114/179 [35:58<25:15, 23.31s/it]

PROCESSING FILE: qwen


Processing documents:  64%|██████▍   | 115/179 [36:21<24:48, 23.27s/it]

PROCESSING FILE: qwen


Processing documents:  65%|██████▍   | 116/179 [36:44<24:25, 23.26s/it]

PROCESSING FILE: qwen


Processing documents:  65%|██████▌   | 117/179 [37:07<24:01, 23.24s/it]

PROCESSING FILE: qwen


Processing documents:  66%|██████▌   | 118/179 [37:31<23:50, 23.45s/it]

PROCESSING FILE: qwen


Processing documents:  66%|██████▋   | 119/179 [37:54<23:16, 23.28s/it]

PROCESSING FILE: qwen


Processing documents:  67%|██████▋   | 120/179 [38:17<22:46, 23.16s/it]

PROCESSING FILE: qwen


Processing documents:  68%|██████▊   | 121/179 [38:40<22:13, 23.00s/it]

PROCESSING FILE: qwen


Processing documents:  68%|██████▊   | 122/179 [39:03<22:04, 23.23s/it]

PROCESSING FILE: qwen


Processing documents:  69%|██████▊   | 123/179 [39:26<21:35, 23.14s/it]

PROCESSING FILE: qwen


Processing documents:  69%|██████▉   | 124/179 [39:49<21:13, 23.15s/it]

PROCESSING FILE: qwen


Processing documents:  70%|██████▉   | 125/179 [40:13<20:51, 23.18s/it]

PROCESSING FILE: qwen


Processing documents:  70%|███████   | 126/179 [40:36<20:27, 23.16s/it]

PROCESSING FILE: vision_instruction


Processing documents:  71%|███████   | 127/179 [40:48<17:15, 19.91s/it]

PROCESSING FILE: vision_instruction


Processing documents:  72%|███████▏  | 128/179 [41:00<14:59, 17.63s/it]

PROCESSING FILE: vision_instruction


Processing documents:  72%|███████▏  | 129/179 [41:13<13:18, 15.97s/it]

PROCESSING FILE: vision_instruction


Processing documents:  73%|███████▎  | 130/179 [41:25<12:06, 14.82s/it]

PROCESSING FILE: vision_instruction


Processing documents:  73%|███████▎  | 131/179 [41:37<11:14, 14.06s/it]

PROCESSING FILE: vision_instruction


Processing documents:  74%|███████▎  | 132/179 [41:49<10:37, 13.56s/it]

PROCESSING FILE: vision_instruction


Processing documents:  74%|███████▍  | 133/179 [42:02<10:05, 13.16s/it]

PROCESSING FILE: vision_instruction


Processing documents:  75%|███████▍  | 134/179 [42:14<09:39, 12.89s/it]

PROCESSING FILE: vision_instruction


Processing documents:  75%|███████▌  | 135/179 [42:26<09:20, 12.74s/it]

PROCESSING FILE: vision_instruction


Processing documents:  76%|███████▌  | 136/179 [42:39<09:01, 12.60s/it]

PROCESSING FILE: vision_instruction


Processing documents:  77%|███████▋  | 137/179 [42:51<08:45, 12.50s/it]

PROCESSING FILE: vision_instruction


Processing documents:  77%|███████▋  | 138/179 [43:03<08:30, 12.46s/it]

PROCESSING FILE: vision_instruction


Processing documents:  78%|███████▊  | 139/179 [43:16<08:23, 12.60s/it]

PROCESSING FILE: vision_instruction


Processing documents:  78%|███████▊  | 140/179 [43:28<08:07, 12.49s/it]

PROCESSING FILE: vision_instruction


Processing documents:  79%|███████▉  | 141/179 [43:40<07:50, 12.38s/it]

PROCESSING FILE: vision_instruction


Processing documents:  79%|███████▉  | 142/179 [43:52<07:34, 12.27s/it]

PROCESSING FILE: vision_instruction


Processing documents:  80%|███████▉  | 143/179 [44:05<07:22, 12.29s/it]

PROCESSING FILE: vision_instruction


Processing documents:  80%|████████  | 144/179 [44:17<07:07, 12.22s/it]

PROCESSING FILE: vision_instruction


Processing documents:  81%|████████  | 145/179 [44:29<06:54, 12.19s/it]

PROCESSING FILE: vision_instruction


Processing documents:  82%|████████▏ | 146/179 [44:41<06:40, 12.14s/it]

PROCESSING FILE: vision_instruction


Processing documents:  82%|████████▏ | 147/179 [44:53<06:28, 12.13s/it]

PROCESSING FILE: vision_instruction


Processing documents:  83%|████████▎ | 148/179 [45:05<06:15, 12.11s/it]

PROCESSING FILE: vision_instruction


Processing documents:  83%|████████▎ | 149/179 [45:17<06:04, 12.15s/it]

PROCESSING FILE: vision_instruction


Processing documents:  84%|████████▍ | 150/179 [45:29<05:51, 12.11s/it]

PROCESSING FILE: vision_instruction


Processing documents:  84%|████████▍ | 151/179 [45:42<05:40, 12.17s/it]

PROCESSING FILE: wisper


Processing documents:  85%|████████▍ | 152/179 [45:50<04:54, 10.90s/it]

PROCESSING FILE: wisper


Processing documents:  85%|████████▌ | 153/179 [45:58<04:21, 10.05s/it]

PROCESSING FILE: wisper


Processing documents:  86%|████████▌ | 154/179 [46:06<03:57,  9.50s/it]

PROCESSING FILE: wisper


Processing documents:  87%|████████▋ | 155/179 [46:14<03:38,  9.10s/it]

PROCESSING FILE: wisper


Processing documents:  87%|████████▋ | 156/179 [46:22<03:23,  8.83s/it]

PROCESSING FILE: wisper


Processing documents:  88%|████████▊ | 157/179 [46:31<03:10,  8.65s/it]

PROCESSING FILE: wisper


Processing documents:  88%|████████▊ | 158/179 [46:39<02:58,  8.50s/it]

PROCESSING FILE: wisper


Processing documents:  89%|████████▉ | 159/179 [46:51<03:09,  9.49s/it]

PROCESSING FILE: wisper


Processing documents:  89%|████████▉ | 160/179 [46:58<02:51,  9.03s/it]

PROCESSING FILE: wisper


Processing documents:  90%|████████▉ | 161/179 [47:06<02:36,  8.72s/it]

PROCESSING FILE: wisper


Processing documents:  91%|█████████ | 162/179 [47:14<02:24,  8.50s/it]

PROCESSING FILE: wisper


Processing documents:  91%|█████████ | 163/179 [47:23<02:14,  8.40s/it]

PROCESSING FILE: wisper


Processing documents:  92%|█████████▏| 164/179 [47:31<02:05,  8.34s/it]

PROCESSING FILE: wisper


Processing documents:  92%|█████████▏| 165/179 [47:39<01:56,  8.30s/it]

PROCESSING FILE: wisper


Processing documents:  93%|█████████▎| 166/179 [47:52<02:05,  9.66s/it]

PROCESSING FILE: wisper


Processing documents:  93%|█████████▎| 167/179 [48:00<01:50,  9.17s/it]

PROCESSING FILE: wisper


Processing documents:  94%|█████████▍| 168/179 [48:08<01:37,  8.82s/it]

PROCESSING FILE: wisper


Processing documents:  94%|█████████▍| 169/179 [48:16<01:25,  8.58s/it]

PROCESSING FILE: wisper


Processing documents:  95%|█████████▍| 170/179 [48:24<01:16,  8.46s/it]

PROCESSING FILE: wisper


Processing documents:  96%|█████████▌| 171/179 [48:32<01:06,  8.37s/it]

PROCESSING FILE: wisper


Processing documents:  96%|█████████▌| 172/179 [48:40<00:57,  8.25s/it]

PROCESSING FILE: wisper


Processing documents:  97%|█████████▋| 173/179 [48:49<00:49,  8.27s/it]

PROCESSING FILE: wisper


Processing documents:  97%|█████████▋| 174/179 [48:59<00:45,  9.03s/it]

PROCESSING FILE: wisper


Processing documents:  98%|█████████▊| 175/179 [49:08<00:35,  8.79s/it]

PROCESSING FILE: wisper


Processing documents:  98%|█████████▊| 176/179 [49:16<00:25,  8.63s/it]

PROCESSING FILE: wisper


Processing documents:  99%|█████████▉| 177/179 [49:24<00:16,  8.49s/it]

PROCESSING FILE: wisper


Processing documents:  99%|█████████▉| 178/179 [49:32<00:08,  8.41s/it]

PROCESSING FILE: wisper


Processing documents: 100%|██████████| 179/179 [49:40<00:00, 16.65s/it]


Full index created successfully


In [36]:
print(index)

<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x7cf7d3e59e70>


In [50]:
qe = index.as_query_engine(similarity_top_k=3)

In [52]:
y=qe.query("what is multimodal condition in talking face generation?")

In [54]:
y

Response(response='Multimodal condition in talking face generation involves introducing additional modal information, such as text, image, and audio-emotional modalities, to guide facial pose and expression in generated videos. This approach aims to complement emotional content in textual information and enhance the vividness of the generated videos.', source_nodes=[NodeWithScore(node=TextNode(id_='e191431b-e323-4795-b7cd-c66750890fff', embedding=None, metadata={'file_name': 'Deepfake.pdf', 'doc_id': '10121698-98b4-4107-8cb3-7b0b7e8285c7', 'is_document': True}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.CHILD: '5'>: [RelatedNodeInfo(node_id='b8b8a401-7a3b-4c13-8083-c573b1b3ee65', node_type=None, metadata={}, hash=None), RelatedNodeInfo(node_id='c1927463-7e56-44bb-87f1-fa43298afd26', node_type=None, metadata={}, hash=None), RelatedNodeInfo(node_id='2ad9aefb-a183-45a3-bbf4-1f4fe8e4ecf8', node_type=None, metadata={}, hash=None), Relate

# Ragas

In [14]:
from ragas.testset.generator import TestsetGenerator
import random

# Initialize a TestsetGenerator using its default settings.
# TestsetGenerator is used for generating test datasets, typically for model evaluation or testing.
# The 'from_default' method sets up the generator with default configurations.
testsetgenerator = TestsetGenerator.from_llama_index(
    generator_llm=llm,
    critic_llm=llm,
    embeddings=embedding_model,
)

# Specify the sample size for the source documents.
# This determines how many documents will be randomly selected from the source documents.
sample_size = 6

# Define the number of questions to be included in the test set.
# This will set how many test cases or questions the test set will contain.
num_questions = 15

# Generate a test dataset from a random sample of source documents.
# 'random.sample' is used to randomly select a subset of documents from the source.
# The test set is then generated based on these documents.
# Parameters:
#   random.sample(source_docs, sample_size): A randomly selected subset of source documents.
#   test_size: The number of questions or test cases to generate in the test set.
testset = testsetgenerator.generate_with_llamaindex_docs(
    random.sample(source_docs, sample_size),  # Randomly selected documents
    test_size=num_questions,               # Number of questions in the test set 
)

embedding nodes:   0%|          | 0/14 [00:00<?, ?it/s]

Generating:   0%|          | 0/16 [00:00<?, ?it/s]

In [15]:
import re

test_df = testset.to_pandas()
# Define the regex pattern to match any character that is NOT a letter, a number, '.', ',', or '?'
pattern = r"[^a-zA-Z0-9.,? ]"

# Define a function to replace special characters in a string
def remove_special_chars(s):
    return re.sub(pattern, '', str(s))

# Apply the function to each cell in the DataFrame
test_df = test_df.applymap(remove_special_chars)


test_questions = test_df['question'].values.tolist()
test_answers = [[item] for item in test_df['ground_truth'].values.tolist()]

test_df

  test_df = test_df.applymap(remove_special_chars)


Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How does the modified code correctly calculate...,"Qwen14BChat RLHF nnnnnndefmaxDepthself, root ...",The modified code correctly calculates the max...,simple,"pagelabel 56, filename Qwen.pdf, filepath kagg...",True
1,How does the modified code correctly handle th...,"Qwen14BChat RLHF nnnnnndefmaxDepthself, root ...","In this code, when each node is extracted, we ...",simple,"pagelabel 56, filename Qwen.pdf, filepath kagg...",True
2,What are the average scores of the proprietary...,Table 13 Results on MMLU . All are tested with...,The answer to given question is not present in...,simple,"pagelabel 37, filename Qwen.pdf, filepath kagg...",True
3,How does the text normalization process impact...,Robust Speech Recognition via LargeScale Weak ...,The text normalization process impacts the per...,simple,"pagelabel 12, filename Wisper.pdf, filepath ka...",True
4,What is the approach used to train the chat mo...,"QWEN TECHNICAL REPORTnJinze Bai, Shuai Bai, Yu...",The approach used to train the chat models in ...,simple,"pagelabel 1, filename Qwen.pdf, filepath kaggl...",True
5,How can the punchline be revealed upon button ...,DOCTYPE htmlnhtmlnheadntitleMy Joke Websitetit...,The answer to given question is not present in...,simple,"pagelabel 16, filename VisionInstruction.pdf, ...",True
6,How does the use of multilingual and multitask...,Robust Speech Recognition via LargeScale Weak ...,Multilingual and multitask models benefit more...,simple,"pagelabel 12, filename Wisper.pdf, filepath ka...",True
7,What changes were made in the modified code to...,"Qwen14BChat RLHF nnnnnndefmaxDepthself, root ...","In the modified code, when each node is extrac...",simple,"pagelabel 56, filename Qwen.pdf, filepath kagg...",True
8,Whats the purpose of the queue in the code and...,"def maxDepth s e l f , r o o t TreeNode i ...","Your code is correct, the answer is correct. I...",reasoning,"pagelabel 55, filename Qwen.pdf, filepath kagg...",True
9,What are the average scores of the proprietary...,Table 13 Results on MMLU . All are tested with...,The answer to given question is not present in...,reasoning,"pagelabel 37, filename Qwen.pdf, filepath kagg...",True


In [16]:
test_df.to_csv("test_dataset.csv", index=False, encoding='utf-8')

In [38]:
import asyncio
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_similarity,
    answer_correctness
)
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from ragas.integrations.llama_index import evaluate
import pandas as pd
import time
import httpx 

# List of evaluation metrics functions to be used.
metrics = [
    faithfulness,           # Evaluates faithfulness of the response to the source material.
    answer_relevancy,       # Assesses relevance of the response to the query.
    context_precision,      # Measures precision of the context in the response.
    context_recall,         # Measures recall of the context in the response.
    answer_correctness,     # Checks correctness of the answer.
    answer_similarity,      # Evaluates similarity of the answer to a reference answer.
]

# A list to collect individual result DataFrames.
results_list = []

In [39]:
@retry(stop=stop_after_attempt(5), wait=wait_exponential(min=1, max=60), retry=retry_if_exception_type(httpx.HTTPStatusError))
def safe_evaluate(query_engine, metrics, dataset, llm, embeddings):
    return evaluate(query_engine=query_engine, metrics=metrics, dataset=dataset, llm=llm, embeddings=embeddings,raise_exceptions=False)

In [40]:
def evaluate_and_append(query_engine, technique):
    # Evaluate the query engine.
    result = safe_evaluate(query_engine=query_engine, metrics=metrics, dataset=test_df, llm=llm, embeddings=embedding_model)
    # Add a 'technique' column to the result DataFrame.
    result['technique'] = technique

    # Add the result DataFrame to the results list.
    results_list.append(result)

    # Sleep to handle rate limits.
    # time.sleep(60)

In [41]:
evaluate_and_append(baseline_query_engine, 'chunks_with_overlap')

Running Query Engine:   0%|          | 0/12 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/72 [00:00<?, ?it/s]

In [42]:
# Convert each Result object's items to a dictionary and collect them in a list
dict_list = [dict(result.items()) for result in results_list]

# Convert the list of dictionaries to a DataFrame
results_df = pd.DataFrame(dict_list)

results_df

Unnamed: 0,faithfulness,answer_relevancy,context_precision,context_recall,answer_correctness,answer_similarity,technique
0,0.949074,0.821416,0.75,0.741667,0.559331,0.625103,chunks_with_overlap
