In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pandas as pd
import logging
import sys
import config
import json
from sentence_transformers import SentenceTransformer, util
import gzip
import os
import torch

# Import OpenAI and other necessary modules
import openai

# Import classes and functions from modules
from llama_index import (
    Document,
    VectorStoreIndex,
    ListIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    SimpleKeywordTableIndex,
)
from llama_index.indices.postprocessor import (
    LLMRerank
)
from llama_index.response.notebook_utils import display_response
from llama_index.llms import OpenAI

# Configure logging: send INFO-and-above records from the root logger to stdout.
# logging.basicConfig() is a silent no-op when the root logger already has
# handlers (e.g. an earlier import configured logging), which would leave the
# level untouched, so the level and the handler are set explicitly instead.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
# Replace (rather than append to) the handler list so that re-running this cell
# never produces duplicated log lines.
root_logger.handlers = [logging.StreamHandler(stream=sys.stdout)]


In [2]:
# OpenAI API key authentication.
# The key is read from config.py (config.openai_key); when that value is empty
# the OPENAI_API_KEY environment variable is used instead.
openai.api_key = config.openai_key or os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    # Fail here with a clear message rather than at the first API call.
    raise RuntimeError(
        "No OpenAI API key found: set config.openai_key or the "
        "OPENAI_API_KEY environment variable."
    )

In [3]:
# Initialize OpenAI's LLM (Large Language Model)
llm = OpenAI(model="gpt-3.5-turbo")
# Bundle the LLM and a chunk size of 1024 into a llama_index ServiceContext.
service_context = ServiceContext.from_defaults(chunk_size=1024, llm=llm)

In [4]:
# Read and process query, document, and relevance data
df_queries = pd.read_csv('antique_query_test.csv')
df_queries = df_queries[['query_id','text']]

df_docs = pd.read_csv('antique_sample_404k.csv')
df_docs = df_docs[['doc_id','text']]

df_qrel = pd.read_csv('antique_qurel_test.csv')
df_qrel = df_qrel[['query_id','doc_id','relevance']]

# Attach the document text to every relevance judgement. This is a left join,
# so a judged doc_id that is absent from df_docs is kept with a missing text.
merged_df = df_qrel.merge(df_docs, on='doc_id', how='left')

# Extract text data from merged DataFrame
df_text = merged_df[['doc_id','text']]

# Build the passage list in row order. Iterating the column directly avoids the
# per-row Series construction of DataFrame.iterrows(). str() is applied to every
# value, so a missing text becomes the literal string 'nan'.
passages = [str(text) for text in df_text['text']]

In [5]:
# Load pre-computed document embeddings
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# load an embeddings file that you created yourself or otherwise trust.
import pickle
with open('corpus_embeddings_text.pickle', 'rb') as pkl:
    # NOTE(review): search() indexes `passages` with the corpus_id returned for
    # these embeddings, so presumably entry i here corresponds to passages[i]
    # -- confirm the pickle was built from the same passage list.
    doc_embedding = pickle.load(pkl)

In [6]:
# Initialize SentenceTransformer for embedding
# This bi-encoder is the model search() uses to embed incoming queries.
bi_encoder = SentenceTransformer('intfloat/e5-base-v2')
bi_encoder.max_seq_length = 256     # Truncate inputs longer than 256 tokens
# top_k = 32                          #Number of passages we want to retrieve with the bi-encoder

Load pretrained SentenceTransformer: intfloat/e5-base-v2
Created a temporary directory at /var/folders/75/0dtb1gc52pdfr40bnpp97qj00000gn/T/tmpyns87k4m
Writing /var/folders/75/0dtb1gc52pdfr40bnpp97qj00000gn/T/tmpyns87k4m/_remote_module_non_scriptable.py
Use pytorch device: cpu


In [7]:
# Function to perform semantic search

def search(input_query, top_k=10):
    """Return the passages that are most similar to a query.

    The query is embedded with the module-level ``bi_encoder`` and compared
    against the pre-computed ``doc_embedding``; each hit's ``corpus_id`` is
    used as an index into the module-level ``passages`` list.

    Args:
        input_query: Query text to embed.
        top_k: Maximum number of passages to return. Defaults to 10, the value
            that used to be hard-coded.

    Returns:
        A list of passage strings, with newlines replaced by spaces, in the
        order returned by the semantic search (highest score first).
    """
    # NOTE(review): the e5 model family is documented as expecting a "query: "
    # prefix on queries and "passage: " on documents. No prefix is added here;
    # confirm how doc_embedding was built before changing this.
    question_embedding = bi_encoder.encode(input_query, convert_to_tensor=True)

    # semantic_search returns one hit list per query, each already sorted by
    # decreasing score, so only the first list is needed and no re-sort is done.
    hits = util.semantic_search(question_embedding, doc_embedding, top_k=top_k)[0]

    return [passages[hit['corpus_id']].replace("\n", " ") for hit in hits]

In [8]:
# Function to perform LLM Reranking

def rerank__query_engine(question, retrieved_docs, top_n=10, similarity_top_k=10):
    """Answer a question over retrieved passages using LLM reranking.

    The passages are wrapped in Document objects, indexed in a temporary
    VectorStoreIndex, and queried through a query engine whose retrieved nodes
    are post-processed by an LLMRerank step. The response is rendered in the
    notebook with display_response and also returned.

    Args:
        question: The question to answer.
        retrieved_docs: Iterable of passage strings to index.
        top_n: Number of nodes the LLM reranker keeps (default 10).
        similarity_top_k: Number of nodes the index retrieves before
            reranking (default 10).

    Returns:
        The response object produced by the query engine.
    """
    # Create Document objects for the retrieved documents
    documents = [Document(text=doc_text) for doc_text in retrieved_docs]

    # Pass the notebook's service_context explicitly so the index uses the LLM
    # and chunk size configured for this notebook instead of library defaults.
    retrieved_docs_index = VectorStoreIndex.from_documents(
        documents, service_context=service_context
    )

    # The reranker gets the same service_context so it uses the same LLM.
    reranker = LLMRerank(top_n=top_n, service_context=service_context)

    # Create a query engine with reranking
    query_engine = retrieved_docs_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[reranker],
    )

    # Query the engine
    response = query_engine.query(question)

    display_response(response, show_source=True, source_length=500)

    # Return the response so callers can inspect it programmatically.
    return response

In [9]:
def ask_question(question):
    """Run the two-stage pipeline for one question.

    First the candidate passages are fetched with search(); they are then
    handed, together with the question, to rerank__query_engine(). Nothing is
    returned.
    """
    rerank__query_engine(question, search(question))

In [10]:
# Run the retrieve-then-rerank pipeline on a sample question.
ask_question(question = "How to find out if a watermelon is ripe?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** To find out if a watermelon is ripe, you can check for a few signs. Look for a yellow spot on the underside of the watermelon, which indicates where the watermelon has been resting on the ground. The spot should be creamy yellow, not white. You can also thump the watermelon and listen for a deep hollow sound, which indicates that the watermelon is ripe. Finally, you can lift the watermelon and check for its weight. A ripe watermelon should feel heavy for its size.

---

**`Source Node 1/2`**

**Node ID:** 4f5341c9-6fbe-40fb-b5d7-ede8f948bed2<br>**Similarity:** 3.0<br>**Text:** You can bake them, just like a regular baked potato. Peirce the skin with a fork to let steam escape...They take a little longer than a reg pot, I think cause theyre denser?. You can eat them with brown sugar, cinemon & butter....<br>

---

**`Source Node 2/2`**

**Node ID:** 9393a5cc-b916-430b-b27d-b43acb716260<br>**Similarity:** 2.0<br>**Text:** leave the skins on and cut into cubes. then drizzle with olive oil or butter, dried herbs & salt. bake until skin flesh browns. Also makes good garlic mashed potatoes....<br>