In [1]:
! pip install -q pandas openai

In [2]:
! python --version

Python 3.10.12


In [3]:
! pip install openai --upgrade



In [4]:
import os

import openai
import pandas as pd
from openai import OpenAI
# Read the API key from the environment so the secret never lives in the notebook.
# Raises KeyError if OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])



In [5]:
! pip install -q sentence-transformers
! pip install -q pinecone-io
! pip install -q pandas
! pip install -q  transformers

In [6]:
! pip install -q faiss-cpu

In [7]:
! pip install -q pinecone-client

In [8]:
! pip install tabulate



In [9]:
! pip install datasets



In [16]:
import pandas as pd
from transformers import RagTokenizer, RagTokenForGeneration, RagRetriever
import openai
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from pinecone import Pinecone
from tabulate import tabulate

# Load the RAG tokenizer, retriever and generator; all three are taken from
# the same pretrained checkpoint.
RAG_CHECKPOINT = "facebook/rag-token-nq"
tokenizer = RagTokenizer.from_pretrained(RAG_CHECKPOINT)
retriever = RagRetriever.from_pretrained(
    RAG_CHECKPOINT,
    index_name="exact",
    use_dummy_dataset=True,
)
model = RagTokenForGeneration.from_pretrained(RAG_CHECKPOINT, retriever=retriever)

# Initialize Pinecone. The API key is read from the environment so the secret
# is never stored in the notebook; raises KeyError if PINECONE_API_KEY is unset.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("set-a")
# Length of the all-zero query vector used to fetch a whole namespace.
# NOTE(review): presumably must equal the dimension of the "set-a" index — confirm.
pinecone_dim = 384
# top_k used for the fetch-all query; a namespace holding more vectors than
# this will only be partially returned.
page_size = 10000

# Function to fetch all vectors and their metadata from Pinecone namespace
def get_all_vectors_from_namespace(index, namespace, dim=None, top_k=None):
    """Fetch vectors and metadata from one namespace via a single query.

    The namespace is read by querying with an all-zero vector and asking for
    up to ``top_k`` matches, with values and metadata included.

    Parameters:
    index: object exposing ``query(namespace=, vector=, top_k=,
        include_values=, include_metadata=)`` and returning a mapping with a
        ``'matches'`` list.
    namespace (str): namespace to read.
    dim (int, optional): length of the zero query vector. Defaults to the
        module-level ``pinecone_dim``.
    top_k (int, optional): maximum number of matches requested. Defaults to
        the module-level ``page_size``.

    Returns:
    list of dict: one ``{'id', 'vector', 'metadata'}`` record per match.

    Warns:
    UserWarning: when the query returns ``top_k`` or more matches, because the
        namespace may then hold vectors that were not returned.
    """
    import warnings

    dim = pinecone_dim if dim is None else dim
    top_k = page_size if top_k is None else top_k

    res = index.query(
        namespace=namespace,
        vector=[0] * dim,
        top_k=top_k,
        include_values=True,
        include_metadata=True
    )
    matches = res['matches']

    # A full page means the result was possibly cut off; say so instead of
    # silently returning a partial namespace.
    if len(matches) >= top_k:
        warnings.warn(
            f"Namespace {namespace!r} returned {len(matches)} matches "
            f"(top_k={top_k}); results may be truncated."
        )

    return [
        {
            'id': match['id'],
            'vector': match['values'],
            'metadata': match['metadata'],
        }
        for match in matches
    ]

# Pull the stored question and answer vectors, then map each vector id to its text.
vector_data = get_all_vectors_from_namespace(index, 'Questions')
id_to_question = dict(
    (entry["id"], entry["metadata"]["Question"]) for entry in vector_data
)
answer_vector_data = get_all_vectors_from_namespace(index, 'Answers')
# Assumes every answer record carries an "Answer" metadata field.
id_to_answer = dict(
    (entry["id"], entry["metadata"]["Answer"]) for entry in answer_vector_data
)

# Questions to be answered, read from the SETB CSV.
df = pd.read_csv("dataset/SETB.csv")

# Sentence-embedding model used to encode the questions in `df`.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to encode questions
def encode_questions(texts):
    """Encode `texts` with the module-level `sentence_model` and return the result."""
    embeddings = sentence_model.encode(texts)
    return embeddings

# Encode every question in one batched call instead of invoking the model once
# per row; list() splits the result so each cell holds that row's embedding.
df['question_embeddings'] = list(encode_questions(df['Questions'].tolist()))

def get_answer_from_chatgpt(question, similar_questions, model="gpt-3.5-turbo", openai_client=None):
    """
    Get an answer from OpenAI's ChatGPT using the provided question and context from similar questions.

    Parameters:
    question (str): The main question to answer.
    similar_questions (list of str): Context strings. Each one is sent as its
        own user message, in order, before the main question. Any number of
        entries is accepted, including none.
    model (str): Chat model name passed to the completion call.
    openai_client: Object exposing ``chat.completions.create``. Defaults to
        the module-level ``client``.

    Returns:
    str: The content of the first choice in the response.
    """
    api = client if openai_client is None else openai_client

    # Build the conversation from however many context entries were supplied,
    # rather than indexing exactly three (which raised IndexError on fewer and
    # ignored any extras).
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    messages.extend({"role": "user", "content": context} for context in similar_questions)
    messages.append({"role": "user", "content": question})

    # Creating the chat completion
    response = api.chat.completions.create(
        model=model,
        messages=messages
    )

    # Returning the content of the response
    return response.choices[0].message.content




The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [17]:
# Compare embeddings and retrieve most similar questions
N_SIMILAR = 3  # number of nearest stored questions passed to ChatGPT as context

# The stored question vectors do not change between rows, so build the
# reference matrix once instead of rebuilding the list on every iteration.
reference_vectors = np.array([v['vector'] for v in vector_data])

results = []
for _, row in df.iterrows():
    similarities = cosine_similarity([row['question_embeddings']], reference_vectors)[0]
    top_indices = np.argsort(similarities)[::-1][:N_SIMILAR]  # most similar first

    similar_questions = []
    similarity_scores = []
    try:
        # Only these lookups can raise KeyError (e.g. a question id with no
        # stored answer), so the try block is limited to them.
        for i in top_indices:
            vector_id = vector_data[i]['id']
            pinecone_question = id_to_question[vector_id]
            pinecone_answer = id_to_answer[vector_id]
            similar_questions.append(
                'SET A Question:' + pinecone_question +
                'SET A Answer:' + pinecone_answer
            )
            similarity_scores.append(str(similarities[i]))
    except KeyError as e:
        # Skip this row, as before, when a neighbour's text is missing.
        print(f"Error accessing vector data: {e}")
        continue

    chatgpt_answer = get_answer_from_chatgpt(row['Questions'], similar_questions)
    results.append({
        'Question': '\n'.join(similar_questions),
        'Similarity Scores': '\n'.join(similarity_scores),
        'SET B Question': row['Questions'],
        'Chat GPT Answer': chatgpt_answer,
    })

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
print(results_df.head(3))  # Displaying top 3 results
print(tabulate(results_df, headers='keys', tablefmt='psql'))

                                            Question  \
0  SET A Question:Which of the following steps is...   
1  SET A Question:Which of the following is the c...   
2  SET A Question:Which of the following best des...   

                                   Similarity Scores  \
0  0.8298492034634594\n0.8052389114137042\n0.7437...   
1  0.9882112979153546\n0.7748807928165015\n0.6881...   
2  0.8357440524514059\n0.7245225490107088\n0.6307...   

                                      SET B Question  \
0  Which step in a data analysis project involves...   
1  Which of the following represents the correct ...   
2  Which of the following best describes the obje...   

                                     Chat GPT Answer  
0                       B) Exploratory data analysis  
1  Correct answer: D) Data collection, data explo...  
2  Correct Answer: B) To clean and transform raw ...  
+----+---------------------------------------------------------------------------------------------------