In [18]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import anthropic

### Preprocess the "documents" in openbook.txt:

In [3]:
with open("data\Main\openbook.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()
documents = [line.strip() for line in lines]

### Creating Embedding model and using it on OpenBookQA "documents":
- This works as the context retrieval

In [5]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

doc_embeddings = embedder.encode(documents, convert_to_numpy=True)

dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(doc_embeddings))

In [15]:
# Query embedding
query = "The sun is responsible for"
query_embedding = embedder.encode([query])

# Search FAISS index
D, I = index.search(np.array(query_embedding), k=1)
print("Best match:", documents[I[0][0]])

Best match: "the sun is a source of heat called sunlight"


In [38]:
# Function to retrieve similar documents
def retrieve_context(query, k=1):
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)  # Retrieve top-k similar docs
    return [documents[i] for i in indices[0]]

In [41]:
retrieve_context(query, k=6)

['"the sun is a source of heat called sunlight"',
 '"sunlight produces heat"',
 '"the sun is a source of light called sunlight"',
 '"the sun is the source of solar energy called sunlight"',
 '"sunlight contains ultraviolet light"',
 '"the sun transfers solar energy from itself to the Earth through sunlight"']

### Using Claude API to expand on the "queries":

In [46]:
# Function to generate response using Claude
def generate_response(query):
    retrieved_docs = retrieve_context(query)
    context = "\n".join(retrieved_docs)

    response = client.beta.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=200,
        messages=[{"role": "user", "content": f"Context: {context}\n\nQuestion: {query}"}]
    )
    
    return response  # Return Claude’s response

In [None]:
generate_response(query)

### Testing on training set:

In [51]:
import json

def read_jsonl(file_path):
    """
    Reads a .jsonl file and returns a list of JSON objects.

    Args:
        file_path (str): Path to the .jsonl file.

    Returns:
        list: A list of dictionaries representing the JSON objects.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return data

# Example usage
file_path = "data/Main/train.jsonl"  # Replace with your actual file path
data = read_jsonl(file_path)

# Print the first entry to check
print(data[0])

{'id': '7-980', 'question': {'stem': 'The sun is responsible for', 'choices': [{'text': 'puppies learning new tricks', 'label': 'A'}, {'text': 'children growing up and getting old', 'label': 'B'}, {'text': 'flowers wilting in a vase', 'label': 'C'}, {'text': 'plants sprouting, blooming and wilting', 'label': 'D'}]}, 'answerKey': 'D'}


In [52]:
questions    = [question['question']['stem'] for question in data]
possible_ans = [question['question']['choices'] for question in data]

In [48]:
question = data["question"]["stem"]
choices = data["question"]["choices"]
formatted_choices = "\n".join([f"({choice['label']}) {choice['text']}" for choice in choices])

prompt = f"""
Question: {question}
{formatted_choices}

Choose the best answer from the options (A, B, C, or D) and explain your reasoning.
""".strip()

prompt

['"A bee is a pollinating animal"',
 '"A bird is a pollinating animal"',
 '"An electrical conductor is a vehicle for the flow of electricity"',
 '"An example of a change in the Earth is an ocean becoming a wooded area"',
 '"An example of a chemical change is acid breaking down substances"']