In [1]:
import pandas as pd
import numpy as np
import h5py
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import os

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_colwidth', None)  # Show full content in each cell
pd.set_option('display.width', 1000)  # Set max width

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

API_KEY = os.environ.get('OPENAI_API_KEY')

In [None]:
def preprocess_text(text):
    # Define interrogative words to KEEP
    interrogatives = {"what", "why", "how", "who", "where", "when", "which", "whom", "whose", "no", "not",
                    "very" ,"too" ,"too" ,"just", "if", "but", "however", "without", "like"}
    custom_stopwords = set(nlp.Defaults.stop_words)
    custom_stopwords -= interrogatives

    doc = nlp(text.lower().strip())  # Lowercase and remove whitespace
    
# Process tokens: lemmatize, filter stopwords/punct/numbers, keep interrogatives
    tokens = [
        token.lemma_ 
        for token in doc 
        if (
            (not token.is_stop or token.text in interrogatives) and  # Keep interrogatives
            not token.is_punct and token.is_alpha                                  # Remove punctuation
            # (token.is_alpha or token.like_num)                       # Keep words/numbers
        )
    ]

    return ' '.join(tokens)

In [None]:
label_mapper = {
    'BT1' : 'knowledge',
    'BT2' : 'comprehension',
    'BT3' : 'application',
    'BT4' : 'analysis',
    'BT5' : 'synthesis',
    'BT6' : 'evaluation'
}

# Load dataset
df = pd.DataFrame()
for i in range(1,5):
    q_df = pd.read_csv(os.getcwd().replace('notebook' , 'dataset') + '/dataset' + str(i) + '.csv')
    df = pd.concat([df , q_df])

# Apply preprocessing
mask = df['label'].isin(label_mapper.keys())
df['label'] = df['label'].mask(mask, df['label'].map(label_mapper))

df['label'] = df['label'].str.lower()

df['processed_question'] = df['question'].apply(preprocess_text)

df['processed_question'] = [''.join(text) for text in df['processed_question']]

In [None]:
import openai
from openai import OpenAI
from typing import List
from pymilvus import MilvusClient

# Set up OpenAI GPT-3.5
openai_client = OpenAI()
# Connect to Milvus
client = MilvusClient("milvus_demo.db")
# Set your API key
client = OpenAI(api_key=API_KEY)  # replace with your actual key or use env vars

def generate_synthetic_queries(prompt: str, label: str, num: int = 5) -> List[str]:
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": (
                    f"Generate {num} alternative or related questions for the query: "
                    f"'{prompt}' that satisfy Bloom's taxonomy level: {label}."
                )
            }
        ]
    )
    
    text = completion.choices[0].message.content
    return [line.strip("- ").strip() for line in text.strip().split("\n") if line.strip()]

# Example use
query = "Why is the sky blue?"
label = "Understanding"

synthetic_queries = generate_synthetic_queries(query, label, 10)
print(synthetic_queries)


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 1) Load Flan-T5 (no gated access needed)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model     = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

def generate_queries(prompt: str, label: str, num: int = 5) -> list[str]:
    input_text = f"[INST] Generate {num} related questions for: {prompt} that satisfy Bloom's taxonomy level: {label}[/INST]"
    inputs  = tokenizer(input_text, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, top_p=0.9, temperature=0.8)
    text    = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return [q.strip("- ").strip() for q in text.split("\n") if q.strip()]

# Example
qs = generate_queries("Why is the sky blue?", "Understanding", num=10)
print(qs)


['Which is the most popular song of the year?']


## OPENAI

In [3]:

from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.chains.hyde.base import HypotheticalDocumentEmbedder
from langchain_community.vectorstores.faiss import FAISS
from sentence_transformers import SentenceTransformer
from langchain_openai import ChatOpenAI

In [6]:
hyde_prompt = PromptTemplate.from_template(
    "Generate a hypothetical answer for the question at Bloom's Understanding level:\n\n"
    "Question: {question}\n\nAnswer:"
)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", api_key = API_KEY)  # or another free/open model
llm_chain = LLMChain(llm=llm, prompt=hyde_prompt)


In [7]:
llm_chain

LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template="Generate a hypothetical answer for the question at Bloom's Understanding level:\n\nQuestion: {question}\n\nAnswer:"), llm=ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x321cfb220>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x17a773b50>, root_client=<openai.OpenAI object at 0x323c095d0>, root_async_client=<openai.AsyncOpenAI object at 0x321cfb250>, model_kwargs={}, openai_api_key=SecretStr('**********')), output_parser=StrOutputParser(), llm_kwargs={})

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 1) Load Flan-T5 (no gated access needed)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model     = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

def gen_blooms_queries(prompt: str, k: int = 10):
    # 2) Plain-English instruction
    instruction = (
        f"Generate {k} questions at Bloom's 'Understanding' level for: {prompt}"
    )
    inputs  = tokenizer(instruction, return_tensors="pt")
    # 3) Beam search for diverse, coherent output
    outs    = model.generate(
        **inputs,
        max_new_tokens=150,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=2
    )
    text    = tokenizer.decode(outs[0], skip_special_tokens=True)
    # 4) Split into individual questions
    return [q.strip() for q in text.split("?") if q.strip()][:k]

# Example
queries = gen_blooms_queries("Why is the sky blue?", k=10)
print(queries)


['Why is the sky blue']
