# Import Libraries

In [14]:
import pandas as pd
import numpy as np
import h5py
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import os
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import re

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_colwidth', None)  # Show full content in each cell
pd.set_option('display.width', 1000)  # Set max width

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

API_KEY = os.environ.get('OPENAI_API_KEY')

# Modules

In [None]:
def preprocess_text(text):
    # Define interrogative words to KEEP
    interrogatives = {"what", "why", "how", "who", "where", "when", "which", "whom", "whose", "no", "not",
                    "very" ,"too" ,"too" ,"just", "if", "but", "however", "without", "like"}
    custom_stopwords = set(nlp.Defaults.stop_words)
    custom_stopwords -= interrogatives

    doc = nlp(text.lower().strip())  # Lowercase and remove whitespace
    
# Process tokens: lemmatize, filter stopwords/punct/numbers, keep interrogatives
    tokens = [
        token.lemma_ 
        for token in doc 
        if (
            (not token.is_stop or token.text in interrogatives) and  # Keep interrogatives
            not token.is_punct and token.is_alpha                                  # Remove punctuation
            # (token.is_alpha or token.like_num)                       # Keep words/numbers
        )
    ]

    return ' '.join(tokens)

In [None]:
label_mapper = {
    'BT1' : 'knowledge',
    'BT2' : 'comprehension',
    'BT3' : 'application',
    'BT4' : 'analysis',
    'BT5' : 'synthesis',
    'BT6' : 'evaluation'
}

# Load dataset
df = pd.DataFrame()
for i in range(1,5):
    q_df = pd.read_csv(os.getcwd().replace('notebook' , 'dataset') + '/dataset' + str(i) + '.csv')
    df = pd.concat([df , q_df])

# Apply preprocessing
mask = df['label'].isin(label_mapper.keys())
df['label'] = df['label'].mask(mask, df['label'].map(label_mapper))

df['label'] = df['label'].str.lower()

df['processed_question'] = df['question'].apply(preprocess_text)

df['processed_question'] = [''.join(text) for text in df['processed_question']]

# Hypothetical Question Generation

In [None]:
# Configure for MPS acceleration (Apple Silicon)
device = "mps" if torch.backends.mps.is_available() else "cpu"
dtype = torch.float32  # MPS currently better with full precision
print(f"Using device: {device}")

Using device: mps


In [11]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = pipeline(
    task= "text2text-generation",
    model= model_name,
    tokenizer=tokenizer,
    device = "mps"
    )

prompt = prompt = """Generate 10 short questions similar to 'why sky is blue?' satisfying blooms level 'understanding'. Follow this exact format:

1. Why does the sky appear blue during daytime?
2. How does light scattering affect sunset colors?
...
10.

Questions:"""

results = generator(
    prompt,
    max_length=500,
    num_return_sequences=1,
    temperature=0.7)

if results:
    generated = results[0]["generated_text"].strip()
    questions = []
    for line in generated.split("\n"):
        if line.strip() and any(c.isdigit() for c in line[:3]):
            # Extract question text after number
            q = line.split(". ", 1)[-1].strip()
            if q and q[-1] == "?":  # Simple validation
                questions.append(q)
    
    print(f"Generated ({len(questions)} questions):")
    for i, q in enumerate(questions[:10], 1):
        print(f"{i}. {q}", end= "")
else:
    print("Generation failed")

Device set to use mps


Generated (1 questions):
1. Why does the sky appear blue during daytime? 2. What is the effect of light scattering on sunset colors? 3. What is the effect of light scattering on sunset colors?

In [None]:
questions = re.findall(r"\d+[\.\)]\s*(.*?\?)", generated)
print(questions)