# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import h5py
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import re

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_colwidth', None)  # Show full content in each cell
pd.set_option('display.width', 1000)  # Set max width

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

API_KEY = os.environ.get('OPENAI_API_KEY')

  from .autonotebook import tqdm as notebook_tqdm


# Modules

In [None]:
def preprocess_text(text):
    # Define interrogative words to KEEP
    interrogatives = {"what", "why", "how", "who", "where", "when", "which", "whom", "whose", "no", "not",
                    "very" ,"too" ,"too" ,"just", "if", "but", "however", "without", "like"}
    custom_stopwords = set(nlp.Defaults.stop_words)
    custom_stopwords -= interrogatives

    doc = nlp(text.lower().strip())  # Lowercase and remove whitespace
    
# Process tokens: lemmatize, filter stopwords/punct/numbers, keep interrogatives
    tokens = [
        token.lemma_ 
        for token in doc 
        if (
            (not token.is_stop or token.text in interrogatives) and  # Keep interrogatives
            not token.is_punct and token.is_alpha                                  # Remove punctuation
            # (token.is_alpha or token.like_num)                       # Keep words/numbers
        )
    ]

    return ' '.join(tokens)

In [None]:
label_mapper = {
    'BT1' : 'knowledge',
    'BT2' : 'comprehension',
    'BT3' : 'application',
    'BT4' : 'analysis',
    'BT5' : 'synthesis',
    'BT6' : 'evaluation'
}

# Load dataset
df = pd.DataFrame()
for i in range(1,5):
    q_df = pd.read_csv(os.getcwd().replace('notebook' , 'dataset') + '/dataset' + str(i) + '.csv')
    df = pd.concat([df , q_df])

# Apply preprocessing
mask = df['label'].isin(label_mapper.keys())
df['label'] = df['label'].mask(mask, df['label'].map(label_mapper))

df['label'] = df['label'].str.lower()

df['processed_question'] = df['question'].apply(preprocess_text)

df['processed_question'] = [''.join(text) for text in df['processed_question']]

# Hypothetical Question Generation

In [None]:
# Configure for MPS acceleration (Apple Silicon)
device = "mps" if torch.backends.mps.is_available() else "cpu"
dtype = torch.float32  # MPS currently better with full precision
print(f"Using device: {device}")

Using device: mps


### GOOGLE MODEL

In [19]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = pipeline(
    task= "text2text-generation",
    model= model_name,
    tokenizer=tokenizer,
    device = "mps"
    )

prompt = prompt = """Generate 10 short questions similar to 'why sky is blue?' satisfying blooms level 'understanding'. Follow this exact format:

1. Why does the sky appear blue during daytime?
2. How does light scattering affect sunset colors?
3. generate diffenrent keep same context as 1 and 2
4. generate diffenrent keep same context as 1 - 3
5. generate diffenrent keep same context as 1 - 4
6. generate diffenrent keep same context as 1 - 5
7. generate diffenrent keep same context as 1 - 6
8. generate diffenrent keep same context as 1 - 7
9. generate diffenrent keep same context as 1 - 8
10. generate diffenrent keep same context as 1 - 9

 there should be 10 different question"""

results = generator(
    prompt,
    max_length=500,
    num_return_sequences=1,
    temperature=0.7)

if results:
    generated = results[0]["generated_text"].strip()
    questions = []
    for line in generated.split("\n"):
        if line.strip() and any(c.isdigit() for c in line[:3]):
            # Extract question text after number
            q = line.split(". ", 1)[-1].strip()
            if q and q[-1] == "?":  # Simple validation
                questions.append(q)
    
    print(f"Generated ({len(questions)} questions):")
    for i, q in enumerate(questions[:10], 1):
        print(f"{i}. {q}", end= "")
else:
    print("Generation failed")

Device set to use mps


Generated (1 questions):
1. Why does the sky appear blue during daytime? 2. How does light scattering affect sunset colors? 3. What is the best way to describe the sky? 4. What is the best way to describe the sky? 5. What is the best way to describe the sky? 6. What is the best way to describe the sky? 7. What is the best way to describe the sky? 8. What is the best way to describe the sky? 9. What is the best way to describe the sky? 10. What is the best way to describe the sky?

In [15]:
questions = re.findall(r"\d+[\.\)]\s*(.*?\?)", generated)
print(questions)

['Why does the sky appear blue during daytime?', 'How does light scattering affect sunset colors?', 'Why does the sky appear blue during twilight?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dawn?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'how does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk?', 'How does the sky appear blue during dusk

### GPT MODEL

In [17]:
model_name = "openai-community/roberta-large-openai-detector"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = pipeline(
    task= "text-generation",
    model= model_name,
    tokenizer=tokenizer,
    device = "mps"
    )

prompt = prompt = """Generate 10 short questions similar to 'why sky is blue?' satisfying blooms level 'understanding'. Follow this exact format:

1. Why does the sky appear blue during daytime?
2. How does light scattering affect sunset colors?
3. generate diffenrent keep same context as 1 and 2
4. generate diffenrent keep same context as 1 - 3
5. generate diffenrent keep same context as 1 - 4
6. generate diffenrent keep same context as 1 - 5
7. generate diffenrent keep same context as 1 - 6
8. generate diffenrent keep same context as 1 - 7
9. generate diffenrent keep same context as 1 - 8
10. generate diffenrent keep same context as 1 - 9

 there should be 10 different question"""

results = generator(
    prompt,
    max_length=500,
    num_return_sequences=1,
    temperature=0.7)

if results:
    generated = results[0]["generated_text"].strip()
    questions = []
    for line in generated.split("\n"):
        if line.strip() and any(c.isdigit() for c in line[:3]):
            # Extract question text after number
            q = line.split(". ", 1)[-1].strip()
            if q and q[-1] == "?":  # Simple validation
                questions.append(q)
    
    print(f"Generated ({len(questions)} questions):")
    for i, q in enumerate(questions[:10], 1):
        print(f"{i}. {q}", end= "")
else:
    print("Generation failed")

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at openai-community/roberta-large-openai-detector and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated (2 questions):
1. Why does the sky appear blue during daytime?2. How does light scattering affect sunset colors?

In [18]:
questions = re.findall(r"\d+[\.\)]\s*(.*?\?)", generated)
print(questions)

['Why does the sky appear blue during daytime?', 'How does light scattering affect sunset colors?']


### DEEPSEEK MODEL

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "deepseek-ai/deepseek-r1-3.7b"
device = 'mps'
prompt = prompt = """Generate 10 short questions similar to 'why sky is blue?' satisfying blooms level 'understanding'. Follow this exact format:

1. Why does the sky appear blue during daytime?
2. How does light scattering affect sunset colors?
...
10.

Questions:"""


# Special loading for DeepSeek architecture
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
generator = pipeline(
    task="text-generation",
    model=model_name,
    tokenizer=tokenizer,
    device=device,
    torch_dtype=torch.float16,  # MPS-compatible dtype
    trust_remote_code=True,
    model_kwargs={
        "load_in_8bit": True,  # 8-bit quantization for memory efficiency
        "max_memory": {0: "10GB", "cpu": "30GB"}  # Adjust for your system
    }
)



# Generation parameters optimized for MPS
results = generator(
    prompt,
    max_length=600,  # Increased for better formatting
    num_return_sequences=1,
    temperature=0.65,
    top_k=40,
    repetition_penalty=1.2,
    do_sample=True,
    truncation=True
)

if results:
    generated = results[0]["generated_text"].strip()
    questions = []
    for line in generated.split("\n"):
        if line.strip() and any(c.isdigit() for c in line[:3]):
            # Extract question using multiple delimiters
            parts = line.split(". ", 1) if ". " in line else line.split(") ", 1)
            if len(parts) > 1:
                q = parts[1].strip()
                if q and q[-1] == "?":
                    questions.append(q[0].upper() + q[1:])  # Capitalize first letter
    
    print(f"Generated ({len(questions)} questions):")
    for i, q in enumerate(questions[:10], 1):
        print(f"{i}. {q}")
else:
    print("Generation failed")


OSError: deepseek-ai/deepseek-r1-3.7b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`