In [12]:
# Copy the Hugging Face cache directory from /root/.cache into the workspace.
# NOTE(review): the model-loading cell reads from
# /workspace/AAIPL/hf_models/huggingface, which is where `cp -r` places the copy
# only if /workspace/AAIPL/hf_models already exists - confirm. This cell's
# execution count (12) is higher than the loading cell's (1), so it was
# presumably run out of order; verify the notebook works on a fresh kernel.
!cp -r /root/.cache/huggingface /workspace/AAIPL/hf_models

In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Root of the copied Hugging Face cache that holds the downloaded models.
MODEL_ROOT = "/workspace/AAIPL/hf_models/huggingface"

MODEL_FOLDER = "models--Qwen--Qwen2.5-14B-Instruct"
SNAPSHOTS = os.path.join(MODEL_ROOT, MODEL_FOLDER, "snapshots")

# Resolve the snapshot folder. os.listdir() returns entries in arbitrary order,
# so sort for a deterministic choice and ignore stray non-directory entries.
snapshot_dirs = sorted(
    name
    for name in os.listdir(SNAPSHOTS)
    if os.path.isdir(os.path.join(SNAPSHOTS, name))
)
if not snapshot_dirs:
    # Fail with a clear message instead of an IndexError on an empty folder.
    raise FileNotFoundError(f"No snapshot directories found under {SNAPSHOTS}")

snapshot_hash = snapshot_dirs[0]
LOCAL_MODEL_PATH = os.path.join(SNAPSHOTS, snapshot_hash)

print("Loading model from:", LOCAL_MODEL_PATH)

# local_files_only=True: load strictly from disk, never from the network.
tokenizer = AutoTokenizer.from_pretrained(
    LOCAL_MODEL_PATH,
    local_files_only=True
)

# `dtype` replaces `torch_dtype`, which the installed transformers version
# reports as deprecated.
model = AutoModelForCausalLM.from_pretrained(
    LOCAL_MODEL_PATH,
    dtype=torch.float16,
    local_files_only=True
).to("cuda")

# Switch to inference mode (disables dropout and similar training behavior).
model.eval()

print("Model loaded successfully on GPU.")


`torch_dtype` is deprecated! Use `dtype` instead!


Loading model from: /workspace/AAIPL/hf_models/huggingface/models--Qwen--Qwen2.5-14B-Instruct/snapshots/cf98f3b3bbb457ad9e2bb7baf9a0125b6b88caa8


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Model loaded successfully on GPU.


In [20]:
def build_prompt(topic: str) -> str:
    """Build the instruction prompt asking the model for one MCQ as JSON.

    Args:
        topic: Topic name. Four topic strings get extra, topic-specific
            requirements ("Syllogisms",
            "Seating Arrangements (Circular and Linear)",
            "Blood Relations and Family Tree", "Mixed Series (Alphanumeric)").
            Any other value produces a prompt with an empty requirements
            section.

    Returns:
        The full prompt text, including the JSON structure the model must
        return. For "Syllogisms" a randomly chosen category domain is embedded,
        so repeated calls may differ.
    """
    import random

    SYLLOGISM_DOMAINS = [
        "Professions",
        "Scientific fields",
        "Companies and organizations",
        "Geographic locations",
        "Vehicles and machines",
        "Academic disciplines",
        "Musical genres",
        "Technologies",
        "Historical groups",
        "Kitchen utensils",
        "Sports teams"
    ]

    topic_constraints = ""
    if topic == "Syllogisms":
        domain = random.choice(SYLLOGISM_DOMAINS)
        # Must be an f-string: otherwise the literal text "{domain}" is sent to
        # the model and the randomly chosen domain is never used.
        topic_constraints = f"""
- Minimum 3 premises.
- Include at least one universal (All/No) and one particular (Some/Not all).
- Include at least one negative or restrictive statement.
- Require interaction of at least TWO premises.
- Avoid trivial direct chaining (A→B→C only).
- Avoid repetition of a single domain.
- Use categories related to: {domain}.
- At least one distractor must appear logically plausible.
"""

    elif topic == "Seating Arrangements (Circular and Linear)":
        topic_constraints = """
- 6 to 8 named individuals.
- Clearly state whether arrangement is Circular or Linear.
- At least 4 constraints.
- Include at least one negative constraint (e.g., not adjacent).
- Require multi-step deduction.
"""

    elif topic == "Blood Relations and Family Tree":
        topic_constraints = """
- Minimum 5 individuals.
- Include at least TWO generations.
- Require at least TWO inferential steps.
- Avoid trivial sibling-only relations.
- Include indirect relations (e.g., maternal uncle, niece, grandfather).
- The final question must not be answerable in one obvious step.
"""

    elif topic == "Mixed Series (Alphanumeric)":
        topic_constraints = """
- Combine letters and numbers.
- Pattern must require at least TWO operations.
- Avoid simple +1 increments.
- Require reasoning to detect hidden pattern.
"""

    # Doubled braces ({{ }}) emit literal braces for the JSON template.
    prompt = f"""
You are generating ONE difficult logical reasoning MCQ.

Topic: {topic}

Requirements:
{topic_constraints}

General Rules:
- Exactly 4 answer choices labeled A, B, C, D.
- Exactly ONE correct answer.
- No duplicate choices.
- Explanation must be concise (maximum 80 words).
- Output ONLY valid JSON.
- Do NOT include markdown.
- Do NOT include text outside JSON.
- Generate completely NEW content.

Return JSON in this exact structure:

{{
    "topic": "{topic}",
    "question": "Your question ending with a question mark?",
    "choices": [
        "A) First option",
        "B) Second option",
        "C) Third option",
        "D) Fourth option"
    ],
    "answer": "A",
    "explanation": "Concise reasoning explanation."
}}
"""
    return prompt


In [21]:

def generate_question(topic):
    """Generate one MCQ for ``topic`` with the loaded chat model.

    Builds the prompt with ``build_prompt``, wraps it as a single user message
    in the tokenizer's chat template, samples up to 300 new tokens, and parses
    the decoded completion as JSON.

    Args:
        topic: Topic name forwarded to ``build_prompt``.

    Returns:
        The parsed JSON value (normally a dict), or ``None`` when neither the
        whole response nor its outermost ``{...}`` span is valid JSON.
    """
    # Imported locally so the function does not depend on another cell having
    # imported these modules first.
    import json
    import re

    prompt = build_prompt(topic)

    messages = [{"role": "user", "content": prompt}]

    chat = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(chat, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Drop the prompt tokens; keep only what the model generated.
    new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # First attempt: the whole response is the JSON document.
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass

    # Fallback: parse the span from the first "{" to the last "}", in case the
    # model wrapped the JSON in extra text.
    match = re.search(r'\{.*\}', response, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        # Catch only parse failures; a bare ``except`` would also swallow
        # KeyboardInterrupt and genuine bugs.
        return None

In [14]:
# Smoke test: generate a single question and display the parsed result.
# NOTE(review): "Blood Relations" does not match any topic string that
# build_prompt special-cases (it checks for "Blood Relations and Family Tree"),
# so this prompt carries no topic-specific requirements.
q = generate_question("Blood Relations")
q


{'topic': 'Blood Relations',
 'question': "If X is the brother of Y's father's only daughter and Z is the son of X, how is Z related to Y?",
 'choices': ['A) Nephew', 'B) Brother', 'C) Cousin', 'D) Uncle'],
 'answer': 'C',
 'explanation': "X is Y's aunt, making Z Y's cousin."}

In [16]:
# Smoke test: generate a single "Syllogisms" question and display the parsed result.
q = generate_question("Syllogisms")
q


{'topic': 'Syllogisms',
 'question': 'Some artists are not painters. All sculptors are painters. Not all artists are sculptors. Which of the following can be concluded?',
 'choices': ['A) Some artists are not sculptors.',
  'B) All painters are artists.',
  'C) No sculptor is an artist.',
  'D) Some sculptors are not artists.'],
 'answer': 'A',
 'explanation': "From 'Some artists are not painters' and 'All sculptors are painters', it follows that these non-painter artists cannot be sculptors, thus some artists are not sculptors."}

In [22]:
import json

# WARNING: this cell rewrites FILE_PATH in place and is not idempotent -
# every run drops another ENTRIES_TO_REMOVE entries from the end of the file.
FILE_PATH = "generated_mcq_dataset.json"
ENTRIES_TO_REMOVE = 60  # number of trailing entries to discard

with open(FILE_PATH, "r") as f:
    data = json.load(f)

original_length = len(data)
print("Original length:", original_length)

# Drop the trailing entries; a dataset shorter than that is emptied (with a warning).
if original_length >= ENTRIES_TO_REMOVE:
    # Slice by absolute index: data[:-0] would wrongly yield an empty list.
    data = data[:original_length - ENTRIES_TO_REMOVE]
else:
    print(f"Warning: Less than {ENTRIES_TO_REMOVE} entries found.")
    data = []

print("New length:", len(data))

# Save back to file
with open(FILE_PATH, "w") as f:
    json.dump(data, f, indent=2)

# Report the number actually removed, which differs from ENTRIES_TO_REMOVE
# when the dataset was shorter than that.
print(f"Last {original_length - len(data)} entries removed successfully.")


Original length: 950
New length: 890
Last 60 entries removed successfully.


In [24]:
import json
import os
from tqdm import tqdm
# --- Generation run configuration ---

# JSON file the dataset is loaded from (when it exists) and saved to.
OUTPUT_FILE = "generated_mcq_dataset (1).json"

# Number of questions wanted per topic, and how many accepted questions
# pass between interactive pauses.
TARGET_PER_TOPIC, PAUSE_INTERVAL = 300, 50

# Topics are processed in this order by the generation loop.
TOPICS = [
    "Syllogisms",
    "Seating Arrangements (Circular and Linear)",
    "Blood Relations and Family Tree",
    "Mixed Series (Alphanumeric)",
]

# Set to True by the generation loop when the user answers 'q' at a pause.
stop_generation = False

def validate_mcq(data):
    """Check that ``data`` is a structurally valid MCQ record.

    A valid record is a dict containing the keys "topic", "question",
    "choices", "answer" and "explanation", where:
      * "question" is a non-empty string (it is later used as a set member for
        de-duplication, so it must be hashable),
      * "choices" is a list of exactly four distinct strings,
      * "answer" is one of "A", "B", "C", "D".

    Args:
        data: The parsed model output; may be of any type.

    Returns:
        True when the record passes every check, otherwise False. Never raises
        for malformed input.
    """
    if not isinstance(data, dict):
        return False

    required = {"topic", "question", "choices", "answer", "explanation"}
    if not required.issubset(data.keys()):
        return False

    question = data["question"]
    if not isinstance(question, str) or not question.strip():
        return False

    choices = data["choices"]
    # Require a real list: len() would raise on None/int, and a
    # four-character string would otherwise pass the length check.
    if not isinstance(choices, list) or len(choices) != 4:
        return False
    if not all(isinstance(choice, str) for choice in choices):
        return False
    # The prompt demands "No duplicate choices".
    if len(set(choices)) != 4:
        return False

    answer = data["answer"]
    if not isinstance(answer, str) or answer not in ("A", "B", "C", "D"):
        return False

    return True

# Load existing dataset (resume safe)
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, "r") as f:
        dataset = json.load(f)
else:
    dataset = []

# Question texts already collected, used to reject duplicates.
existing_questions = {item["question"] for item in dataset}


def save_dataset():
    """Write the in-memory dataset to OUTPUT_FILE as indented JSON."""
    with open(OUTPUT_FILE, "w") as f:
        json.dump(dataset, f, indent=2)


try:
    for topic in TOPICS:
        print(f"\nGenerating for topic: {topic}")
        topic_count = sum(1 for item in dataset if item["topic"] == topic)

        # Clamp at zero so an over-full topic never yields a negative total.
        # The context manager closes the bar even if generation raises.
        with tqdm(total=max(TARGET_PER_TOPIC - topic_count, 0)) as pbar:
            while topic_count < TARGET_PER_TOPIC and not stop_generation:
                data = generate_question(topic)
                # Skip unparsable or structurally invalid generations.
                if data is None or not validate_mcq(data):
                    continue

                if data["question"] in existing_questions:
                    continue

                # Store the requested topic rather than whatever the model
                # echoed back: the resume logic counts progress by comparing
                # item["topic"] with the entries of TOPICS.
                data["topic"] = topic

                dataset.append(data)
                existing_questions.add(data["question"])
                topic_count += 1
                pbar.update(1)

                # Save every 10 accepted questions, and always before a pause.
                at_pause = topic_count % PAUSE_INTERVAL == 0
                if at_pause or len(dataset) % 10 == 0:
                    save_dataset()

                # Pause every PAUSE_INTERVAL questions per topic for a manual check.
                if at_pause:
                    print("\n" + "="*60)
                    print(f"PAUSED at {topic_count} questions for topic: {topic}")
                    print("Preview of last 3 questions:\n")

                    for item in dataset[-3:]:
                        print(f"Q: {item['question']}")
                        print(f"Answer: {item['answer']}")
                        print("-" * 60)

                    user_input = input("Type 'c' to continue or 'q' to stop safely: ")

                    if user_input.lower() == 'q':
                        print("Stopping safely. Progress saved.")
                        stop_generation = True
                        break

        if stop_generation:
            break
finally:
    # Final save. Also runs on KeyboardInterrupt or an unexpected error, so
    # questions accepted since the last periodic save are not lost.
    save_dataset()

print("\nDataset generation complete.")



Generating for topic: Syllogisms



100%|██████████| 250/250 [8:22:22<00:00, 120.57s/it]
0it [00:00, ?it/s]



Generating for topic: Seating Arrangements (Circular and Linear)


0it [00:00, ?it/s]



Generating for topic: Blood Relations and Family Tree


0it [00:00, ?it/s]



Generating for topic: Mixed Series (Alphanumeric)


0it [00:00, ?it/s]


Dataset generation complete.



