In [None]:
import os
import time
import fitz  # PyMuPDF
from openai import OpenAI
import pandas as pd
import re  # Add this at the top with other imports

# Initialize the OpenAI-compatible client against the OpenRouter endpoint.
# The key is taken from the OPENROUTER_API_KEY environment variable so that it
# never has to be pasted into the notebook; when the variable is unset the key
# falls back to the empty string.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY", ""),
)

# Instruction template appended to each page's text in process_page().
# It asks the model for 6 items (2 MCQ + 4 open-ended), each wrapped as
# <s>[INST]@ESE577. question[/INST]answer</s>; process_page() splits the reply
# on exactly these markers, so the format section must stay in sync with it.
prompt = """Generate 6 questions (2 MCQ and 4 open-ended) with the following EXACT formats:

MCQ Format:
<s>[INST]@ESE577. [Natural question about ML concepts without explicitly referencing chapters]
A) [option1]
B) [option2]
C) [option3]
D) [option4]
[/INST]Answer: [letter]. [Detailed explanation connecting to course material (e.g., "This concept, covered in Chapter X.Y, demonstrates...") and explaining underlying principles]</s>

Open-ended Format:
<s>[INST]@ESE577. [Conceptual/analytical question focused on understanding rather than chapter recall][/INST][Comprehensive explanation that:
- Links back to specific course sections
- Uses course terminology and notation
- Shows connections between topics
- Includes relevant mathematical foundations]</s>

Guidelines:
1. Question Design:
  - Frame questions naturally without forced chapter references
  - Test understanding rather than recall
  - Focus on concepts, principles, and applications
  - Include practical ML scenarios
  - Encourage critical thinking

2. Answer Requirements:
  - Reference relevant course sections (e.g., "As explained in Chapter X.Y...")
  - Use precise mathematical notation from the course
  - Connect concepts across different topics
  - Include course-specific terminology
  - Explain principles thoroughly

3. MCQ Specifics:
  - Clear, natural language
  - Plausible distractors
  - Similar-length options
  - One definitively correct answer

4. Open-ended Specifics:
  - Analytical depth
  - Real-world applications
  - Integration of concepts
  - Concise but thorough

Questions should feel like natural ML discussions while answers ground them in course material."""

def extract_text_from_page(pdf_path, page_number):
    """Return the stripped plain text of one page of a PDF.

    Args:
        pdf_path: Path of the PDF file, handed to ``fitz.open``.
        page_number: Page index handed to ``Document.load_page``.

    Returns:
        The page text with leading and trailing whitespace removed.

    Raises:
        Any exception raised by ``fitz.open``, ``load_page`` or ``get_text``.
        The document is closed before the exception propagates.
    """
    # Using the document as a context manager guarantees it is closed even
    # when loading the page or extracting its text fails, so a bad page index
    # no longer leaks an open file handle.
    with fitz.open(pdf_path) as doc:
        return doc.load_page(page_number).get_text().strip()

def _parse_qa_pairs(response):
    """Turn a model reply in ``<s>[INST]q[/INST]a</s>`` form into question/answer dicts."""
    parsed = []
    for chunk in response.split('<s>'):
        # Only chunks carrying both markers can hold a complete pair.
        if '[INST]' not in chunk or '[/INST]' not in chunk:
            continue

        sections = chunk.split('[/INST]')
        question_bits = sections[0].split('[INST]')
        if len(question_bits) < 2:
            # '[INST]' only shows up after '[/INST]': malformed chunk, skip it.
            continue

        # The course tag is part of the requested format, not of the question.
        question = question_bits[1].strip().replace('@ESE577.', '').strip()

        answer = sections[1]
        # Drop the closing tag and anything the model wrote after it.
        answer = re.sub(r'</s>.*$', '', answer, flags=re.DOTALL)
        # Drop a trailing "Open-ended:" section header and everything after it.
        answer = re.sub(r'Open-ended:.*$', '', answer, flags=re.DOTALL)
        answer = answer.strip()

        if question and answer:
            parsed.append({'question': question, 'answer': answer})
    return parsed


def process_page(page_num, content, qa_pairs, delay=3):
    """Ask the model for QA pairs about one page and append them to ``qa_pairs``.

    The page text and the module-level ``prompt`` are sent as a single user
    message through the module-level ``client``. The reply is parsed for
    ``<s>[INST]question[/INST]answer</s>`` items.

    Args:
        page_num: Page number quoted in the request message.
        content: Text of the page.
        qa_pairs: List that receives one ``{'question', 'answer'}`` dict per
            parsed pair; it is mutated in place.
        delay: Seconds to sleep after the request for rate limiting
            (default 3, the previous fixed value). Values <= 0 disable the
            pause.

    Returns:
        None.

    Raises:
        Nothing for request or parsing failures: they are reported with
        ``print`` and the page contributes no pairs, so a long run continues.
    """
    message_content = f"""Content from MIT notes page {page_num}: "{content}"
    {prompt}"""

    try:
        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://your-site.com",
                "X-Title": "ESE577-QA-Generator",
            },
            model="anthropic/claude-3.5-sonnet:beta",
            messages=[{"role": "user", "content": message_content}]
        )

        # The message content can be None; treat that as an empty reply.
        response = (completion.choices[0].message.content or "").strip()
        qa_pairs.extend(_parse_qa_pairs(response))
    except Exception as exc:
        # Best effort per page, but make the failure visible instead of
        # silently producing an incomplete dataset.
        print(f"Page {page_num}: request or parsing failed: {exc!r}")

    if delay > 0:
        time.sleep(delay)  # Rate limiting

def main(pdf_path='raw_dataset.pdf', output_file='data/qa_pairs.csv'):
    """Generate QA pairs for every page of a PDF and save them as CSV.

    Each page's text is extracted with ``extract_text_from_page`` and passed
    to ``process_page``, which appends the parsed pairs to a shared list. The
    list is then written to ``output_file`` with ``question`` and ``answer``
    columns.

    Args:
        pdf_path: PDF to read (default ``'raw_dataset.pdf'``).
        output_file: CSV path to write (default ``'data/qa_pairs.csv'``); its
            parent directory is created when missing.

    Returns:
        None.
    """
    # Create the directory the CSV actually goes to, not a fixed name.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    qa_pairs = []

    # Only the page count is needed here; the context manager closes the
    # document even if counting fails.
    with fitz.open(pdf_path) as doc:
        total_pages = len(doc)

    print(f"Processing {total_pages} pages...")

    for page_num in range(total_pages):
        try:
            content = extract_text_from_page(pdf_path, page_num)
            process_page(page_num, content, qa_pairs)
            print(f"Processed page {page_num + 1}/{total_pages}")
        except Exception as exc:
            # Keep going on a bad page, but say which one was lost and why.
            print(f"Skipping page {page_num + 1}/{total_pages}: {exc!r}")
            continue

    # Save to CSV. Naming the columns explicitly keeps the header present even
    # when no pairs were produced, so the file can still be read back.
    df = pd.DataFrame(qa_pairs, columns=['question', 'answer'])
    df.to_csv(output_file, index=False)
    print(f"\nComplete! Generated {len(qa_pairs)} QA pairs")

# Start the whole generation pipeline only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

In [6]:
import pandas as pd

# Read the saved question/answer pairs from disk
file_path = 'data/qa_pairs.csv'  # Point this at the CSV you want to shuffle
df = pd.read_csv(file_path)

# Reorder all rows at random; the fixed seed keeps the order repeatable
shuffle_seed = 42
shuffled_df = df.sample(frac=1, random_state=shuffle_seed)

# Write the shuffled rows out, leaving the index column off (optional step)
shuffled_path = 'shuffled_file.csv'
shuffled_df.to_csv(shuffled_path, index=False)

# Preview the top of the shuffled frame
print(shuffled_df.head())


                                              question  \
198  Compare and contrast the ReLU and step activat...   
349  Compare and contrast the initialization strate...   
33   How does the concept of regret in machine lear...   
208  Which expression correctly represents the grad...   
93   What is the primary purpose of the learning ra...   

                                                answer  
198  The course material in Section 6.3 presents th...  
349  The initialization strategy (Qold(s,a) = 0) sh...  
33   Regret, as presented in the performance metric...  
208  Answer: B. As shown in Section 6.5.1, the grad...  
93   Answer: C. This concept, covered in the gradie...  
