<a href="https://colab.research.google.com/github/Festuskipkoech/AI-Agents/blob/main/DataPreparationForTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# MIT AI Tutor Fine-tuning Pipeline
# Complete pipeline for processing MIT lecture notes and fine-tuning a model
!pip install PyPDF2
import pandas as pd
import json
import re
from typing import List, Dict
import PyPDF2
import requests
from urllib.parse import urlparse
import os
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import torch

class MITDataProcessor:
    """Turn the MIT 6.390 lecture-notes PDF into instruction-tuning examples.

    Typical flow: ``download_pdf`` (optional) -> ``extract_pdf_text`` ->
    ``create_training_examples``.
    """

    # Headings that open a new section: "Chapter 3", "Section 2", or a
    # numbered heading such as "1.2 Title". The lookahead keeps the heading's
    # first capital letter in the section text instead of consuming it.
    _SECTION_PATTERN = re.compile(r'Chapter \d+|Section \d+|\d+\.\d+\s+(?=[A-Z])')

    # One training example is produced per (section, template) pair.
    _TRAINING_TEMPLATES = (
        {
            'instruction': 'Explain this MIT 6.390 concept in simple terms for undergraduate students',
            'complexity': 'beginner'
        },
        {
            'instruction': 'Break down this machine learning concept as an MIT instructor would, but make it accessible',
            'complexity': 'intermediate'
        },
        {
            'instruction': "Teach this concept using MIT's structured approach but with intuitive explanations",
            'complexity': 'structured'
        },
        {
            'instruction': 'Use the Socratic method to help a student understand this ML concept from MIT 6.390',
            'complexity': 'socratic'
        },
    )

    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path  # local path of the PDF to read
        self.raw_text = ""        # filled in by extract_pdf_text()
        self.processed_data = []

    def extract_pdf_text(self) -> str:
        """Extract text from the MIT PDF.

        Returns:
            The text of all pages, each followed by a newline, or ``""`` if
            the file could not be read. On success the text is also stored
            in ``self.raw_text``; on failure ``self.raw_text`` is left as is.
        """
        try:
            with open(self.pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # extract_text() may give None/empty for pages without a text
                # layer; treat those as empty rather than failing the whole file.
                text = "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )
        except Exception as e:  # best-effort: report and signal failure with ""
            print(f"Error extracting PDF: {e}")
            return ""
        self.raw_text = text
        return text

    def download_pdf(self, url: str, filename: str = "mit_lecture_notes.pdf",
                     timeout: float = 60.0) -> str:
        """Download PDF from MIT URL.

        Args:
            url: Location of the PDF.
            filename: Local file to write the download to.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            ``filename`` on success (``self.pdf_path`` is updated to it), or
            ``""`` if the request or the write failed.
        """
        try:
            response = requests.get(url, timeout=timeout)
            # Without this an HTTP error page would be saved as the "PDF".
            response.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(response.content)
        except Exception as e:  # best-effort: report and signal failure with ""
            print(f"Error downloading PDF: {e}")
            return ""
        self.pdf_path = filename
        return filename

    def extract_sections(self, min_length: int = 100,
                         max_length: int = 3000) -> List[Dict]:
        """Extract chapters/sections from ``self.raw_text``.

        Args:
            min_length: Sections whose stripped text is not longer than this
                are dropped.
            max_length: Each kept section is truncated to this many characters.

        Returns:
            A list of dicts with keys ``section_id``, ``content`` and
            ``title``. ``section_id`` is the position of the chunk in the
            split text, so ids may have gaps where short chunks were dropped.
        """
        chunks = self._SECTION_PATTERN.split(self.raw_text)

        sections = []
        # chunks[0] is whatever precedes the first heading; it is skipped.
        for i, chunk in enumerate(chunks[1:], 1):
            body = chunk.strip()
            if len(body) > min_length:
                sections.append({
                    'section_id': i,
                    'content': body[:max_length],
                    'title': f"Section {i}"
                })

        return sections

    def create_training_examples(self) -> List[Dict]:
        """Convert MIT content into training examples with simplified explanations.

        Returns:
            One dict per (section, template) pair with keys ``instruction``,
            ``input`` and ``output``.
        """
        training_examples = []

        for section in self.extract_sections():
            # Simple heuristic: the first 5 non-empty '.'-separated fragments.
            # Fragments are stripped so the re-joined text has single spaces.
            # Note this also splits on decimal points such as "6.390".
            fragments = [part.strip() for part in section['content'].split('.')]
            concept_text = '. '.join([part for part in fragments if part][:5])

            for template in self._TRAINING_TEMPLATES:
                training_examples.append({
                    'instruction': template['instruction'],
                    'input': concept_text,
                    'output': self.generate_simplified_response(
                        concept_text, template['complexity']
                    )
                })

        return training_examples

    def generate_simplified_response(self, content: str, complexity: str) -> str:
        """Generate simplified responses based on complexity level.

        Args:
            content: Source text; only a leading excerpt is embedded.
            complexity: ``'beginner'``, ``'socratic'`` or ``'structured'``;
                any other value gets the intermediate template.

        Returns:
            A fixed template response with an excerpt of ``content`` inserted.
        """

        # This is where you'd ideally use a teacher model to simplify
        # For now, we'll create template responses that maintain MIT structure

        if complexity == 'beginner':
            return f"""In MIT's 6.390, we approach this systematically. Let me break this down simply:

The key idea is: {content[:200]}...

Think of it this way: imagine you're teaching a friend who's never seen machine learning before. The core concept becomes much clearer when we focus on the intuition rather than just the math.

The MIT approach emphasizes understanding the 'why' before the 'how'. This builds stronger foundational knowledge."""

        elif complexity == 'socratic':
            return f"""Great question! Let's think about this together using MIT's approach.

First, what do you think is the main challenge here?

{content[:150]}...

Now, can you see why this approach makes sense? What would happen if we tried a different method?

This is exactly how we tackle problems in 6.390 - by asking the right questions first."""

        elif complexity == 'structured':
            return f"""Following MIT's structured methodology:

**Problem Definition:** {content[:100]}...

**Key Insight:** The fundamental principle here is about finding patterns in data systematically.

**Implementation:** We break this into manageable steps:
1. Understand the mathematical foundation
2. See the intuitive explanation
3. Apply to real examples

**Verification:** How do we know our approach works? MIT teaches us to always validate our understanding."""

        else:  # intermediate
            return f"""In 6.390, we balance mathematical rigor with practical understanding.

{content[:250]}...

The beauty of MIT's approach is that we don't just memorize formulas - we understand why they work. This concept connects to broader themes in machine learning that we'll see throughout the course.

Remember: every complex idea can be broken down into simpler components."""

class MITTutorTrainer:
    """Fine-tune a causal language model with LoRA on instruction examples.

    Expected call order: ``setup_model`` -> ``setup_lora`` ->
    ``prepare_dataset`` -> ``train_model`` -> ``generate_response``.
    """

    def __init__(self, model_name: str = "microsoft/DialoGPT-medium"):
        self.model_name = model_name
        self.tokenizer = None  # set by setup_model()
        self.model = None      # set by setup_model(), wrapped by setup_lora()
        self.training_data = None

    @staticmethod
    def _format_prompt(instruction: str, input_text: str) -> str:
        """Build the prompt prefix shared by training texts and generation."""
        return f"""### Instruction: {instruction}

### Input: {input_text}

### Response:"""

    def setup_model(self):
        """Initialize model and tokenizer"""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Add padding token if it doesn't exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )

    def prepare_dataset(self, training_examples: List[Dict]):
        """Prepare dataset for training.

        Args:
            training_examples: Dicts with ``instruction``, ``input`` and
                ``output`` keys.

        Returns:
            ``(train_dataset, val_dataset)``: tokenized datasets holding the
            first 90% and the last 10% of the examples respectively.

        Raises:
            ValueError: If ``training_examples`` is empty.
        """
        if not training_examples:
            raise ValueError("training_examples is empty; nothing to train on")

        # Format for instruction following: prompt prefix + " " + answer
        formatted_data = [
            {
                'text': f"{self._format_prompt(example['instruction'], example['input'])} {example['output']}"
            }
            for example in training_examples
        ]

        # Create train/validation split (order-preserving, no shuffling)
        split_idx = int(0.9 * len(formatted_data))
        train_data = formatted_data[:split_idx]
        val_data = formatted_data[split_idx:]

        # Convert to Hugging Face datasets
        train_dataset = Dataset.from_list(train_data)
        val_dataset = Dataset.from_list(val_data)

        # Tokenize
        def tokenize_function(examples):
            return self.tokenizer(
                examples['text'],
                truncation=True,
                padding=True,
                max_length=512
            )

        train_dataset = train_dataset.map(tokenize_function, batched=True)
        val_dataset = val_dataset.map(tokenize_function, batched=True)

        return train_dataset, val_dataset

    def setup_lora(self, target_modules=("c_attn",), verbose: bool = True):
        """Setup LoRA for efficient fine-tuning.

        Args:
            target_modules: Names of the sub-modules LoRA adapters attach to.
                The default, ``c_attn``, is the attention projection listed
                under each ``transformer.h.N.attn`` block of the default
                DialoGPT model.
            verbose: When true, print every module name of the model so that
                suitable ``target_modules`` can be chosen for other models.
        """
        if verbose:
            print("Model named modules:")
            for name, _module in self.model.named_modules():
                print(name)

        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=list(target_modules)
        )

        self.model = get_peft_model(self.model, lora_config)

    def train_model(self, train_dataset, val_dataset,
                    output_dir: str = './mit-tutor-model',
                    final_dir: str = './mit-tutor-final'):
        """Train the model and save the result.

        Args:
            train_dataset: Tokenized training set from ``prepare_dataset``.
            val_dataset: Tokenized validation set from ``prepare_dataset``.
            output_dir: Directory for intermediate checkpoints.
            final_dir: Directory the final model and tokenizer are saved to.
        """

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            warmup_steps=100,
            logging_steps=10,
            eval_strategy="steps",
            eval_steps=100,
            save_steps=500,
            learning_rate=5e-5,
            fp16=True,
            # The string "none" disables reporting integrations; passing
            # None still enabled wandb and blocked on its API-key prompt.
            report_to="none"
        )

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
        )

        trainer.train()
        trainer.save_model(final_dir)
        # Save the tokenizer next to the model so the output directory can be
        # loaded on its own for inference.
        self.tokenizer.save_pretrained(final_dir)

    def generate_response(self, instruction: str, input_text: str = "") -> str:
        """Generate response using the trained model.

        Args:
            instruction: Task description placed in the prompt.
            input_text: Optional context placed in the prompt.

        Returns:
            The decoded text after the last ``### Response:`` marker, stripped.
        """
        prompt = self._format_prompt(instruction, input_text)

        # Tensors must live on the same device as the model, which
        # device_map="auto" may have placed on an accelerator.
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Leave training mode so dropout does not perturb generation.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=200,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.split("### Response:")[-1].strip()

# Main execution pipeline
def main():
    """Run the full pipeline: download, extract, build examples, train, test.

    Stops early with a message if the PDF cannot be downloaded, no text can
    be extracted, or no training examples are produced, instead of going on
    to train on empty data.
    """
    # Step 1: Process MIT Data
    print("Processing MIT lecture notes...")
    processor = MITDataProcessor("")

    # Download the PDF
    mit_url = "https://introml.mit.edu/_static/spring24/LectureNotes/6_390_lecture_notes_spring24.pdf"
    pdf_path = processor.download_pdf(mit_url)
    if not pdf_path:  # download_pdf returns "" on failure
        print("Aborting: could not download the lecture notes PDF.")
        return

    # Extract and process content
    if not processor.extract_pdf_text():  # "" means nothing was extracted
        print(f"Aborting: no text could be extracted from {pdf_path}.")
        return
    training_examples = processor.create_training_examples()

    print(f"Created {len(training_examples)} training examples")
    if not training_examples:
        print("Aborting: no training examples were produced.")
        return

    # Save training data
    with open('mit_training_data.json', 'w', encoding='utf-8') as f:
        json.dump(training_examples, f, indent=2)

    # Step 2: Setup and train model
    print("Setting up model for training...")
    trainer = MITTutorTrainer("microsoft/DialoGPT-medium")  # Using smaller model for Colab
    trainer.setup_model()
    trainer.setup_lora()

    # Prepare datasets
    train_dataset, val_dataset = trainer.prepare_dataset(training_examples)

    # Train the model
    print("Starting training...")
    trainer.train_model(train_dataset, val_dataset)

    print("Training completed! Model saved to ./mit-tutor-final")

    # Step 3: Test the model
    print("\nTesting the trained model:")
    test_response = trainer.generate_response(
        "Explain linear regression as an MIT instructor would, but keep it simple",
        "A student is confused about the mathematical foundation"
    )
    print("Response:", test_response)

# Inference-only class for deployment
class MITTutorInference:
    """Load a saved tutor model and answer single student messages with it."""

    def __init__(self, model_path: str):
        # Tokenizer and model are both loaded from the same location.
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)

    def chat(self, user_message: str) -> str:
        """Answer ``user_message`` in the MIT-instructor persona.

        The message is wrapped in the Instruction/Input/Response prompt, up to
        200 new tokens are sampled, and the text following the last
        ``### Response:`` marker is returned with surrounding whitespace removed.
        """
        persona = "Respond as an MIT AI instructor, making complex concepts accessible to students"

        prompt_text = f"""### Instruction: {persona}

### Input: {user_message}

### Response:"""

        encoded = self.tokenizer(prompt_text, return_tensors="pt")

        sampling_options = dict(
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Gradients are not needed for generation.
        with torch.no_grad():
            generated = self.model.generate(**encoded, **sampling_options)

        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)

        # Keep only what comes after the final response marker (or the whole
        # text when the marker is absent).
        _, _, answer = decoded.rpartition("### Response:")
        return answer.strip()

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Processing MIT lecture notes...
Created 1144 training examples
Setting up model for training...
Model named modules:

transformer
transformer.wte
transformer.wpe
transformer.drop
transformer.h
transformer.h.0
transformer.h.0.ln_1
transformer.h.0.attn
transformer.h.0.attn.c_attn
transformer.h.0.attn.c_proj
transformer.h.0.attn.attn_dropout
transformer.h.0.attn.resid_dropout
transformer.h.0.ln_2
transformer.h.0.mlp
transformer.h.0.mlp.c_fc
transformer.h.0.mlp.c_proj
transformer.h.0.mlp.act
transformer.h.0.mlp.dropout
transformer.h.1
transformer.h.1.ln_1
transformer.h.1.attn
transformer.h.1.attn.c_attn
transformer.h.1.attn.c_proj
transformer.h.1.attn.attn_dropout
transformer.h.1.attn.resid_dropout
transformer.h.1.ln_2
transformer.h.1.mlp
transformer.h.1.mlp.c_fc
transformer.h.1.mlp.c_proj
transformer.h.1.mlp.act
transformer.h.1.mlp.dropout
transformer.h.2
transformer.h.2.ln_1
transformer.h.2.attn
transformer.h.2.attn.c_attn
transformer.h.2.attn.c_proj
transformer.h.2.attn.attn_dropout
tra



Map:   0%|          | 0/1029 [00:00<?, ? examples/s]

Map:   0%|          | 0/115 [00:00<?, ? examples/s]

Starting training...


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter: