In [1]:
import sqlite3
import pandas as pd

# Connect to the database
conn = sqlite3.connect("app.db")

# Query to get prompt, response, and rating, excluding rows with NULL ratings
query = """
SELECT 
    ds.section_content AS response,
    ds.prompt_used AS prompt,
    sf.rating AS rating
FROM documentation_sections ds
INNER JOIN section_feedback sf ON ds.id = sf.section_id
WHERE sf.rating IS NOT NULL;
"""

# Load into DataFrame
df = pd.read_sql_query(query, conn)
conn.close()

# Normalize ratings to a 0-1 scale
df["reward"] = df["rating"] / 5.0  # Convert ratings from 1-5 to 0-1 scale

# Save processed data
df.to_csv("rlhf_preprocessed.csv", index=False)


In [2]:
df

Unnamed: 0,response,prompt,rating,reward
0,### 1. Project Overview\n\n**Project Summary**...,You are a technical documentation expert creat...,4,0.8
1,**Technical Infrastructure**\n================...,You are a technical documentation expert creat...,3,0.6
2,### 3. Component Organization\n\n**Project Str...,You are a technical documentation expert creat...,4,0.8
3,### 4. Dependencies and Requirements\n\n**Tech...,You are a technical documentation expert creat...,5,1.0
4,### 1. Overview and Purpose\n\n**Folder Overvi...,You are a technical documentation expert creat...,4,0.8
5,### 2. Key Functions\n\n**Core Functionality**...,You are a technical documentation expert creat...,5,1.0
6,### 3. Architecture\n\n**Design Patterns**\n\n...,You are a technical documentation expert creat...,3,0.6
7,### 4. Inter-File Relationships\n\n**Component...,You are a technical documentation expert creat...,5,1.0
8,### 1. Overview and Purpose\n\n#### Folder Ove...,You are a technical documentation expert creat...,5,1.0
9,### 2. Key Functions\n\n#### 2.1 Core Function...,You are a technical documentation expert creat...,4,0.8


In [2]:
import sqlite3
import ollama
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModelForCausalLM,get_linear_schedule_with_warmup
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from tqdm import tqdm
from trl import PPOTrainer as TRLPPOTrainer
from torch.utils.data import Dataset
from transformers import BitsAndBytesConfig
from datasets import Dataset
import transformers
import numpy as np
from sklearn.utils import shuffle

from torch.utils.data import DataLoader
generation_kwargs = {
    "max_new_tokens": 128,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
}


  warn(


In [None]:
class DocumentationRLHF:
    def __init__(self, db_path='app.db'):
        self.db_path = db_path
        self._init_db()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Main device: {self.device}")

    def _init_db(self):
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS model_versions (
                    id INTEGER PRIMARY KEY,
                    model_name TEXT NOT NULL,
                    version TEXT NOT NULL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

    def _get_training_data(self):
        """Retrieve training data from joined tables"""
        conn = sqlite3.connect(self.db_path)
        query = """
            SELECT d.doc, f.feedback, f.rating
            FROM documentation d
            JOIN feedback f ON d.path = f.path AND d.level = f.level
            WHERE f.rating IS NOT NULL
            ORDER BY f.timestamp DESC
        """
        return conn.execute(query).fetchall()

    class RewardModelTrainer:
        def __init__(self, db_path):
            self.db_path = db_path
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            print(f"Initializing Reward Model on {self.device}")
            
            self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            self.model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
            
            if torch.cuda.is_available():
                self.model = self.model.to(self.device)
                
            print(f"Reward model device: {next(self.model.parameters()).device}")

        def train(self):
            data = DocumentationRLHF(self.db_path)._get_training_data()
            texts = [d for d, _, _ in data]
            ratings = torch.tensor([r/5 for _, _, r in data], 
                                dtype=torch.float32,
                                device=self.device)
            
            # Check if there's enough data
            if len(texts) == 0:
                raise ValueError("No training data found. Please ensure the database contains valid data.")
            
            optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-5, weight_decay=0.01)
            loss_fn = torch.nn.MSELoss()
            scheduler = get_linear_schedule_with_warmup(
                optimizer, 
                num_warmup_steps=50, 
                num_training_steps=3*max(1, len(texts)//16)  # Ensure at least 1 step
            )

            self.model.train()
            for epoch in range(3):
                total_loss = 0
                # Shuffle the data for each epoch
                texts_shuffled, ratings_shuffled = shuffle(texts, ratings)
                
                # Calculate the number of batches
                num_batches = max(1, len(texts_shuffled) // 16)  # Ensure at least 1 batch
                
                for i in tqdm(range(0, len(texts_shuffled))):
                    batch = self.tokenizer(
                        texts_shuffled[i:i+16],
                        padding=True,
                        truncation=True,
                        max_length=512,
                        return_tensors="pt"
                    ).to(self.device)
                    
                    outputs = self.model(**batch).logits.squeeze()
                    loss = loss_fn(outputs, ratings_shuffled[i:i+16])
                    
                    # Gradient handling
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    
                    total_loss += loss.item()
                
                # Calculate average loss
                avg_loss = total_loss / num_batches
                print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

    class CustomPPOTrainer:
        def __init__(self, db_path, reward_model=None, reward_tokenizer=None):
            self.db_path = db_path
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

            # Model initialization
            self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.padding_side = "left"

            # Use bfloat16 for better stability
            torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
            
            self.model = AutoModelForCausalLMWithValueHead.from_pretrained(
                "meta-llama/Llama-3.2-1B",
                device_map="auto",
                torch_dtype=torch_dtype,
                low_cpu_mem_usage=True
            )
            self.ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
                "meta-llama/Llama-3.2-1B",
                device_map="auto",
                torch_dtype=torch_dtype,
                low_cpu_mem_usage=True
            )

            # Reward model setup
            if reward_model is None:
                self.reward_model = AutoModelForSequenceClassification.from_pretrained(
                    "saved_models/reward_model_rlhf"
                ).to(self.device)
                self.reward_tokenizer = AutoTokenizer.from_pretrained("saved_models/reward_model_rlhf")
            else:
                self.reward_model = reward_model.to(self.device)
                self.reward_tokenizer = reward_tokenizer

            self.reward_model.eval()

            # PPO Configuration with stability enhancements
            self.ppo_config = PPOConfig(
                batch_size=1,
                mini_batch_size=1,
                gradient_accumulation_steps=1,
                learning_rate=1e-6,
                vf_coef=0.1,
                kl_penalty="mse",
                adap_kl_ctrl=True,
                target_kl=1.0,
                init_kl_coef=0.2,
                max_grad_norm=1.0,
                cliprange=0.2,
                cliprange_value=0.2,
                optimize_cuda_cache=True,
                use_score_scaling=True,
                use_score_norm=True
            )

        def _calculate_rewards(self, generated_texts):
            """Calculate and normalize rewards"""
            inputs = self.reward_tokenizer(
                generated_texts,
                padding=True,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)
            
            with torch.no_grad():
                rewards = self.reward_model(**inputs).logits.squeeze()
            
            # Normalize rewards
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
            return rewards.clamp(-5, 5)

        def train(self):
            data = DocumentationRLHF(self.db_path)._get_training_data()
            prompts = [
                f"Improve documentation:\nOriginal: {d}\nFeedback: {f}\nImproved:"
                for d, f, _ in data
            ]

            # Filter and clean prompts
            prompts = [p for p in prompts if len(p) > 10]

            # Tokenize the prompts
            def tokenize_function(examples):
                return self.tokenizer(
                    examples["query"],
                    padding="max_length",
                    truncation=True,
                    max_length=512,
                    return_tensors="pt"  # Return PyTorch tensors directly
                )

            # Create a Dataset object from the prompts
            dataset = Dataset.from_dict({"query": prompts})

            # Tokenize the dataset
            tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=1)

            # Initialize PPOTrainer with the tokenized dataset
            ppo_trainer = PPOTrainer(
                config=self.ppo_config,
                model=self.model,
                ref_model=self.ref_model,
                tokenizer=self.tokenizer,
                dataset=tokenized_dataset  # Pass the tokenized dataset
            )

            generation_kwargs = {
                "do_sample": True,
                "temperature": 0.8,
                "top_p": 0.92,
                "top_k": 50,
                "max_new_tokens": 256,
                "pad_token_id": self.tokenizer.eos_token_id,
                "repetition_penalty": 1.15,
                "no_repeat_ngram_size": 3
            }

            for epoch in range(3):
                for batch in tqdm(ppo_trainer.dataloader, desc=f"PPO Epoch {epoch+1}"):
                    try:
                        
                        # Move batch tensors to the correct device
                        query_tensors = batch["input_ids"]
                        attention_mask = batch["attention_mask"]

                        # Generate responses
                        response_tensors = ppo_trainer.generate(
                            query_tensor=query_tensors,
                            return_prompt=False,
                            **generation_kwargs
                        )

                        # Decode responses
                        generated_texts = [
                            self.tokenizer.decode(r, skip_special_tokens=True)
                            for r in response_tensors
                        ]

                        # Calculate rewards
                        rewards = self._calculate_rewards(generated_texts)

                        # PPO step with stability checks
                        stats = ppo_trainer.step(
                            query_tensors,
                            response_tensors,
                            rewards
                        )
                        
                        # Check for NaN and reset if needed
                        if torch.isnan(torch.tensor(list(stats.values()))).any():
                            print("NaN detected in stats - resetting gradients")
                            ppo_trainer.optimizer.zero_grad()
                            
                    except RuntimeError as e:
                        if 'nan' in str(e).lower():
                            print("NaN detected - skipping batch")
                            continue
                        raise

    

    def log_documentation(self, user_id: str, path: str, doc: str, prompt: str,
                          project_name: str, level: str, root_path: str):
        """Log generated documentation in the database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT INTO documentation
                (user_id, path, doc, prompt, project_name, level, root_path)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (user_id, path, doc, prompt, project_name, level, root_path))

    def add_feedback(self, user_id: str, path: str, level: str, feedback: str, rating: int):
        """Add user feedback to the database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT INTO feedback
                (user_id, path, level, feedback, rating)
                VALUES (?, ?, ?, ?, ?)
            """, (user_id, path, level, feedback, rating))

    def full_training_pipeline(self):
        """Complete training workflow with model saving."""
        # 1. Train reward model
        #reward_trainer = self.RewardModelTrainer(self.db_path)
        #reward_trainer.train()
        
        # Save the trained reward model along with its tokenizer
        reward_model_save_path = "saved_models/reward_model_rlhf"
        #reward_trainer.model.save_pretrained(reward_model_save_path)
        #reward_trainer.tokenizer.save_pretrained(reward_model_save_path)
        #print(f"Reward model and tokenizer saved to {reward_model_save_path}")
    
        # 2. Train with PPO
        # Load saved reward model and tokenizer
        reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_save_path)
        reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_save_path)
        
        # Pass the stored reward model and tokenizer to the PPO trainer
        ppo_trainer = self.CustomPPOTrainer(self.db_path, reward_model=reward_model, reward_tokenizer=reward_tokenizer)
        ppo_trainer.train()
        
        # Save the PPO model
        ppo_model_save_path = "saved_models/ppo_model_rlhf"
        ppo_trainer.model.save_pretrained(ppo_model_save_path)
        ppo_trainer.tokenizer.save_pretrained(ppo_model_save_path)
        print(f"PPO model saved to {ppo_model_save_path}")
    
        # 3. Log new model version
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT INTO model_versions (model_name, version)
                VALUES ('documentation_generator', 'rlhf-v1')
            """)
            conn.commit()
        print("Logged new model version to the database.")

In [4]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [5]:
if __name__ == "__main__":
    rlhf = DocumentationRLHF()
    print(rlhf.device)
   
    # Run training
    rlhf.full_training_pipeline()

Main device: cuda
cuda


  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Main device: cuda


Map:   0%|          | 0/7 [00:00<?, ? examples/s]

PPO Epoch 1:   0%|          | 0/7 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
PPO Epoch 1:   0%|          | 0/7 [23:53<?, ?it/s]


ValueError: Batch size (1) does not match number of examples - but got 512 for: queries

In [None]:
# Load tokenizer
model_path = "saved_models/ppo_model_rlhf"
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

quant_config = BitsAndBytesConfig(
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                llm_int8_enable_fp32_cpu_offload=False
            )

# Load RLHF-trained model with explicit quantization type
model = AutoModelForCausalLM.from_pretrained(
    model_path
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate text
def generate_response(prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        output = model.generate(**inputs, pad_token_id=tokenizer.eos_token_id)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test the trained model
prompt = "What is the importance of AI in healthcare?"
response = generate_response(prompt)

print("Generated Response:\n", response)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Some weights of the model checkpoint at saved_models/ppo_model_rlhf were not used when initializing LlamaForCausalLM: {'v_head.summary.weight', 'v_head.summary.bias'}
- This IS expected if you are initializing LlamaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Generated Response:
 What is the importance of AI in healthcare?!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [None]:
import json

model_path = "saved_models/ppo_model_rlhf/config.json"

with open(model_path, "r") as f:
    config = json.load(f)

print(config.get("quantization_config", "No quantization config found"))


{'_load_in_4bit': True, '_load_in_8bit': False, 'bnb_4bit_compute_dtype': 'float16', 'bnb_4bit_quant_storage': 'uint8', 'bnb_4bit_quant_type': 'fp4', 'bnb_4bit_use_double_quant': False, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'llm_int8_skip_modules': None, 'llm_int8_threshold': 6.0, 'load_in_4bit': True, 'load_in_8bit': False, 'quant_method': 'bitsandbytes'}
