# Optimized Fine-Tuning Uncensored CodeGemma 7B with GCS Streaming for Kaggle

## Overview
Fine-tunes `ICEPVP8977/Uncensored_codegemma_7b` using all specified HF datasets (18 sources merged). Uses Unsloth + 4-bit for low VRAM (~4-6GB). **Streams all datasets and models to/from GCS buckets** for Kaggle compatibility.

**Datasets:** Full merge (~500k-700k examples, ~1.5-2.5GB); streamed from GCS.
**Storage:** All datasets and models are persisted in GCS buckets; local disk is used only for temporary staging during uploads and downloads.
**Time:** 1-2h sampled / 4-8h full on T4 GPU.
**Output:** Streamed to GCS bucket as `fine_tuned_model_f16.gguf` (~14GB).

**GCS Buckets:**
- Models: `gs://wizard-coder-ai-models-1759403941/`
- Datasets: `gs://wizard-coder-datasets-1759403954/`

## 1. Install Dependencies (Unsloth-Optimized + GCS Streaming)

In [None]:
# Unsloth for 2-5x faster, 80% less VRAM
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install -q xformers trl peft accelerate bitsandbytes
# Pinned versions of the Hugging Face stack (installed after Unsloth)
!pip install -q transformers==4.44.2 datasets==2.21.0 huggingface_hub==0.24.6 safetensors==0.4.5

# GCS streaming support
!pip install -q google-cloud-storage gcsfs

# GGUF conversion
!pip install -q llama-cpp-python
# Clone llama.cpp into /tmp; if the clone command fails (stderr is discarded),
# only the "already cloned" notice is printed.
!git clone https://github.com/ggerganov/llama.cpp.git /tmp/llama.cpp 2>/dev/null || echo "llama.cpp already cloned"
# Build inside the checkout, then return to the Kaggle working directory.
%cd /tmp/llama.cpp
!make clean && make -j
%cd /kaggle/working

# Import Unsloth and torch, then report GPU and disk status.
from unsloth import FastLanguageModel
import torch
import os
# NOTE(review): get_device_properties(0) is called unconditionally; presumably it
# raises when no CUDA device is present — confirm a GPU accelerator is enabled.
print(f"CUDA: {torch.cuda.is_available()}, VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
!nvidia-smi
!df -h  # Disk usage

## 2. Setup GCS Authentication and Buckets

In [None]:
from google.cloud import storage
import json
import os

# GCS Configuration
MODELS_BUCKET = 'wizard-coder-ai-models-1759403941'
DATASETS_BUCKET = 'wizard-coder-datasets-1759403954'
PROJECT_ID = 'wizardlm-vertex-1759276927'

# Authenticate with GCS (using Kaggle's built-in auth or service account).
# Every later cell needs `client`, so a failed authentication is re-raised here
# with a clear message instead of surfacing later as a NameError on `client`.
try:
    # Try using default credentials first
    client = storage.Client(project=PROJECT_ID)
except Exception as e:
    print(f"❌ GCS auth failed: {e}")
    print("Please ensure you're running in Kaggle with GCS access enabled")
    # Alternative: Use service account key if uploaded as Kaggle dataset
    # os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/kaggle/input/your-key-dataset/key.json'
    # client = storage.Client(project=PROJECT_ID)
    raise
else:
    print(f"✅ Authenticated with GCS project: {PROJECT_ID}")


def _resolve_bucket(bucket_name):
    """Return a handle for `bucket_name`, creating the bucket when it is missing.

    `client.bucket()` only builds a local handle without contacting GCS, so the
    existence check uses `client.lookup_bucket()`, which returns None for a
    bucket that does not exist. If the lookup or the creation fails (for
    example because the credentials only carry object-level permissions), the
    problem is reported and an unverified handle is returned so object
    uploads/downloads can still be attempted.
    """
    try:
        bucket = client.lookup_bucket(bucket_name)
    except Exception as e:
        print(f"❌ Bucket access failed: {e}")
        return client.bucket(bucket_name)

    if bucket is None:
        print(f"Creating bucket gs://{bucket_name}...")
        try:
            bucket = client.create_bucket(bucket_name)
            print("✅ Buckets created successfully")
        except Exception as create_e:
            print(f"❌ Bucket creation failed: {create_e}")
            return client.bucket(bucket_name)
    return bucket


# Verify buckets exist (creating them when they do not)
models_bucket = _resolve_bucket(MODELS_BUCKET)
datasets_bucket = _resolve_bucket(DATASETS_BUCKET)
print(f"✅ Models bucket: gs://{MODELS_BUCKET}")
print(f"✅ Datasets bucket: gs://{DATASETS_BUCKET}")

## 3. Login to Hugging Face (for Gated Datasets)

In [None]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub before the dataset-loading cell runs.
login()  # Paste HF token (e.g., hf_XXX)

## 4. GCS Streaming Dataset Manager

In [None]:
import tempfile
import shutil
from datasets import load_dataset, concatenate_datasets, Dataset
from tqdm import tqdm
import numpy as np
import pickle
import io

class GCSDatasetManager:
    """Store and retrieve Hugging Face datasets in a GCS bucket.

    Datasets are staged through a local temporary directory: `save_to_disk`
    output is uploaded file by file under the ``<name>/`` prefix, and loading
    downloads that prefix again before calling `Dataset.load_from_disk`.
    """

    def __init__(self, client, datasets_bucket_name):
        """Keep the storage client and a handle to the datasets bucket."""
        self.client = client
        self.datasets_bucket = client.bucket(datasets_bucket_name)

    def save_dataset_to_gcs(self, dataset, name):
        """Save `dataset` to disk in a temp dir and upload it under ``<name>/``.

        Args:
            dataset: Object exposing `save_to_disk(path)`.
            name: Dataset name; used as the GCS prefix.
        """
        print(f"📤 Uploading dataset '{name}' to GCS...")

        with tempfile.TemporaryDirectory() as temp_dir:
            local_path = os.path.join(temp_dir, name)
            dataset.save_to_disk(local_path)

            for root, _dirs, files in os.walk(local_path):
                for file_name in files:
                    local_file_path = os.path.join(root, file_name)
                    # Relative to the dataset directory itself (not its parent),
                    # so objects land at "<name>/<file>" -- the layout that
                    # load_dataset_from_gcs() expects -- instead of
                    # "<name>/<name>/<file>".
                    relative_path = os.path.relpath(local_file_path, local_path)
                    gcs_path = f"{name}/{relative_path.replace(os.sep, '/')}"

                    blob = self.datasets_bucket.blob(gcs_path)
                    blob.upload_from_filename(local_file_path)

        print(f"✅ Dataset '{name}' uploaded to gs://{self.datasets_bucket.name}/{name}/")

    def load_dataset_from_gcs(self, name):
        """Download the ``<name>/`` prefix and return it as an in-memory Dataset.

        Args:
            name: Dataset name used when the dataset was uploaded.

        Returns:
            The loaded `Dataset`.

        Raises:
            FileNotFoundError: If no objects exist under ``<name>/``.
        """
        print(f"📥 Downloading dataset '{name}' from GCS...")

        prefix = f"{name}/"
        temp_dir = tempfile.mkdtemp()
        try:
            local_path = os.path.join(temp_dir, name)

            downloaded = 0
            for blob in self.datasets_bucket.list_blobs(prefix=prefix):
                if blob.name.endswith('/'):  # Skip directory markers
                    continue

                relative_path = blob.name[len(prefix):]
                local_file_path = os.path.join(local_path, *relative_path.split('/'))
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
                blob.download_to_filename(local_file_path)
                downloaded += 1

            if not downloaded:
                raise FileNotFoundError(
                    f"No dataset files found under gs://{self.datasets_bucket.name}/{prefix}"
                )

            # Uploads made with the old doubled layout ("<name>/<name>/...")
            # end up one directory deeper; fall back to that directory.
            legacy_path = os.path.join(local_path, name)
            if not os.path.exists(os.path.join(local_path, 'state.json')) and os.path.isdir(legacy_path):
                local_path = legacy_path

            # keep_in_memory=True copies the data into RAM. Without it the
            # dataset stays memory-mapped to (and writes its cache files next
            # to) the temp files that are deleted below.
            dataset = Dataset.load_from_disk(local_path, keep_in_memory=True)
        finally:
            # Always remove the staging directory, even when loading fails.
            shutil.rmtree(temp_dir, ignore_errors=True)

        print(f"✅ Dataset '{name}' loaded from GCS")
        return dataset

    def dataset_exists_in_gcs(self, name):
        """Return True when at least one object exists under ``<name>/``."""
        # One result is enough to answer the question; avoid listing every shard.
        blobs = self.datasets_bucket.list_blobs(prefix=f"{name}/", max_results=1)
        return any(True for _ in blobs)

# Initialize dataset manager
# Built from the `client` and DATASETS_BUCKET defined in the GCS setup cell.
dataset_manager = GCSDatasetManager(client, DATASETS_BUCKET)

## 5. Load and Stream Datasets to GCS (All 18 Sources)

In [None]:
reference_columns = ['instruction', 'input', 'output']  # Standard for alignment


def _flatten_conversation(ex):
    """Join all human turns into the instruction and all gpt turns into the output."""
    turns = ex['conversations']
    instr = ' '.join([t['value'] for t in turns if t['from'] == 'human'])
    out = ' '.join([t['value'] for t in turns if t['from'] == 'gpt'])
    return {'instruction': instr, 'input': '', 'output': out}


def _map_leaderboard(ex):
    """Map question/context/answer fields onto instruction/input/output."""
    return {'instruction': ex.get('question', ''), 'input': ex.get('context', ''), 'output': ex.get('answer', '')}


def _align_extra_dataset(dataset, gcs_name, columns):
    """Reshape an additional dataset to `columns` and drop unusable rows."""
    if 'airoboros' in gcs_name:
        if 'conversation' in gcs_name:
            # Flatten conversations
            if 'conversations' in dataset.column_names:
                dataset = dataset.map(_flatten_conversation)
        else:
            # Rename columns for airoboros uncensored
            if 'prompt' in dataset.column_names and 'instruction' not in dataset.column_names:
                dataset = dataset.rename_column('prompt', 'instruction')
            if 'completion' in dataset.column_names and 'output' not in dataset.column_names:
                dataset = dataset.rename_column('completion', 'output')
    elif 'leaderboard' in gcs_name:
        dataset = dataset.map(_map_leaderboard)

    # Prompt/completion style sources have no `input` column; add an empty one
    # so select_columns() below does not fail.
    if 'input' in columns and 'input' not in dataset.column_names:
        dataset = dataset.add_column('input', [''] * len(dataset))

    dataset = dataset.select_columns(columns)

    # `input` is legitimately empty for many examples (and always empty for
    # flattened conversations), so only the other columns must be non-empty.
    required = [col for col in columns if col != 'input']
    return dataset.filter(lambda ex: all(ex.get(col) for col in required))


def load_and_stream_datasets():
    """Load every source dataset, caching each one in GCS.

    The merged Alpaca-style dataset and each additional dataset are loaded from
    GCS when already present there; otherwise they are loaded with
    `load_dataset`, aligned to a common column set, and uploaded.

    Returns:
        A tuple ``(loaded_datasets, reference_columns)``: the list of datasets
        (merged Alpaca first) and the column names they are aligned to.
    """
    # Check if merged dataset already exists in GCS
    if dataset_manager.dataset_exists_in_gcs('merged_uncensored_alpaca'):
        print("📥 Loading pre-merged Alpaca dataset from GCS...")
        alpaca_merged = dataset_manager.load_dataset_from_gcs('merged_uncensored_alpaca')
    else:
        print("📤 Creating and uploading Alpaca dataset to GCS...")

        dataset_paths = [
            "V3N0M/Jenna-50K-Alpaca-Uncensored",
            "SaisExperiments/Alpaca-Uncensored",
            "SaisExperiments/Big-Alpaca-Uncensored",
            "xzuyn/open-instruct-uncensored-alpaca",
            "xzuyn/tulu-uncensored-alpaca",
            "xzuyn/tv-alpaca-open-instruct-uncensored-blend",
            "dim/dolphin_flan1m_alpaca_uncensored_3k",
            "dataautogpt3/flan1m-alpaca-uncensored",
            "ShubhVenom/Uncensored-Alpaca-v01",
            "V3N0M/Uncensored-Alpaca",
            "Xennon-BD/Alpaca-uncensored",
            "VinyVan/flanMini-alpaca-uncensored_bambara"
        ]

        # The first dataset defines the columns every other source is cut down to
        first_dataset = load_dataset(dataset_paths[0], split="train")
        first_columns = first_dataset.column_names

        alpaca_parts = [first_dataset]
        for path in tqdm(dataset_paths[1:], desc="Alpaca Loading"):
            part = load_dataset(path, split="train")
            alpaca_parts.append(part.select_columns(first_columns))

        # Merge all datasets
        alpaca_merged = concatenate_datasets(alpaca_parts)
        print(f"Alpaca lines: {len(alpaca_merged)}")

        # Stream to GCS
        dataset_manager.save_dataset_to_gcs(alpaca_merged, 'merged_uncensored_alpaca')

    # Take the reference columns from the merged dataset in BOTH branches.
    # Assigning them only in the "else" branch left the name unbound (it is a
    # local of this function) whenever the merged dataset came from GCS.
    reference_columns = alpaca_merged.column_names

    # Load additional datasets and stream to GCS
    datasets_to_load = [
        ("mrcuddle/airoboros-uncensored", "airoboros_uncensored"),
        ("mrcuddle/airoboros-uncensored-conversation", "airoboros_conversation"),
        ("open-llm-leaderboard/details_ehartford__WizardLM-1.0-Uncensored-CodeLlama-34b", "leaderboard_winogrande", "harness_winogrande_5"),
        ("open-llm-leaderboard/DevQuasar__DevQuasar-R1-Uncensored-Llama-8B-details", "leaderboard_date_understanding", "DevQuasar__DevQuasar-R1-Uncensored-Llama-8B__leaderboard_bbh_date_understanding"),
        ("open-llm-leaderboard/DevQuasar__DevQuasar-R1-Uncensored-Llama-8B-details", "leaderboard_causal_judgement", "DevQuasar__DevQuasar-R1-Uncensored-Llama-8B__leaderboard_bbh_causal_judgement"),
        ("open-llm-leaderboard/DevQuasar__DevQuasar-R1-Uncensored-Llama-8B-details", "leaderboard_boolean_expressions", "DevQuasar__DevQuasar-R1-Uncensored-Llama-8B__leaderboard_bbh_boolean_expressions")
    ]

    loaded_datasets = [alpaca_merged]

    for dataset_info in datasets_to_load:
        # Entries are (repo, gcs_name) or (repo, gcs_name, subset)
        if len(dataset_info) == 2:
            repo_name, gcs_name = dataset_info
            subset_name = None
        else:
            repo_name, gcs_name, subset_name = dataset_info

        if dataset_manager.dataset_exists_in_gcs(gcs_name):
            print(f"📥 Loading {gcs_name} from GCS...")
            dataset = dataset_manager.load_dataset_from_gcs(gcs_name)
        else:
            print(f"📤 Loading and uploading {gcs_name} to GCS...")

            if subset_name:
                dataset = load_dataset(repo_name, subset_name, split="train")
            else:
                dataset = load_dataset(repo_name, split="train")

            dataset = _align_extra_dataset(dataset, gcs_name, reference_columns)

            # Stream to GCS
            dataset_manager.save_dataset_to_gcs(dataset, gcs_name)

        loaded_datasets.append(dataset)
        print(f"✅ {gcs_name}: {len(dataset)} examples")

    return loaded_datasets, reference_columns

# Load all datasets
try:
    all_datasets, reference_columns = load_and_stream_datasets()
    
    # Merge all datasets
    final_dataset = concatenate_datasets(all_datasets)
    final_dataset = final_dataset.filter(lambda ex: ex['output'] and len(ex['output']) > 10)
    final_dataset = final_dataset.shuffle(seed=42)

    # Sample 10% (quality preserved; uncomment for full)
    sample_size = int(len(final_dataset) * 0.1)
    final_dataset = final_dataset.select(range(sample_size))
    # final_dataset = final_dataset  # Full for better quality

    print(f"Final merged (all datasets): {len(final_dataset)} examples (~{len(final_dataset)/1000:.1f}k)")
    
    # Stream final dataset to GCS
    dataset_manager.save_dataset_to_gcs(final_dataset, 'final_merged_dataset')
    
    !df -h  # Check disk

except Exception as e:
    print(f"Error loading datasets: {e}. Using fallback.")
    final_dataset = load_dataset("tatsu-lab/alpaca", split="train[:5000]")  # Small fallback

## 6. Load Model and Setup Fine-Tuning

In [None]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

# Model configuration
model_name = "ICEPVP8977/Uncensored_codegemma_7b"
max_seq_length = 2048  # Adjust based on your needs
dtype = None  # Auto-detect
load_in_4bit = True  # For low VRAM

# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Configure LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                   "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print(f"✅ Model loaded: {model_name}")
print(f"VRAM usage: {torch.cuda.memory_allocated() / 1e9:.2f}GB")
!nvidia-smi

## 7. Prepare Dataset for Training

In [None]:
def formatting_prompts_func(examples, eos_token=None):
    """Render a batch of Alpaca-style rows into single training strings.

    Args:
        examples: Batch dict with parallel "instruction", "input" and "output"
            lists.
        eos_token: End-of-sequence marker appended to every rendered text so the
            model learns where a response ends. When None, the notebook-level
            `tokenizer.eos_token` is used if a tokenizer has been loaded;
            otherwise nothing is appended.

    Returns:
        ``{"text": [...]}`` with one rendered prompt per input row. The
        "### Input:" section is omitted for rows whose input is empty.
    """
    if eos_token is None:
        try:
            eos_token = tokenizer.eos_token or ""
        except NameError:
            # No tokenizer loaded yet: keep the plain text without a terminator.
            eos_token = ""

    texts = []
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    for instruction, input_text, output in rows:
        if input_text:
            text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
        else:
            text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
        texts.append(text + eos_token)
    return {"text": texts}

# Add the rendered "text" column to every row
final_dataset = final_dataset.map(formatting_prompts_func, batched=True)

# Positional 90/10 split: the first 90% of rows train, the remainder evaluates
total_rows = len(final_dataset)
split_at = int(total_rows * 0.9)
train_dataset = final_dataset.select(range(split_at))
eval_dataset = final_dataset.select(range(split_at, total_rows))

print(f"Training examples: {len(train_dataset)}")
print(f"Evaluation examples: {len(eval_dataset)}")
print(f"Sample formatted text: {train_dataset[0]['text'][:200]}...")

## 8. Train Model with Streaming Checkpoints to GCS

In [None]:
class GCSModelManager:
    """Upload and download model directories to/from a GCS bucket."""

    def __init__(self, client, models_bucket_name):
        """Keep the storage client and a handle to the models bucket."""
        self.client = client
        self.models_bucket = client.bucket(models_bucket_name)

    def upload_model_to_gcs(self, local_path, gcs_path):
        """Upload every file under `local_path` to ``<gcs_path>/<relative path>``.

        Args:
            local_path: Local directory to walk recursively.
            gcs_path: Destination prefix in the bucket; leading/trailing slashes
                are ignored.
        """
        print(f"📤 Uploading model to gs://{self.models_bucket.name}/{gcs_path}...")

        prefix = gcs_path.strip('/')
        for root, _dirs, files in os.walk(local_path):
            for file_name in files:
                local_file_path = os.path.join(root, file_name)
                # GCS object names always use "/" regardless of the local OS.
                relative_path = os.path.relpath(local_file_path, local_path).replace(os.sep, '/')
                blob_path = f"{prefix}/{relative_path}" if prefix else relative_path

                blob = self.models_bucket.blob(blob_path)
                blob.upload_from_filename(local_file_path)

        print(f"✅ Model uploaded to gs://{self.models_bucket.name}/{gcs_path}/")

    def download_model_from_gcs(self, gcs_path, local_path):
        """Download every object under the ``<gcs_path>/`` prefix into `local_path`.

        Args:
            gcs_path: Source prefix (a "directory") in the bucket.
            local_path: Local directory to create and fill.
        """
        print(f"📥 Downloading model from gs://{self.models_bucket.name}/{gcs_path}...")

        os.makedirs(local_path, exist_ok=True)

        # Match on "<gcs_path>/" rather than the bare name: a bare prefix such as
        # "fine_tuned_model" would also pull in siblings like
        # "fine_tuned_model_f16.gguf".
        prefix = gcs_path.strip('/')
        if prefix:
            prefix += '/'

        for blob in self.models_bucket.list_blobs(prefix=prefix or None):
            if blob.name.endswith('/'):  # Skip directory markers
                continue

            relative_path = blob.name[len(prefix):]
            local_file_path = os.path.join(local_path, *relative_path.split('/'))

            # Create directory if needed
            os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

            blob.download_to_filename(local_file_path)

        print(f"✅ Model downloaded to {local_path}")

# Initialize model manager
model_manager = GCSModelManager(client, MODELS_BUCKET)

# Training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=100,  # Adjust based on your needs
    learning_rate=2e-4,
    # Use bf16 when the GPU supports it, otherwise fall back to fp16
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/kaggle/working/fine_tuned_model",
    save_steps=50,  # Save checkpoints every 50 steps
    save_total_limit=2,  # Keep only 2 checkpoints
    evaluation_strategy="steps",
    eval_steps=50,  # Evaluate on the same cadence as checkpointing
    load_best_model_at_end=True,
)

# Trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

print("🚀 Starting training...")
trainer.train()

# trainer.train() only leaves checkpoint-* folders in output_dir. Save the model
# that is loaded at the end of training (the best one, per
# load_best_model_at_end) into output_dir itself so the upload below really
# contains the final weights.
trainer.save_model(training_args.output_dir)

# Upload final model to GCS
model_manager.upload_model_to_gcs(training_args.output_dir, "fine_tuned_model")

print("✅ Training completed and model uploaded to GCS!")

## 9. Convert to GGUF and Stream to GCS

In [None]:
# Save model in Hugging Face format
FastLanguageModel.for_training(model)
model.save_pretrained("/kaggle/working/final_model")
tokenizer.save_pretrained("/kaggle/working/final_model")

# Upload HF format to GCS
model_manager.upload_model_to_gcs("/kaggle/working/final_model", "final_model_hf")

# Convert to GGUF
print("🔄 Converting to GGUF format...")
%cd /tmp/llama.cpp

# Convert to GGUF
!python convert_hf_to_gguf.py /kaggle/working/final_model --outfile /kaggle/working/fine_tuned_model_f16.gguf --outtype f16

# Upload GGUF to GCS
print("📤 Uploading GGUF model to GCS...")
blob = model_manager.models_bucket.blob("fine_tuned_model_f16.gguf")
blob.upload_from_filename("/kaggle/working/fine_tuned_model_f16.gguf")

print(f"✅ GGUF model uploaded to gs://{MODELS_BUCKET}/fine_tuned_model_f16.gguf")

# Check file sizes
!ls -lh /kaggle/working/fine_tuned_model_f16.gguf
!df -h

## 10. Test Model and Cleanup

In [None]:
# Test the model
FastLanguageModel.for_inference(model)

# Test prompt
test_prompt = "Write a Python function to calculate fibonacci numbers:"
inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("🤖 Model Response:")
    print(response)

# Summary
print("\n📊 Summary:")
print(f"✅ Datasets streamed to: gs://{DATASETS_BUCKET}/")
print(f"✅ Models streamed to: gs://{MODELS_BUCKET}/")
print(f"✅ Final GGUF model: gs://{MODELS_BUCKET}/fine_tuned_model_f16.gguf")
print(f"✅ Training completed with {len(train_dataset)} examples")

# Cleanup local files to save space
!rm -rf /kaggle/working/fine_tuned_model
!rm -rf /kaggle/working/final_model
!rm -rf /kaggle/working/merged_dataset
!df -h

## 11. Check AI Compute Quota

In [None]:
# Check Vertex AI compute quota
try:
    from google.cloud import aiplatform
    
    # Initialize AI Platform
    aiplatform.init(project=PROJECT_ID, location="us-central1")
    
    print("🔍 Checking Vertex AI compute quotas...")
    
    # Note: Quota checking requires specific permissions
    # This is a placeholder - actual quota checking would require
    # the Cloud Resource Manager API or specific quota APIs
    
    print("📋 To check your AI compute quota:")
    print("1. Go to Google Cloud Console")
    print("2. Navigate to IAM & Admin > Quotas")
    print("3. Filter by 'Vertex AI' or 'AI Platform'")
    print("4. Check your current usage vs limits")
    
    print("\n💡 Common quotas to check:")
    print("- Vertex AI Training GPU hours")
    print("- Vertex AI Prediction requests")
    print("- Compute Engine GPU instances")
    print("- Cloud Storage operations")
    
except Exception as e:
    print(f"❌ Could not check quotas: {e}")
    print("Please check quotas manually in Google Cloud Console")

# Alternative: Check current resource usage
print("\n📊 Current Resource Usage:")
!nvidia-smi
!df -h
!free -h

## 12. Download Instructions for Kaggle

In [None]:
print("📥 Instructions for downloading models in Kaggle:")
# The commands below install the Python GCS client libraries, not the gcloud CLI
print("\n1. Install the GCS client libraries in Kaggle:")
print("   !pip install google-cloud-storage")
print("   !pip install gcsfs")

print("\n2. Authenticate (use service account key):")
print("   import os")
print("   os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/kaggle/input/your-key/key.json'")

print("\n3. Download model:")
print("   from google.cloud import storage")
print("   client = storage.Client()")
print(f"   bucket = client.bucket('{MODELS_BUCKET}')")
print("   blob = bucket.blob('fine_tuned_model_f16.gguf')")
print("   blob.download_to_filename('/kaggle/working/model.gguf')")

print("\n4. Download datasets:")
print("   # Use the GCSDatasetManager class from this notebook")
print(f"   # to download datasets from gs://{DATASETS_BUCKET}/")

print("\n✅ All models and datasets are now available in your GCS buckets!")
print(f"📦 Models: gs://{MODELS_BUCKET}/")
print(f"📊 Datasets: gs://{DATASETS_BUCKET}/")

# Create a simple download script for Kaggle. Only the two bucket names are
# interpolated; everything else is written out literally.
download_script = f"""
# Kaggle Download Script
import os
from google.cloud import storage

# Set up authentication
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/kaggle/input/your-key/key.json'

# Initialize client
client = storage.Client()
models_bucket = client.bucket('{MODELS_BUCKET}')
datasets_bucket = client.bucket('{DATASETS_BUCKET}')

# Download GGUF model
blob = models_bucket.blob('fine_tuned_model_f16.gguf')
blob.download_to_filename('/kaggle/working/fine_tuned_model_f16.gguf')
print('✅ Model downloaded')

# Download datasets (example)
blob = datasets_bucket.blob('final_merged_dataset/...')
# Use GCSDatasetManager for full dataset download
print('✅ Ready for fine-tuning in Kaggle!')
"""

# The script contains non-ASCII characters, so write it with an explicit
# encoding instead of relying on the platform default.
with open('/kaggle/working/kaggle_download_script.py', 'w', encoding='utf-8') as f:
    f.write(download_script)

print("\n📄 Created kaggle_download_script.py for easy Kaggle setup")