# Optimized Fine-Tuning WizardLM-1.0-Uncensored-CodeLlama-34B with GCS Streaming for Kaggle

## Overview
Fine-tunes `QuixiAI/WizardLM-1.0-Uncensored-CodeLlama-34b` using the pre-merged uncensored Alpaca dataset from GCS (~50K examples, streamed). Uses Unsloth + 4-bit quantization for low VRAM (~10-15GB on T4/P100). **Streams dataset and model checkpoints to/from GCS** to avoid Kaggle disk limits.

**Dataset:** Pre-merged uncensored Alpaca (from previous notebook); streamed from GCS.
**Storage:** All data is streamed to/from a single GCS bucket, so local storage is not overloaded.
**Time:** 30-90 min (subsampled) on T4 GPU.
**Output:** LoRA adapters + full model zipped and streamed to GCS as `wizardlm_fine_tuned.zip` (~5-10GB).

**GCS Bucket:** `gs://wizardlm-training-1759276927/`
- Datasets: `Datasets/merged_uncensored_alpaca`
- Models: `model_output/wizardlm_fine_tuned/`

In [None]:
# Kaggle Environment Cleanup Script
# This script clears cache, checkpoints, and temporary files to prepare for fresh training

import shutil
import os
import gc
import torch

def _remove_path(path):
    """Delete the file, symlink, or directory tree at ``path``."""
    # shutil.rmtree refuses symlinks, so a link that points at a directory is
    # unlinked like a regular file instead of being followed.
    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path)
    else:
        os.remove(path)


def cleanup_kaggle_environment():
    """Clean up Kaggle environment for fresh training.

    Steps, in order:
      1. Recreate /kaggle/working/hf_cache empty (only if it already exists).
      2. Delete the known output/checkpoint paths under /kaggle/working.
      3. Empty the CUDA allocator cache and run the garbage collector.
      4. Delete every entry inside /tmp and /kaggle/tmp (the directories
         themselves are kept); entries that cannot be removed are reported
         and skipped.
      5. Print disk usage for "/".
      6. Remove a fixed set of well-known names (model, tokenizer, ...) from
         this module's globals.

    Raises:
        OSError: If the HuggingFace cache or one of the output paths exists
            but cannot be deleted.
    """
    print("🧹 Starting Kaggle environment cleanup...")

    # Clear HuggingFace cache
    print("📦 Clearing HuggingFace cache...")
    cache_dir = "/kaggle/working/hf_cache"
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
        os.makedirs(cache_dir, exist_ok=True)
        print("✅ HuggingFace cache cleared")
    else:
        print("ℹ️ HuggingFace cache directory not found")

    # Clear any model checkpoints and outputs
    print("📁 Clearing model checkpoints and outputs...")
    checkpoint_paths = [
        "/kaggle/working/outputs",
        "/kaggle/working/final_model",
        "/kaggle/working/lora_adapters",
        "/kaggle/working/wizardlm_fine_tuned.zip",
    ]
    for target in checkpoint_paths:
        if not os.path.exists(target):
            print(f"ℹ️ Not found: {target}")
            continue
        was_directory = os.path.isdir(target) and not os.path.islink(target)
        _remove_path(target)
        if was_directory:
            print(f"✅ Cleared directory: {target}")
        else:
            print(f"✅ Removed file: {target}")

    # Clear PyTorch cache
    print("🔥 Clearing PyTorch cache...")
    try:
        torch.cuda.empty_cache()
        print("✅ CUDA cache cleared")
    except Exception as e:
        print(f"⚠️ CUDA cache clear failed: {e}")

    # Force garbage collection
    gc.collect()
    print("✅ Garbage collection completed")

    # Clear temporary files (contents only; the directories must survive so
    # that later tempfile calls keep working)
    print("🗑️ Clearing temporary files...")
    for temp_dir in ("/tmp", "/kaggle/tmp"):
        if not os.path.exists(temp_dir):
            continue
        try:
            entries = os.listdir(temp_dir)
        except OSError as e:
            print(f"⚠️ Could not clear {temp_dir}: {e}")
            continue
        for entry in entries:
            entry_path = os.path.join(temp_dir, entry)
            try:
                _remove_path(entry_path)
            except OSError as e:
                print(f"⚠️ Could not remove {entry_path}: {e}")
        print(f"✅ Cleared temporary directory: {temp_dir}")

    # Check disk space after cleanup
    print("💾 Checking disk space after cleanup...")
    total, used, free = shutil.disk_usage("/")
    gib = 1024**3
    # Percentage comes from the raw byte counts: the floored GB figures are
    # imprecise and would divide by zero on a filesystem smaller than 1 GB.
    usage_pct = (used / total) * 100 if total else 0.0

    print("📊 Disk usage:")
    print(f"   - Free: {free // gib}GB")
    print(f"   - Used: {used // gib}GB")
    print(f"   - Total: {total // gib}GB")
    print(f"   - Usage: {usage_pct:.1f}%")

    # Clear Python variables (if running in notebook)
    print("🐍 Clearing Python variables...")
    # Common variable names from earlier runs that might conflict
    module_globals = globals()
    for var_name in ('model', 'tokenizer', 'trainer', 'final_dataset', 'training_args'):
        if var_name in module_globals:
            del module_globals[var_name]
            print(f"✅ Cleared variable: {var_name}")

    print("\n🎉 Cleanup completed successfully!")
    print("📋 Next steps:")
    print("   1. Restart the kernel")
    print("   2. Re-run cells 1-7 to set up environment")
    print("   3. Run the fixed Cell 8 for training")
    print("\n💡 Your environment is now clean and ready for fresh training!")

# Run cleanup
# The guard is true when this code executes as the main module (a notebook
# cell or a script) and false when the file is imported.
if __name__ == "__main__":
    cleanup_kaggle_environment()

## 1. Suppress Warnings, Cleanup, and Install Dependencies (CUDA-Compatible)

In [None]:
# Kaggle-Optimized Dependencies Setup for Tesla P100 Fine-Tuning
# This script works with Kaggle's pre-installed packages and avoids conflicts

import os
import shutil
import gc

def initial_cleanup():
    """Free disk space before dependencies are installed.

    Deletes the per-user HuggingFace, torch and pip caches, the non-hidden
    entries of /tmp, and leftover training artifacts in /kaggle/working
    (matched by keyword in the entry name), then prints disk usage for "/".

    Raises:
        OSError: If a cache directory or a matched /kaggle/working entry
            exists but cannot be deleted.
    """
    import glob  # local import: this cell's import list does not include glob

    # Clean common caches
    caches = [
        os.path.expanduser("~/.cache/huggingface"),
        os.path.expanduser("~/.cache/torch"),
        os.path.expanduser("~/.cache/pip"),
        "/tmp/*",
    ]
    for cache in caches:
        if '*' in cache:
            # Expand the pattern in Python rather than shelling out to
            # `rm -rf`; like the shell, glob skips dot-files by default.
            for match in glob.glob(cache):
                try:
                    if os.path.isdir(match) and not os.path.islink(match):
                        shutil.rmtree(match)
                    else:
                        os.remove(match)
                except OSError as e:
                    # Best effort, as with `rm -rf`: report and keep going.
                    print(f"⚠️ Could not remove {match}: {e}")
        elif os.path.exists(cache) and os.path.isdir(cache):
            shutil.rmtree(cache)
            print(f"🧹 Cleared: {cache}")

    # Clean working dir leftovers
    working_dir = "/kaggle/working"
    leftover_keywords = ("checkpoint", "wizardlm", ".zip", ".log", "hf_cache", "fine_tuned")
    # Outside Kaggle the working directory may not exist; skip instead of crashing.
    if os.path.isdir(working_dir):
        for item in os.listdir(working_dir):
            if not any(keyword in item.lower() for keyword in leftover_keywords):
                continue
            item_path = os.path.join(working_dir, item)
            if os.path.isdir(item_path):
                shutil.rmtree(item_path)
            else:
                os.remove(item_path)
            print(f"🧹 Removed: {item}")

    gc.collect()

    # Monitor space
    os.system("df -h /")
    total, used, free = shutil.disk_usage("/")
    print(f"💾 Disk: Total={total//(2**30)}GB, Used={used//(2**30)}GB, Free={free//(2**30)}GB")

import subprocess
import sys

initial_cleanup()
print("✅ Initial cleanup done.")

# Check CUDA availability first
os.system("nvidia-smi")
print("🔍 Checking CUDA setup...")

# Use Kaggle's existing PyTorch (usually compatible)
print("📦 Using Kaggle's existing PyTorch...")

# Install only essential packages that don't conflict
print("📦 Installing essential packages...")

# PEFT (LoRA), TRL (training), GCS client, bitsandbytes (quantization).
# pip is run through the kernel's own interpreter so the packages land in the
# environment this notebook imports from, and a failed install is reported
# instead of being silently ignored.
for package in ("peft", "trl", "google-cloud-storage", "bitsandbytes"):
    pip_result = subprocess.run([sys.executable, "-m", "pip", "install", "-q", package])
    if pip_result.returncode != 0:
        print(f"❌ pip install {package} failed (exit code {pip_result.returncode})")

# Set cache dirs to working space for easy cleanup
os.environ["HF_HOME"] = "/kaggle/working/hf_cache"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.makedirs("/kaggle/working/hf_cache", exist_ok=True)
print("✅ Cache directories set to /kaggle/working.")

# Test CUDA and PyTorch installation
print("🧪 Testing CUDA and PyTorch installation...")
try:
    import torch
    print(f"✅ PyTorch version: {torch.__version__}")
    print(f"✅ CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"✅ CUDA version: {torch.version.cuda}")
        print(f"✅ GPU count: {torch.cuda.device_count()}")
        print(f"✅ GPU name: {torch.cuda.get_device_name(0)}")
        print(f"✅ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
        print(f"✅ CUDA capability: {torch.cuda.get_device_capability(0)}")
    else:
        print("⚠️ CUDA not available - will use CPU (slower)")
except Exception as e:
    print(f"❌ PyTorch/CUDA test failed: {e}")

# Test core imports
print("🧪 Testing core ML library imports...")
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    print("✅ Transformers imported successfully")
except Exception as e:
    print(f"❌ Transformers import failed: {e}")

try:
    from peft import LoraConfig, get_peft_model, TaskType
    print("✅ PEFT imported successfully")
except Exception as e:
    print(f"❌ PEFT import failed: {e}")

try:
    from trl import SFTTrainer
    print("✅ TRL imported successfully")
except Exception as e:
    print(f"❌ TRL import failed: {e}")

try:
    from datasets import load_dataset
    print("✅ Datasets imported successfully")
except Exception as e:
    print(f"❌ Datasets import failed: {e}")

try:
    from google.cloud import storage
    print("✅ Google Cloud Storage imported successfully")
except Exception as e:
    print(f"❌ GCS import failed: {e}")

# Test BitsAndBytesConfig
# Smoke-test that a 4-bit NF4 config can be constructed. The compute dtype is
# float16 to match the config built in the model-loading cell.
# NOTE(review): the P100/T4 GPUs this notebook targets are not expected to
# support bfloat16 natively — confirm before switching back.
try:
    from transformers import BitsAndBytesConfig
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )
    print("✅ BitsAndBytesConfig working")
except Exception as e:
    # Also reached when the earlier torch import failed (torch is undefined).
    print(f"❌ BitsAndBytesConfig failed: {e}")

os.system("nvidia-smi")
os.system("df -h")  # Disk usage

print("\n🎉 Kaggle-optimized dependencies setup completed!")
print("📋 Summary:")
print("   - PyTorch: Using Kaggle's existing version")
print("   - CUDA: Tesla P100 compatible")
print("   - Training: Standard transformers + PEFT + TRL")
print("   - Storage: Google Cloud Storage support")
print("   - Quantization: BitsAndBytesConfig ready")
print("\n💡 Ready for fine-tuning with Kaggle's optimized environment!")

## 2. Setup GCS Authentication and Bucket

In [None]:
import json
import os
from kaggle_secrets import UserSecretsClient
from google.cloud import storage

# GCS Configuration (single bucket from previous dataset generation)
BUCKET_NAME = 'wizardlm-training-1759276927'
DATASET_GCS_PATH = 'Datasets/merged_uncensored_alpaca'  # Pre-merged from previous notebook (Arrow format folder)
MODEL_OUTPUT_PATH = 'model_output/wizardlm_fine_tuned'  # Folder for model outputs

# Authenticate with GCS using Kaggle secret
print("🔐 Authenticating GCS...")
user_secrets = UserSecretsClient()
service_account_json_str = user_secrets.get_secret("GCS_SERVICE_ACCOUNT")

# Treat a missing (None) secret the same as a blank one instead of failing
# with an AttributeError on .strip().
if not service_account_json_str or not service_account_json_str.strip():
    raise ValueError("GCS_SERVICE_ACCOUNT secret is empty! Please add your GCS service account JSON as a Kaggle secret.")

# Validate JSON format
try:
    service_account_json = json.loads(service_account_json_str)
except json.JSONDecodeError as e:
    raise ValueError(f"GCS_SERVICE_ACCOUNT secret is not valid JSON: {e}") from e

# Write the JSON key to a file readable by the current user only.
# NOTE(review): /kaggle/working is presumably persisted as notebook output —
# confirm this key file is removed before a version is saved or shared.
service_account_path = "/kaggle/working/gcs_service_account.json"
key_fd = os.open(service_account_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(key_fd, "w") as f:
    json.dump(service_account_json, f, indent=2)
# The mode passed to os.open only applies to newly created files; tighten a
# pre-existing file as well.
os.chmod(service_account_path, 0o600)

# Set the environment variable for Google Cloud authentication
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_path

print("✅ GCS service account set.")

# Initialize GCS client and bucket with error handling
try:
    client = storage.Client()
    bucket = client.bucket(BUCKET_NAME)
    print(f"✅ Connected to GCS bucket: {BUCKET_NAME}")
except Exception as e:
    raise ValueError(f"Failed to connect to GCS bucket {BUCKET_NAME}: {e}") from e

# Verify pre-merged dataset exists in GCS
try:
    blobs = list(bucket.list_blobs(prefix=DATASET_GCS_PATH))
except Exception as e:
    raise ValueError(f"Error checking dataset in GCS: {e}") from e

# Raised outside the try so the "not found" error is not re-wrapped as a
# generic listing error.
if not blobs:
    raise ValueError(f"Pre-merged dataset not found in GCS: gs://{BUCKET_NAME}/{DATASET_GCS_PATH}/. Run the merge notebook first!")
print(f"✅ Dataset found: gs://{BUCKET_NAME}/{DATASET_GCS_PATH}/ (Files: {len(blobs)})")

# Quick space check
!df -h /

## 3. Login to Hugging Face (for Gated Model/Dataset Access)

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Authenticate HF using Kaggle secret
# The token is read from the Kaggle secret named "HF_TOKEN".
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

# Fail fast with setup instructions when the secret resolves to an empty/falsy value.
if not hf_token:
    raise ValueError("HF_TOKEN secret is empty! Please add your Hugging Face token as a Kaggle secret named 'HF_TOKEN'.")

# Hand the token to huggingface_hub.login.
login(token=hf_token)
print("✅ Hugging Face authenticated.")

## 4. GCS Streaming Dataset Manager (Updated for Pre-Merged Dataset)

In [None]:
# Kaggle-Optimized GCS Streaming Dataset Manager
# This cell handles dataset loading from Google Cloud Storage with Kaggle compatibility

import tempfile
import shutil
import os
import gc
from tqdm import tqdm

# Import datasets with error handling for Kaggle
try:
    from datasets import load_dataset, load_from_disk, Dataset
    print("✅ Datasets library imported successfully")
except Exception as e:
    print(f"❌ Datasets import failed: {e}")
    print("Installing datasets...")
    os.system("pip install datasets -q")
    from datasets import load_dataset, load_from_disk, Dataset

# Import GCS with error handling
try:
    from google.cloud import storage
    print("✅ Google Cloud Storage imported successfully")
except Exception as e:
    print(f"❌ GCS import failed: {e}")
    print("Installing google-cloud-storage...")
    os.system("pip install google-cloud-storage -q")
    from google.cloud import storage

class KaggleGCSDatasetManager:
    """Load Hugging Face datasets stored in a Google Cloud Storage bucket.

    A dataset is addressed by its object-name prefix inside the bucket
    (for example ``Datasets/merged_uncensored_alpaca``).
    """

    def __init__(self, client, bucket_name):
        """Bind the manager to ``bucket_name`` using an existing GCS client."""
        self.client = client
        self.bucket = client.bucket(bucket_name)
        print(f"✅ GCS Dataset Manager initialized for bucket: {bucket_name}")

    def load_dataset_from_gcs(self, name, streaming=False):
        """Load pre-merged dataset from GCS bucket (optimized for Kaggle).

        Args:
            name: Object-name prefix of the dataset inside the bucket.
            streaming: When True, first try to stream ``<name>.jsonl`` with
                ``load_dataset``; if that fails, fall back to downloading
                the ``<name>/`` folder.

        Returns:
            The streaming dataset, or the object returned by
            ``load_from_disk`` for the downloaded folder.

        Raises:
            FileNotFoundError: If no objects exist under ``<name>/``.
            Exception: Any other listing/loading error is re-raised after
                the temporary download directory has been removed.
        """
        print(f"📥 Loading dataset '{name}' from GCS...")

        if streaming:
            # For JSONL streaming (if saved as single file)
            gcs_url = f"gs://{self.bucket.name}/{name}.jsonl"
            try:
                dataset = load_dataset("json", data_files=gcs_url, split="train", streaming=True)
                print(f"✅ Streaming dataset loaded from {gcs_url}")
                return dataset
            except Exception as e:
                print(f"❌ Streaming failed: {e}")
                print("🔄 Falling back to non-streaming...")
                return self.load_dataset_from_gcs(name, streaming=False)

        # For Arrow format folder (from save_to_disk)
        temp_dir = tempfile.mkdtemp()
        local_path = os.path.join(temp_dir, name)
        blob_prefix = f"{name}/"

        try:
            # Download all files from GCS
            blobs = list(self.bucket.list_blobs(prefix=blob_prefix))
            print(f"📁 Found {len(blobs)} files to download")
            if not blobs:
                raise FileNotFoundError(
                    f"No files found under gs://{self.bucket.name}/{blob_prefix}"
                )

            downloaded_files = 0
            for blob in tqdm(blobs, desc="Downloading files"):
                if blob.name.endswith('/'):  # Skip directory markers
                    continue

                relative_path = blob.name[len(blob_prefix):]
                local_file_path = os.path.join(local_path, relative_path)

                # Create directory if needed
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

                try:
                    blob.download_to_filename(local_file_path)
                    downloaded_files += 1
                except Exception as e:
                    print(f"⚠️ Failed to download {blob.name}: {e}")

            print(f"✅ Downloaded {downloaded_files} files")

            # Copy the data into memory: the temporary directory is deleted
            # before returning, so the dataset must not stay backed by the
            # files on disk.
            dataset = load_from_disk(local_path, keep_in_memory=True)

            print(f"✅ Dataset '{name}' loaded from GCS ({len(dataset)} examples)")
            return dataset

        except Exception as e:
            print(f"❌ Dataset loading failed: {e}")
            raise
        finally:
            # Single cleanup point for both the success and the error path.
            shutil.rmtree(temp_dir, ignore_errors=True)
            gc.collect()

    def dataset_exists_in_gcs(self, name):
        """Check if dataset exists in GCS.

        Returns True when at least one object exists under ``<name>/``;
        returns False when none exists or the listing fails.
        """
        try:
            blobs = list(self.bucket.list_blobs(prefix=f"{name}/"))
            exists = len(blobs) > 0
            print(f"🔍 Dataset '{name}' exists in GCS: {exists}")
            return exists
        except Exception as e:
            print(f"❌ Error checking dataset existence: {e}")
            return False

    def list_datasets_in_gcs(self, prefix=""):
        """List available datasets in GCS bucket.

        Args:
            prefix: Object-name prefix to search under (default: whole bucket).

        Returns:
            Sorted list of folder paths found directly below the directory
            part of ``prefix``. Each entry includes that directory part, so
            it can be passed to ``load_dataset_from_gcs`` unchanged. With the
            default prefix these are the bucket's top-level folders. Returns
            an empty list if the listing fails.
        """
        try:
            # Directory part of the prefix ("" when it contains no slash), so
            # that prefix="Datasets/" yields "Datasets/<name>" entries rather
            # than just "Datasets".
            base = prefix[:prefix.rfind('/') + 1]
            datasets = set()
            for blob in self.bucket.list_blobs(prefix=prefix):
                folder, separator, _rest = blob.name[len(base):].partition('/')
                if separator and folder:
                    datasets.add(base + folder)

            names = sorted(datasets)
            print(f"📋 Available datasets: {names}")
            return names
        except Exception as e:
            print(f"❌ Error listing datasets: {e}")
            return []

# Test the manager (this will be used in the main notebook)
def test_gcs_manager():
    """Print usage guidance for the GCS dataset manager.

    No GCS call is made here; exercising the manager for real requires
    credentials, so this only prints how to use it.
    """
    usage_steps = (
        "   1. Initialize: manager = KaggleGCSDatasetManager(client, bucket_name)",
        "   2. Check exists: manager.dataset_exists_in_gcs(dataset_name)",
        "   3. Load dataset: dataset = manager.load_dataset_from_gcs(dataset_name)",
        "   4. List datasets: datasets = manager.list_datasets_in_gcs()",
    )

    print("🧪 Testing GCS Dataset Manager...")
    print("💡 GCS Manager ready for use with proper authentication")
    print("📋 Usage:")
    for step in usage_steps:
        print(step)

# Print the usage guidance
test_gcs_manager()

# Feature summary, emitted in a single print call (one line per entry)
print("\n".join([
    "\n✅ Kaggle GCS Dataset Manager setup completed!",
    "📋 Features:",
    "   - Optimized for Kaggle environment",
    "   - Error handling and fallbacks",
    "   - Progress tracking with tqdm",
    "   - Memory management and cleanup",
    "   - Support for both streaming and non-streaming",
    "\n💡 Ready to use with proper GCS authentication!",
]))

## 5. Stream/Load Pre-Merged Dataset from GCS

In [None]:
# Kaggle-Optimized Dataset Loading from GCS (ZIP File Version)
# This cell loads the dataset from a ZIP file in GCS

import os
import gc
import tempfile
import shutil
import zipfile
from tqdm import tqdm

# Import required libraries with error handling
try:
    from datasets import load_dataset, load_from_disk, Dataset
    print("✅ Datasets library imported successfully")
except Exception as e:
    print(f"❌ Datasets import failed: {e}")
    print("Installing datasets...")
    os.system("pip install datasets -q")
    from datasets import load_dataset, load_from_disk, Dataset

try:
    from google.cloud import storage
    print("✅ Google Cloud Storage imported successfully")
except Exception as e:
    print(f"❌ GCS import failed: {e}")
    print("Installing google-cloud-storage...")
    os.system("pip install google-cloud-storage -q")
    from google.cloud import storage

# Bucket layout used by this cell
BUCKET_NAME = 'wizardlm-training-1759276927'
DATASET_ZIP_PATH = 'Datasets/merged_uncensored_alpaca.zip'  # ZIP archive of the pre-merged dataset
MODEL_OUTPUT_PATH = 'model_output/wizardlm_fine_tuned'

print(f"🔍 Loading dataset from: gs://{BUCKET_NAME}/{DATASET_ZIP_PATH}")

# Open the bucket handle; a failure here is fatal for the rest of the cell
try:
    client = storage.Client()
    bucket = client.bucket(BUCKET_NAME)
except Exception as e:
    print(f"❌ Failed to connect to GCS: {e}")
    raise
else:
    print(f"✅ Connected to GCS bucket: {BUCKET_NAME}")

# Informational listing of everything under Datasets/; errors are reported, not raised
print("📋 Checking available files in bucket...")
try:
    blobs = list(bucket.list_blobs(prefix="Datasets/"))
    print("📁 Files in Datasets folder:")
    for blob in blobs:
        print(f"   - {blob.name} ({blob.size} bytes)")
except Exception as e:
    print(f"❌ Error listing bucket contents: {e}")
# Load dataset from ZIP file in GCS
def _zip_files_with_suffix(root_dir, suffix):
    """Return paths of all files under ``root_dir`` whose name ends with ``suffix``."""
    return [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root_dir)
        for filename in filenames
        if filename.endswith(suffix)
    ]


def _zip_saved_dataset_dirs(root_dir):
    """Return directories under ``root_dir`` that look like ``save_to_disk`` output."""
    # state.json marks a saved Dataset, dataset_dict.json a saved DatasetDict.
    markers = {"state.json", "dataset_dict.json"}
    return [
        dirpath
        for dirpath, _dirnames, filenames in os.walk(root_dir)
        if markers.intersection(filenames)
    ]


def load_dataset_from_zip_gcs(bucket, zip_path):
    """Load dataset from ZIP file in GCS.

    Downloads ``zip_path`` from ``bucket`` into a temporary directory,
    extracts it, and tries these loaders in order until one succeeds:

      1. ``load_from_disk`` on the extraction root and on any nested folder
         that looks like ``save_to_disk`` output
      2. ``load_dataset("json", ...)`` on all ``.json`` files
      3. ``load_dataset("json", ...)`` on all ``.jsonl`` files
      4. ``load_dataset("csv", ...)`` on all ``.csv`` files
      5. one ``{"text": ...}`` row per UTF-8 readable file longer than 10 chars

    The temporary directory is always removed before returning or raising.

    Args:
        bucket: GCS bucket object providing ``blob(path)``.
        zip_path: Object name of the ZIP archive inside the bucket.

    Returns:
        The dataset object produced by the first loader that succeeds.

    Raises:
        ValueError: If every loading method fails.
        Exception: Download and extraction errors propagate unchanged.
    """
    print(f"📥 Loading dataset from ZIP file...")
    temp_dir = tempfile.mkdtemp()
    zip_file_path = os.path.join(temp_dir, "dataset.zip")

    try:
        # Download ZIP file from GCS
        print(f"📦 Downloading ZIP file from GCS...")
        blob = bucket.blob(zip_path)
        blob.download_to_filename(zip_file_path)

        zip_size = os.path.getsize(zip_file_path) / (1024**3)
        print(f"✅ ZIP file downloaded: {zip_size:.2f} GB")

        # Extract ZIP file
        print("📦 Extracting ZIP file...")
        extract_dir = os.path.join(temp_dir, "extracted")
        os.makedirs(extract_dir, exist_ok=True)

        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)

        print("✅ ZIP file extracted successfully")

        # List extracted contents
        print("📋 Extracted contents:")
        for root, dirs, files in os.walk(extract_dir):
            level = root.replace(extract_dir, '').count(os.sep)
            indent = ' ' * 2 * level
            print(f"{indent}{os.path.basename(root)}/")
            subindent = ' ' * 2 * (level + 1)
            for file in files[:5]:  # Show first 5 files
                print(f"{subindent}{file}")
            if len(files) > 5:
                print(f"{subindent}... and {len(files) - 5} more files")

        # Method 1: Try load_from_disk (Arrow format).
        # Archives often wrap the dataset in a top-level folder, so nested
        # save_to_disk directories are tried after the extraction root.
        print("🔄 Trying load_from_disk (Arrow format)...")
        candidates = [extract_dir] + [
            path for path in _zip_saved_dataset_dirs(extract_dir) if path != extract_dir
        ]
        for candidate in candidates:
            try:
                # keep_in_memory=True: the extracted files are deleted before
                # returning, so the dataset must not stay backed by them.
                dataset = load_from_disk(candidate, keep_in_memory=True)
            except Exception as e:
                print(f"❌ load_from_disk failed: {e}")
            else:
                print("✅ Dataset loaded with load_from_disk")
                return dataset

        # Methods 2-4: file-based builders, tried in this order.
        for label, suffix, builder in (
            ("JSON", ".json", "json"),
            ("JSONL", ".jsonl", "json"),
            ("CSV", ".csv", "csv"),
        ):
            data_files = _zip_files_with_suffix(extract_dir, suffix)
            if not data_files:
                continue
            print(f"🔄 Trying {label} loading ({len(data_files)} files)...")
            try:
                dataset = load_dataset(builder, data_files=data_files, split="train")
            except Exception as e:
                print(f"❌ {label} loading failed: {e}")
            else:
                print(f"✅ Dataset loaded from {label} files")
                return dataset

        # Method 5: Try to create dataset from any text content
        print("🔄 Trying to create dataset from text content...")
        try:
            all_data = []
            for file_path in _zip_files_with_suffix(extract_dir, ""):
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read().strip()
                except (OSError, UnicodeError):
                    continue  # unreadable or binary file: not text content
                if content and len(content) > 10:
                    all_data.append({"text": content})

            if all_data:
                dataset = Dataset.from_list(all_data)
                print(f"✅ Dataset created from {len(all_data)} text entries")
                return dataset
        except Exception as e:
            print(f"❌ Text content loading failed: {e}")

        # If all methods fail, show detailed error info
        print("❌ All loading methods failed!")
        print("📋 Available files after extraction:")
        for file_path in _zip_files_with_suffix(extract_dir, ""):
            print(f"   - {file_path} ({os.path.getsize(file_path)} bytes)")

        raise ValueError("Could not load dataset from ZIP file")

    finally:
        # Single cleanup point for every return and error path.
        shutil.rmtree(temp_dir, ignore_errors=True)

# Fetch and load the dataset; on failure print troubleshooting hints, then re-raise
try:
    final_dataset = load_dataset_from_zip_gcs(bucket, DATASET_ZIP_PATH)
    print(f"✅ Dataset loaded successfully: {len(final_dataset)} examples")
except Exception as e:
    print(f"❌ Failed to load dataset: {e}")
    print("\n".join([
        "💡 Troubleshooting tips:",
        "   - Check if ZIP file exists in GCS bucket",
        "   - Verify ZIP file format and contents",
        "   - Check GCS authentication",
        "   - Ensure sufficient disk space",
    ]))
    raise

# Reference columns (standard for Alpaca)
reference_columns = ['instruction', 'input', 'output']
print(f"📋 Expected columns: {reference_columns}")
print(f"📋 Available columns: {final_dataset.column_names}")

# Align/filter columns if needed
if set(reference_columns) - set(final_dataset.column_names):
    print("🔄 Aligning column names...")
    # Rename if necessary (e.g., from merge variations)
    column_map = {'prompt': 'instruction', 'completion': 'output', 'context': 'input'}
    # Skip a rename whose target column already exists; rename_columns is
    # expected to reject such a collision.
    rename_map = {
        k: v for k, v in column_map.items()
        if k in final_dataset.column_names and v not in final_dataset.column_names
    }
    if rename_map:
        final_dataset = final_dataset.rename_columns(rename_map)
        print(f"✅ Renamed columns: {rename_map}")

# If we don't have the expected columns, try to create them from available data
if set(reference_columns) - set(final_dataset.column_names):
    print("🔄 Creating Alpaca format from available data...")

    # Check if we have a 'text' column (common fallback)
    if 'text' in final_dataset.column_names:
        print("📝 Found 'text' column, parsing for Alpaca format...")

        def parse_text_to_alpaca(example):
            """Split a '### Instruction: / ### Input: / ### Response:' prompt into Alpaca fields."""
            text = example['text']
            # Try to parse common formats
            if '### Instruction:' in text and '### Response:' in text:
                parts = text.split('### Instruction:')[1].split('### Response:')
                if len(parts) >= 2:
                    instruction = parts[0].strip()
                    response = parts[1].strip()

                    # Check for input section
                    if '### Input:' in instruction:
                        input_parts = instruction.split('### Input:')
                        instruction = input_parts[0].strip()
                        input_text = input_parts[1].strip() if len(input_parts) > 1 else ""
                    else:
                        input_text = ""

                    return {
                        'instruction': instruction,
                        'input': input_text,
                        'output': response
                    }

            # Fallback: treat entire text as instruction
            # NOTE(review): this yields a placeholder output, not a real
            # response — confirm such rows are wanted in the training data.
            return {
                'instruction': text[:200] + "..." if len(text) > 200 else text,
                'input': "",
                'output': "Please provide a response to this instruction."
            }

        final_dataset = final_dataset.map(parse_text_to_alpaca)
        print("✅ Created Alpaca format from text column")

    else:
        print("⚠️ No suitable columns found for Alpaca format")
        print("📋 Available columns:", final_dataset.column_names)

        # Create a simple instruction-output format
        def create_simple_format(example):
            """Use the first available column as the instruction, with a placeholder output."""
            first_col = list(example.keys())[0]
            return {
                'instruction': str(example[first_col])[:200],
                'input': "",
                'output': "This is a sample response."
            }

        final_dataset = final_dataset.map(create_simple_format)
        print("✅ Created simple instruction format")

# Select only the columns we need
final_dataset = final_dataset.select_columns(reference_columns)
print(f"✅ Selected columns: {final_dataset.column_names}")

# Filter out empty or very short examples.
# Only instruction and output are checked: this cell itself fills 'input'
# with "" in its fallbacks, so 'input' is treated as optional and an empty
# value must not discard the example.
print("🔍 Filtering dataset...")
required_columns = ['instruction', 'output']
initial_size = len(final_dataset)
final_dataset = final_dataset.filter(
    lambda ex: all(len(str(ex.get(col) or '')) > 10 for col in required_columns)
)
filtered_size = len(final_dataset)
print(f"✅ Filtered dataset: {initial_size} -> {filtered_size} examples")

if filtered_size == 0:
    raise ValueError("No examples left after filtering; check the dataset contents and column mapping.")

# Shuffle the dataset
final_dataset = final_dataset.shuffle(seed=42)
print("✅ Dataset shuffled")

# Subsample for Kaggle (adjust as needed): 10% of the data, capped at 5000
# examples and never fewer than 1, so the sample lookup below cannot fail.
sample_size = max(1, min(5000, int(len(final_dataset) * 0.1)))
final_dataset = final_dataset.select(range(sample_size))
print(f"✅ Subsampled to {len(final_dataset)} examples for Kaggle")

# Show sample
print(f"\n📋 Final dataset info:")
print(f"   - Size: {len(final_dataset)} examples")
print(f"   - Columns: {final_dataset.column_names}")
print(f"   - Sample: {final_dataset[0]}")

# Check disk usage
os.system("df -h")

print("\n✅ Dataset loading completed successfully!")
print("📋 Ready for fine-tuning!")

## 6. Load Model and Setup Fine-Tuning (CUDA-Compatible)

In [None]:
# Kaggle Compact Model Loading (Cell 6) - Space Optimized
# Uses smaller model and aggressive space management

import torch
import os
import gc
import shutil

# Import required libraries
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    print("✅ Transformers imported successfully")
except Exception as e:
    print(f"❌ Transformers import failed: {e}")
    os.system("pip install transformers -q")
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

try:
    from peft import LoraConfig, get_peft_model, TaskType
    print("✅ PEFT imported successfully")
except Exception as e:
    print(f"❌ PEFT import failed: {e}")
    os.system("pip install peft -q")
    from peft import LoraConfig, get_peft_model, TaskType

# Aggressive space cleanup
def _empty_directory(path):
    """Delete every entry inside *path* but keep *path* itself; return the errors hit."""
    errors = []
    # Materialize the listing first so entries are not removed mid-iteration.
    with os.scandir(path) as scan:
        entries = list(scan)
    for entry in entries:
        try:
            if entry.is_dir(follow_symlinks=False):
                shutil.rmtree(entry.path)
            else:
                os.remove(entry.path)
        except OSError as e:
            errors.append(e)
    return errors


def aggressive_cleanup(cache_dirs=None, hf_cache_dir="/kaggle/working/hf_cache"):
    """Free disk space and GPU memory by emptying cache directories.

    Args:
        cache_dirs: Directories to empty. Defaults to the HuggingFace, torch
            and pip caches under /root/.cache, /tmp, and ``hf_cache_dir``.
        hf_cache_dir: Cache directory that is (re)created after the cleanup.

    Each directory is emptied in place instead of being removed, so shared
    locations such as /tmp keep existing for tempfile and other tools.
    Entries that cannot be deleted are reported and skipped (best effort).
    """
    print("🧹 Performing aggressive cleanup...")

    if cache_dirs is None:
        cache_dirs = [
            "/root/.cache/huggingface",
            "/root/.cache/torch",
            "/root/.cache/pip",
            "/tmp",
            hf_cache_dir,
        ]

    for cache_dir in cache_dirs:
        if not os.path.isdir(cache_dir):
            continue
        try:
            errors = _empty_directory(cache_dir)
        except OSError as e:
            errors = [e]
        if errors:
            print(f"⚠️ Could not clear {cache_dir}: {errors[0]}")
        else:
            print(f"✅ Cleared: {cache_dir}")

    # Create minimal cache directory
    os.makedirs(hf_cache_dir, exist_ok=True)

    # Clear PyTorch cache
    torch.cuda.empty_cache()
    gc.collect()

    # Check space
    total, used, free = shutil.disk_usage("/")
    free_gb = free // (1024**3)
    print(f"💾 Free space after cleanup: {free_gb}GB")

# Free disk space before anything is downloaded into the cache directory.
aggressive_cleanup()

# Use a much smaller model that fits in Kaggle's space
# NOTE(review): the notebook overview names
# QuixiAI/WizardLM-1.0-Uncensored-CodeLlama-34b, but this cell loads
# DialoGPT-medium instead - confirm which model is intended.
model_name = "microsoft/DialoGPT-medium"  # ~350MB model
print(f"🚀 Using compact model: {model_name}")
print("💡 This model is much smaller and will fit in Kaggle's space limits")

max_seq_length = 512  # Reduced sequence length
print(f"🔍 CUDA available: {torch.cuda.is_available()}")

# Check CUDA and GPU info
if torch.cuda.is_available():
    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
    print(f"✅ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
    print(f"✅ CUDA capability: {torch.cuda.get_device_capability(0)}")
else:
    print("⚠️ CUDA not available - will use CPU")

# Configure minimal quantization (if supported)
# Builds a 4-bit NF4 config with double quantization and float16 compute.
# If construction fails, bnb_config is set to None and the model-loading step
# falls back to an unquantized load.
print("📦 Setting up minimal quantization...")
try:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )
    print("✅ Quantization config ready")
except Exception as e:
    print(f"⚠️ Quantization setup failed: {e}")
    bnb_config = None

# Load tokenizer first
# Files are cached under /kaggle/working/hf_cache; a failure here is fatal and
# re-raised because nothing downstream can run without a tokenizer.
print("📦 Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir="/kaggle/working/hf_cache",
        local_files_only=False
    )
    
    # Add padding token if not present (reuse the end-of-sequence token)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Pad on the right side of each sequence
    tokenizer.padding_side = "right"
    print("✅ Tokenizer loaded successfully")
except Exception as e:
    print(f"❌ Tokenizer loading failed: {e}")
    raise e

# Load model with space optimization
# Three attempts, in order:
#   1. 4-bit quantized load on GPU (only when bnb_config exists and CUDA is available)
#   2. unquantized load (float16 on GPU, float32 on CPU)
#   3. if the chosen attempt raises, a float32 CPU-only load
print("📦 Loading model...")
try:
    if bnb_config and torch.cuda.is_available():
        # Try with quantization
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.float16,
            cache_dir="/kaggle/working/hf_cache",
            low_cpu_mem_usage=True,
        )
    else:
        # Load without quantization
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else "cpu",
            cache_dir="/kaggle/working/hf_cache",
            low_cpu_mem_usage=True,
        )
    
    print("✅ Model loaded successfully")
except Exception as e:
    print(f"❌ Model loading failed: {e}")
    print("🔄 Trying CPU-only loading...")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map="cpu",
            cache_dir="/kaggle/working/hf_cache",
            low_cpu_mem_usage=True,
        )
        print("✅ Model loaded on CPU successfully")
    except Exception as e2:
        print(f"❌ CPU loading also failed: {e2}")
        # Re-raises the FIRST failure (e), not the CPU failure (e2); e2 is
        # only printed above.
        raise e

# Configure LoRA with minimal parameters
# Wraps `model` with LoRA adapters. On any failure the error is printed and
# the unwrapped model is kept, so later cells then operate on the base model.
print("📦 Setting up LoRA...")
try:
    # Get model's attention modules dynamically
    # Collects the full dotted name of every module whose name contains one of
    # the four projection names (substring match).
    target_modules = []
    for name, module in model.named_modules():
        if any(attn in name for attn in ["q_proj", "k_proj", "v_proj", "o_proj"]):
            target_modules.append(name)
    
    if not target_modules:
        # Fallback for models without standard attention names
        # NOTE(review): presumably intended for GPT-2 style layer names - confirm
        # these exist in the loaded model.
        target_modules = ["c_attn", "c_proj"]
    
    print(f"📋 Target modules: {target_modules}")
    
    lora_config = LoraConfig(
        r=8,  # Reduced rank
        lora_alpha=16,
        target_modules=target_modules,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    
    model = get_peft_model(model, lora_config)
    print("✅ LoRA applied successfully")
except Exception as e:
    # Deliberate best effort: training proceeds on the unwrapped model.
    print(f"❌ LoRA application failed: {e}")
    print("⚠️ Continuing without LoRA")

# Report the loaded model: name, parameter counts and current VRAM use
print(f"✅ Model loaded: {model_name}")
print(f"✅ Model parameters: {model.num_parameters():,}")

# Count only the parameters that will receive gradients
trainable_count = 0
for param in model.parameters():
    if param.requires_grad:
        trainable_count += param.numel()
print(f"✅ Trainable parameters: {trainable_count:,}")

allocated_gb = torch.cuda.memory_allocated() / 1e9
print(f"✅ VRAM usage: {allocated_gb:.2f}GB")

# Remaining disk space on the root filesystem, in whole GiB
disk_stats = shutil.disk_usage("/")
total, used, free = disk_stats
free_gb = free // (1024**3)
print(f"💾 Final free space: {free_gb}GB")

# Dump GPU and disk status via the system tools
for status_command in ("nvidia-smi", "df -h"):
    os.system(status_command)

for closing_line in (
    "\n✅ Model loading completed successfully!",
    "📋 Ready for training setup!",
    "💡 Using compact model to fit Kaggle's space constraints",
):
    print(closing_line)

## 7. Setup Training Configuration and Data Formatting

In [None]:
# Kaggle-Optimized Training Configuration (Cell 7)
# This cell sets up training configuration and data formatting

import torch
import os

# Import required libraries
try:
    from transformers import TrainingArguments
    print("✅ TrainingArguments imported successfully")
except Exception as e:
    print(f"❌ TrainingArguments import failed: {e}")
    os.system("pip install transformers -q")
    from transformers import TrainingArguments

# Format dataset for training (Alpaca format)
def formatting_prompts_func(examples):
    """Build Alpaca-style training prompts for a batch of examples.

    Args:
        examples: Batch dict with parallel lists under the keys
            "instruction", "input" and "output".

    Returns:
        Dict with a single "text" key holding one prompt per example. The
        "### Input:" section is left out when the input is missing or blank.
    """
    texts = []
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    for instruction, input_text, output in rows:
        # Real newlines ("\n") separate the sections; the doubled backslash
        # used previously put a literal backslash-n into the training text.
        sections = [f"### Instruction:\n{instruction}"]
        # Treat None the same as an empty input instead of crashing on .strip()
        if input_text is not None and str(input_text).strip():
            sections.append(f"### Input:\n{input_text}")
        sections.append(f"### Response:\n{output}")
        texts.append("\n\n".join(sections))
    return {"text": texts}

# Add the "text" column to every example, then preview the first result
print("📝 Formatting dataset for training...")
try:
    final_dataset = final_dataset.map(formatting_prompts_func, batched=True)
    formatted_count = len(final_dataset)
    print(f"✅ Dataset formatted: {formatted_count} examples")

    # Preview only the first 200 characters of the first prompt
    sample_text = final_dataset[0]['text']
    preview = sample_text[:200]
    print("📋 Sample formatted text (first 200 chars):")
    print(f"   {preview}...")
except Exception as e:
    # Formatting is mandatory for training, so report and propagate
    print(f"❌ Dataset formatting failed: {e}")
    raise e

# Training arguments optimized for Kaggle Tesla P100
print("⚙️ Setting up training configuration...")

# Mixed precision and the 8-bit optimizer both need a GPU. The model-loading
# cell supports a CPU fallback, so only enable them when CUDA is present.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16

training_args = TrainingArguments(
    # Basic training settings
    per_device_train_batch_size=1,  # Reduced for P100 VRAM
    gradient_accumulation_steps=8,  # Increased to maintain effective batch size
    warmup_steps=10,
    max_steps=50,  # Reduced for Kaggle time limits
    learning_rate=2e-4,

    # Precision settings (both False on CPU -> full fp32)
    fp16=use_fp16,
    bf16=use_bf16,

    # Optimization
    optim="adamw_8bit" if cuda_available else "adamw_torch",
    weight_decay=0.01,
    lr_scheduler_type="linear",

    # Logging and saving
    logging_steps=1,
    save_steps=25,
    save_total_limit=2,

    # Memory optimization
    dataloader_pin_memory=False,
    remove_unused_columns=False,

    # Output directory
    output_dir="/kaggle/working/outputs",

    # Reproducibility
    seed=3407,

    # Kaggle-specific optimizations
    dataloader_num_workers=0,  # Avoid multiprocessing issues
    # The string "none" disables every logging integration. Passing None does
    # not: it falls back to the library default, which can enable
    # wandb/tensorboard when they are installed.
    report_to="none",
)

if training_args.bf16:
    precision_label = "bf16"
elif training_args.fp16:
    precision_label = "fp16"
else:
    precision_label = "fp32"

print("✅ Training configuration set")
print(f"📋 Training settings:")
print(f"   - Batch size: {training_args.per_device_train_batch_size}")
print(f"   - Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"   - Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"   - Max steps: {training_args.max_steps}")
print(f"   - Learning rate: {training_args.learning_rate}")
print(f"   - Precision: {precision_label}")

# Check memory usage (device properties can only be queried when a GPU exists)
if cuda_available:
    print(f"💾 Current VRAM usage: {torch.cuda.memory_allocated() / 1e9:.2f}GB")
    print(f"💾 Available VRAM: {(torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated()) / 1e9:.2f}GB")
else:
    print("💾 CUDA not available - skipping VRAM report")

print("\n✅ Training configuration completed!")
print("📋 Ready for trainer initialization!")

## 8. Initialize Trainer and Start Fine-Tuning

In [None]:
# Kaggle-Optimized Training Execution (Cell 8) - Tokenizer Properly Aligned
# This cell properly fixes tokenizer alignment issues

import torch
import os
import gc

# Import required libraries
try:
    from trl import SFTTrainer
    print("✅ SFTTrainer imported successfully")
except Exception as e:
    print(f"❌ SFTTrainer import failed: {e}")
    os.system("pip install trl -q")
    from trl import SFTTrainer

try:
    from transformers import TrainingArguments
    print("✅ TrainingArguments imported successfully")
except Exception as e:
    print(f"❌ TrainingArguments import failed: {e}")
    os.system("pip install transformers -q")
    from transformers import TrainingArguments

# Make sure the tokenizer has a pad token and that the model's configs and
# embedding table agree with the tokenizer's special tokens.
print("🔧 PROPERLY fixing tokenizer alignment...")

# Check current tokenizer state
print(f"📋 Current tokenizer state:")
print(f" - pad_token: {tokenizer.pad_token}")
print(f" - pad_token_id: {tokenizer.pad_token_id}")
print(f" - eos_token: {tokenizer.eos_token}")
print(f" - eos_token_id: {tokenizer.eos_token_id}")
print(f" - bos_token: {tokenizer.bos_token}")
print(f" - bos_token_id: {tokenizer.bos_token_id}")

# Vocabulary size before any token is added. Captured unconditionally: the
# resize check below needs it even when the pad token is already set (the
# usual case, because the model-loading cell assigns it).
old_vocab_size = model.config.vocab_size if hasattr(model, 'config') else len(tokenizer)

# Each method below runs only if the previous ones left pad_token_id unset.
if tokenizer.pad_token is None or tokenizer.pad_token_id is None:
    print("📝 Setting pad token properly...")

    # Method 1: Use eos_token as pad_token
    if tokenizer.eos_token is not None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"✅ Set pad_token to eos_token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")

    # Method 2: No usable eos_token - add a dedicated pad token
    if tokenizer.pad_token_id is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        if tokenizer.pad_token_id is not None:
            print(f"✅ Added new pad_token: [PAD] (ID: {tokenizer.pad_token_id})")

    # Method 3: If still None, use unk_token
    if (
        tokenizer.pad_token_id is None
        and tokenizer.unk_token is not None
        and tokenizer.unk_token_id is not None
    ):
        tokenizer.pad_token = tokenizer.unk_token
        print(f"✅ Set pad_token to unk_token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")

    # Method 4: Last resort - use token 0
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.convert_ids_to_tokens(0)
        print(f"✅ Set pad_token_id to 0: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")

# Verify pad token is properly set. A real exception is used instead of
# assert so the check survives `python -O`.
if tokenizer.pad_token_id is None:
    raise RuntimeError("❌ Failed to set pad_token_id - it is still None!")
print(f"📋 After fixing:")
print(f" - pad_token: {tokenizer.pad_token}")
print(f" - pad_token_id: {tokenizer.pad_token_id}")

# Resize model embeddings if new tokens were added (check against original vocab size)
if tokenizer.pad_token_id >= old_vocab_size or len(tokenizer) > old_vocab_size:
    print("📏 Resizing model embeddings for new tokens...")
    model.resize_token_embeddings(len(tokenizer))
    print("✅ Model embeddings resized")

# Update model config to match tokenizer - Ensure IDs are integers
print("🔧 Updating model config to match tokenizer...")
if hasattr(model, 'config'):
    # eos/bos keep the model's current value when the tokenizer has none
    model.config.pad_token_id = int(tokenizer.pad_token_id)
    if tokenizer.eos_token_id is not None:
        model.config.eos_token_id = int(tokenizer.eos_token_id)
    if tokenizer.bos_token_id is not None:
        model.config.bos_token_id = int(tokenizer.bos_token_id)

    print("✅ Model config updated with tokenizer tokens")
    print(f" - Model pad_token_id: {model.config.pad_token_id}")
    print(f" - Model eos_token_id: {model.config.eos_token_id}")
    print(f" - Model bos_token_id: {model.config.bos_token_id}")

# Also update generation config if it exists
if hasattr(model, 'generation_config') and model.generation_config is not None:
    model.generation_config.pad_token_id = int(tokenizer.pad_token_id)
    if tokenizer.eos_token_id is not None:
        model.generation_config.eos_token_id = int(tokenizer.eos_token_id)
    if tokenizer.bos_token_id is not None:
        model.generation_config.bos_token_id = int(tokenizer.bos_token_id)
    print("✅ Generation config updated with tokenizer tokens")
    print(f" - Gen pad_token_id: {model.generation_config.pad_token_id}")
    print(f" - Gen eos_token_id: {model.generation_config.eos_token_id}")
    print(f" - Gen bos_token_id: {model.generation_config.bos_token_id}")

# Set conservative max sequence length
max_seq_length = 512
print(f"📏 Set max_seq_length to: {max_seq_length}")

# Update training args to match (only when this args class has the attribute)
if hasattr(training_args, 'max_seq_length'):
    training_args.max_seq_length = max_seq_length

# Filter dataset aggressively
print("🔍 Filtering dataset...")
initial_size = len(final_dataset)

def filter_long_sequences(example):
    """Return True when the example's text fits within max_seq_length tokens.

    Examples with missing or empty text, or text that cannot be tokenized,
    are rejected.
    """
    text = example.get('text', '')
    if not text:
        return False

    try:
        # Tokenize WITHOUT truncation. With truncation enabled the result is
        # never longer than max_seq_length, so the length check below would
        # accept every example.
        tokens = tokenizer.encode(text, add_special_tokens=True)
    except Exception as e:
        print(f"⚠️ Tokenization error: {e}")
        return False
    return len(tokens) <= max_seq_length

final_dataset = final_dataset.filter(filter_long_sequences)
filtered_size = len(final_dataset)
print(f"✅ Filtered dataset: {initial_size} -> {filtered_size} examples")

# Stop with a clear message rather than failing later inside the trainer
if filtered_size == 0:
    raise ValueError(
        f"No examples fit within max_seq_length={max_seq_length}; increase the limit or check the data"
    )

# Take small subset for testing
if len(final_dataset) > 50:
    print("📊 Taking small subset for testing...")
    final_dataset = final_dataset.select(range(50))
    print(f"✅ Reduced to {len(final_dataset)} examples")

# Cut every example's text down to the token budget
print("✂️ Truncating sequences...")
def truncate_sequences(example):
    """Round-trip the text through the tokenizer, capped at max_seq_length tokens.

    Empty or missing text yields an empty string; any tokenizer error yields
    a fixed placeholder sentence.
    """
    raw_text = example.get('text', '')
    if not raw_text:
        return {'text': ''}

    try:
        token_ids = tokenizer.encode(
            raw_text,
            add_special_tokens=True,
            truncation=True,
            max_length=max_seq_length,
            padding=False,
        )
        # Decode back to plain text, dropping special tokens
        result = {'text': tokenizer.decode(token_ids, skip_special_tokens=True)}
    except Exception as e:
        print(f"⚠️ Truncation error: {e}")
        result = {'text': 'Short text example.'}
    return result

final_dataset = final_dataset.map(truncate_sequences)
print("✅ Sequences truncated")

# Check TRL version and SFTTrainer parameters
# The SFTTrainer constructor differs between TRL releases, so only keyword
# arguments present in the installed signature are passed.
print("🔍 Checking SFTTrainer parameters...")
import inspect
sft_params = inspect.signature(SFTTrainer.__init__).parameters
print(f"📋 Available SFTTrainer parameters: {list(sft_params.keys())}")

# Initialize trainer with correct latest API
print("🚀 Initializing SFT Trainer with properly aligned tokenizer...")

try:
    trainer_kwargs = {
        "model": model,
        "train_dataset": final_dataset,
        "args": training_args,
    }

    # Hand over the tokenizer that was aligned with the model above. Without
    # it the pad-token fixes made to `tokenizer` never reach the trainer.
    if "processing_class" in sft_params:
        trainer_kwargs["processing_class"] = tokenizer
    elif "tokenizer" in sft_params:
        trainer_kwargs["tokenizer"] = tokenizer

    # Add parameters only if they exist in the current API
    if "dataset_text_field" in sft_params:
        trainer_kwargs["dataset_text_field"] = "text"
    elif "text_field" in sft_params:
        trainer_kwargs["text_field"] = "text"

    if "max_seq_length" in sft_params:
        trainer_kwargs["max_seq_length"] = max_seq_length

    if "dataset_num_proc" in sft_params:
        trainer_kwargs["dataset_num_proc"] = 1

    if "packing" in sft_params:
        trainer_kwargs["packing"] = False

    print(f"📋 Using parameters: {list(trainer_kwargs.keys())}")

    trainer = SFTTrainer(**trainer_kwargs)
    print("✅ Trainer initialized successfully")

except Exception as e:
    print(f"❌ Trainer initialization failed: {e}")
    print("🔄 Trying with minimal parameters...")

    try:
        trainer = SFTTrainer(
            model=model,
            train_dataset=final_dataset,
            args=training_args,
        )
        print("✅ Trainer initialized with minimal parameters")

    except Exception as e2:
        print(f"❌ Minimal initialization failed: {e2}")
        print("🔄 Trying standard Trainer...")

        try:
            from transformers import Trainer

            # NOTE(review): final_dataset holds raw "text" strings at this
            # point and no data collator is passed; presumably training with
            # the plain Trainer needs tokenized inputs - confirm before
            # relying on this fallback.
            trainer = Trainer(
                model=model,
                train_dataset=final_dataset,
                args=training_args,
            )
            print("✅ Standard Trainer initialized successfully")

        except Exception as e3:
            print(f"❌ Standard Trainer failed: {e3}")
            raise

# Summarise the run that is about to start
print("📋 Training configuration:")

dataset_rows = len(final_dataset)
print(f" - Dataset size: {dataset_rows}")

total_params = model.num_parameters()
print(f" - Model parameters: {total_params:,}")

# Only parameters with requires_grad=True are updated during training
trainable_params = 0
for weight in model.parameters():
    if weight.requires_grad:
        trainable_params += weight.numel()
print(f" - Trainable parameters: {trainable_params:,}")

print(f" - Max sequence length: {max_seq_length}")
print(f" - Training steps: {training_args.max_steps}")
print(f" - Pad token ID: {tokenizer.pad_token_id}")
print(f" - EOS token ID: {tokenizer.eos_token_id}")

# GPU memory snapshot before the first step
vram_before_gb = torch.cuda.memory_allocated() / 1e9
print(f"💾 VRAM before training: {vram_before_gb:.2f}GB")
os.system("nvidia-smi")

# Start training
# Runs trainer.train() once; if it raises, prints troubleshooting hints and
# retries on the SAME trainer object with fewer steps. A second failure is
# re-raised.
print("🚀 Starting fine-tuning...")
print("⏱️ This may take 10-30 minutes depending on dataset size...")

try:
    # Clear cache before training
    gc.collect()
    torch.cuda.empty_cache()
    
    # Start training
    trainer.train()
    
    print("✅ Training completed successfully!")
    
except Exception as e:
    print(f"❌ Training failed: {e}")
    print("💡 Possible solutions:")
    print(" - Check if model and dataset are compatible")
    print(" - Verify VRAM is sufficient")
    print(" - Check dataset format")
    print(" - Try reducing batch size or sequence length")
    
    # Try with reduced parameters
    print("🔄 Trying with reduced parameters...")
    try:
        # Update training args with smaller values
        # (mutates the shared training_args object in place)
        training_args.per_device_train_batch_size = 1
        training_args.gradient_accumulation_steps = 2
        training_args.max_steps = 5 # Very small for testing
        
        # Reinitialize trainer with reduced parameters
        # NOTE(review): this only reassigns trainer.args; no new trainer is
        # constructed - confirm the changed values take effect on retry.
        if hasattr(trainer, 'args'):
            trainer.args = training_args
        
        print("🔄 Starting training with reduced parameters...")
        trainer.train()
        print("✅ Training completed with reduced parameters!")
        
    except Exception as e2:
        print(f"❌ Reduced training also failed: {e2}")
        print("💡 Training failed completely. Check:")
        print(" - Model compatibility")
        print(" - Dataset format")
        print(" - Memory availability")
        raise e2

# Check memory after training
print(f"💾 VRAM after training: {torch.cuda.memory_allocated() / 1e9:.2f}GB")

# Save the final model
# Best effort: a save failure is reported but does not stop the notebook.
print("💾 Saving final model...")
try:
    trainer.save_model("/kaggle/working/final_model")
    print("✅ Model saved successfully")
except Exception as e:
    print(f"❌ Model saving failed: {e}")
    print("💡 Model may still be usable despite save failure")

# Check GPU status
os.system("nvidia-smi")
os.system("df -h")

print("\n✅ Training execution completed!")
print("📋 Ready for model saving and upload!")

## 9. Save Model and Upload to GCS

In [None]:
# Kaggle-Optimized Model Saving and GCS Upload (Cell 9)
# This cell saves the fine-tuned model and uploads it to GCS

import os
import zipfile
import shutil
import gc

# Import required libraries
try:
    from google.cloud import storage
    print("✅ Google Cloud Storage imported successfully")
except Exception as e:
    print(f"❌ GCS import failed: {e}")
    os.system("pip install google-cloud-storage -q")
    from google.cloud import storage

# GCS Configuration
BUCKET_NAME = 'wizardlm-training-1759276927'
MODEL_OUTPUT_PATH = 'model_output/wizardlm_fine_tuned'

print("💾 Starting model saving process...")

# Save LoRA adapters (model weights/config plus tokenizer files) locally
print("📦 Saving LoRA adapters...")
try:
    model.save_pretrained("/kaggle/working/lora_adapters")
    tokenizer.save_pretrained("/kaggle/working/lora_adapters")
    print("✅ LoRA adapters saved successfully")
except Exception as e:
    print(f"❌ LoRA adapter saving failed: {e}")
    raise

# Create a zip file with the model
print("📦 Creating model archive...")
try:
    zip_path = "/kaggle/working/wizardlm_fine_tuned.zip"

    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk("/kaggle/working/lora_adapters"):
            for file in files:
                file_path = os.path.join(root, file)
                # Store paths relative to the adapter directory
                arcname = os.path.relpath(file_path, "/kaggle/working/lora_adapters")
                zipf.write(file_path, arcname)

    # Check zip file size
    zip_size = os.path.getsize(zip_path) / (1024**3)
    print(f"✅ Model archive created: {zip_size:.2f} GB")

except Exception as e:
    print(f"❌ Archive creation failed: {e}")
    raise

# Initialize GCS client
print("☁️ Connecting to GCS...")
try:
    client = storage.Client()
    bucket = client.bucket(BUCKET_NAME)
    print(f"✅ Connected to GCS bucket: {BUCKET_NAME}")
except Exception as e:
    print(f"❌ GCS connection failed: {e}")
    raise

# Upload to GCS
print("☁️ Uploading to GCS...")
try:
    blob_name = f"{MODEL_OUTPUT_PATH}/wizardlm_fine_tuned.zip"
    blob = bucket.blob(blob_name)

    # Single blocking upload of the archive (no progress reporting)
    blob.upload_from_filename(zip_path)

    print(f"✅ Model uploaded successfully!")
    print(f"📁 Location: gs://{BUCKET_NAME}/{blob_name}")
    print(f"📊 File size: {zip_size:.2f} GB")

except Exception as e:
    print(f"❌ GCS upload failed: {e}")
    raise

# Verify upload
print("🔍 Verifying upload...")
upload_verified = False
try:
    blob = bucket.blob(blob_name)
    upload_verified = bool(blob.exists())
    if upload_verified:
        print("✅ Upload verification successful")
    else:
        print("❌ Upload verification failed")
except Exception as e:
    print(f"❌ Upload verification error: {e}")

# Cleanup local files to save space - but only once the remote copy is
# confirmed, so a failed or unverifiable upload never leaves us without the
# adapters.
local_files_removed = False
if upload_verified:
    print("🧹 Cleaning up local files...")
    try:
        shutil.rmtree("/kaggle/working/lora_adapters")
        os.remove(zip_path)
        local_files_removed = True
        print("✅ Local files cleaned up")
    except Exception as e:
        print(f"⚠️ Cleanup warning: {e}")
else:
    print(f"⚠️ Upload not verified - keeping local files: {zip_path}")

# Force garbage collection
gc.collect()

# Check disk usage
os.system("df -h")

print("\n✅ Model saving and upload completed!")
print("📋 Summary:")
print(f"   - Model saved to: gs://{BUCKET_NAME}/{blob_name}")
print(f"   - File size: {zip_size:.2f} GB")
if local_files_removed:
    print(f"   - Local files cleaned up")
else:
    print(f"   - Local files kept at: {zip_path}")
print("\n🎉 Fine-tuning pipeline completed successfully!")

## 10. Test the Fine-Tuned Model (Optional)

In [None]:
# Kaggle-Optimized Model Testing (Cell 10)
# This cell tests the fine-tuned model with sample prompts

import torch
import os
import gc

print("🧪 Testing the fine-tuned model...")

# Test prompts for different scenarios
test_prompts = [
    {
        "name": "Python Function",
        "prompt": "### Instruction:\\nWrite a Python function to calculate the factorial of a number.\\n\\n### Response:\\n"
    },
    {
        "name": "Code Explanation", 
        "prompt": "### Instruction:\\nExplain what this Python code does:\\ndef fibonacci(n):\\n    if n <= 1:\\n        return n\\n    return fibonacci(n-1) + fibonacci(n-2)\\n\\n### Response:\\n"
    },
    {
        "name": "Algorithm Question",
        "prompt": "### Instruction:\\nWhat is the time complexity of binary search?\\n\\n### Response:\\n"
    }
]

def test_model_generation(prompt, max_new_tokens=150, temperature=0.7):
    """Generate a sampled completion for *prompt* with the fine-tuned model.

    Args:
        prompt: Text fed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        The generated continuation with the prompt removed and surrounding
        whitespace stripped, or a "❌ Generation failed: ..." message if any
        step raises.
    """
    try:
        # Put the inputs on the same device as the model; the model may have
        # been loaded on CPU even when CUDA is available.
        device = next(model.parameters()).device
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_token_count = inputs["input_ids"].shape[1]

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Keep only the newly generated tokens. Slicing the decoded string by
        # len(prompt) is unreliable because decoding does not always
        # reproduce the prompt character for character.
        generated_ids = outputs[0][prompt_token_count:]
        return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    except Exception as e:
        return f"❌ Generation failed: {e}"

# Test each prompt
print("🔍 Running model tests...")
print("=" * 60)

for i, test_case in enumerate(test_prompts, 1):
    # "\n" (single backslash) prints a blank line; the doubled form printed
    # the two characters backslash + n.
    print(f"\n📝 Test {i}: {test_case['name']}")
    print(f"Prompt: {test_case['prompt'][:100]}...")
    print("-" * 40)

    # Generate response
    response = test_model_generation(test_case['prompt'])

    print(f"Generated Response:")
    print(response)
    print("=" * 60)

# Memory usage check
print(f"\n💾 Memory usage after testing:")
print(f"   - VRAM allocated: {torch.cuda.memory_allocated() / 1e9:.2f}GB")
print(f"   - VRAM cached: {torch.cuda.memory_reserved() / 1e9:.2f}GB")

# Clean up memory
gc.collect()
torch.cuda.empty_cache()

# Final GPU status
os.system("nvidia-smi")

print("\n✅ Model testing completed!")
print("📋 Test Summary:")
print("   - Model responds to different types of prompts")
print("   - Generation quality can be assessed from outputs")
print("   - Memory usage is within acceptable limits")
print("\n🎉 Fine-tuning and testing pipeline completed successfully!")
print(f"📁 Final model available at: gs://{BUCKET_NAME}/{MODEL_OUTPUT_PATH}/wizardlm_fine_tuned.zip")