## 1️⃣ Clone Repository & Install Dependencies

In [None]:
# Clone the repository (or upload your files)
!git clone https://github.com/YOUR_USERNAME/LLM_Fine-Tuning_Platform.git
%cd LLM_Fine-Tuning_Platform

# Or if you want to upload manually, uncomment:
# from google.colab import files
# print("Upload your project as a zip file")
# uploaded = files.upload()
# !unzip LLM_Fine-Tuning_Platform.zip
# %cd LLM_Fine-Tuning_Platform

In [None]:
# Install all dependencies
%%capture
!pip install -r requirements.txt
!pip install nest-asyncio pyngrok  # Additional for Colab

In [None]:
# Report whether PyTorch can see a CUDA device and, if so, its name and memory.
import torch

cuda_ok = torch.cuda.is_available()
print(f"üéÆ CUDA available: {cuda_ok}")

if not cuda_ok:
    print("‚ö†Ô∏è No GPU found! Enable GPU runtime.")
else:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_props = torch.cuda.get_device_properties(0)
    # total_memory is reported in bytes; convert to GiB for display
    print(f"üéØ GPU: {gpu_name}")
    print(f"üíæ GPU Memory: {gpu_props.total_memory / 1024**3:.2f} GB")

## 2️⃣ Setup Redis (In-Memory Broker)

In [None]:
# Install and start Redis
%%capture
!apt-get install -y redis-server
!redis-server --daemonize yes

# Verify Redis is running
import time
time.sleep(2)
!redis-cli ping

## 3️⃣ Setup Environment Variables

In [None]:
import os
from getpass import getpass

from google.colab import userdata

# Read the HuggingFace token from Colab Secrets (key icon in the left sidebar).
# If the lookup fails or yields nothing, prompt for it instead; getpass keeps
# the typed token out of the saved cell output, unlike input().
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except Exception:  # not a bare except, so Ctrl-C still interrupts the cell
    HF_TOKEN = None

if not HF_TOKEN:
    HF_TOKEN = getpass("Enter your HuggingFace token: ")

# Set environment variables (values must be strings)
os.environ['HF_TOKEN'] = HF_TOKEN
os.environ['REDIS_HOST'] = 'localhost'
os.environ['REDIS_PORT'] = '6379'
os.environ['STORAGE_TYPE'] = 'local'

print("‚úÖ Environment configured")

## 4️⃣ Create Test Dataset

In [None]:
# Write a small instruction-tuning dataset to example_data.jsonl (one JSON
# object per line) unless a file with that name is already present.
import json

if os.path.exists('example_data.jsonl'):
    print("‚úÖ example_data.jsonl already exists")
else:
    # (instruction, output, input) triples; turned into records below
    sample_rows = [
        ("What is machine learning?", "Machine learning is a subset of artificial intelligence that enables computers to learn from data.", ""),
        ("Write a Python function to add two numbers", "def add(a, b):\n    return a + b", ""),
        ("Translate to Spanish", "Hola, ¬øc√≥mo est√°s?", "Hello, how are you?"),
        ("What is the capital of France?", "The capital of France is Paris.", ""),
        ("Explain photosynthesis", "Photosynthesis is the process by which plants convert sunlight into energy.", ""),
    ]
    example_data = [
        {"instruction": prompt, "output": answer, "input": context}
        for prompt, answer, context in sample_rows
    ]

    with open('example_data.jsonl', 'w') as f:
        f.writelines(json.dumps(item) + '\n' for item in example_data)

    print("‚úÖ Created example_data.jsonl")

## 5️⃣ Start Celery Worker in Background

In [None]:
# Start Celery worker in background
import subprocess
import time

# Kill any existing celery workers. The pattern has to allow for the
# `-A phase2.celery_config` arguments that sit between "celery" and "worker" on
# the real command line; the literal "celery worker" never matched it.
# pkill exits non-zero when nothing matched, which is fine here.
subprocess.run(['pkill', '-f', 'celery.*worker'], check=False)
time.sleep(2)

# Start worker in background. Its output goes to a log file instead of
# subprocess.PIPE: nothing in this notebook reads those pipes, so the worker
# would block on a log write once the OS pipe buffer filled up.
WORKER_LOG_PATH = 'celery_worker.log'
with open(WORKER_LOG_PATH, 'w') as worker_log:
    # The child keeps its own copy of the file descriptor, so closing our
    # handle when the `with` block ends does not affect the worker.
    worker_process = subprocess.Popen(
        ['celery', '-A', 'phase2.celery_config', 'worker', '--loglevel=info', '-Q', 'training'],
        stdout=worker_log,
        stderr=subprocess.STDOUT,
        text=True
    )

print("üöÄ Starting Celery worker...")
time.sleep(5)

# Check if worker started: the process must still be alive and must answer
# an `inspect` request (the command exits non-zero when no worker replies).
result = subprocess.run(
    ['celery', '-A', 'phase2.celery_config', 'inspect', 'active'],
    capture_output=True,
    text=True
)

worker_alive = worker_process.poll() is None
if worker_alive and result.returncode == 0:
    print("‚úÖ Celery worker started successfully")
else:
    print("‚ö†Ô∏è Worker may not have started. Check logs below:")
    print(result.stderr[:500])
    # Show the tail of the worker's own log as well
    with open(WORKER_LOG_PATH) as log_file:
        print(log_file.read()[-2000:])

## 6️⃣ Setup ngrok for Public API Access

In [None]:
# Setup ngrok to expose FastAPI publicly
from google.colab import userdata
from pyngrok import ngrok

# Optional: Set your ngrok auth token for better limits
# Get one free at: https://dashboard.ngrok.com/get-started/your-authtoken
try:
    NGROK_TOKEN = userdata.get('NGROK_TOKEN')
    ngrok.set_auth_token(NGROK_TOKEN)
except Exception:  # narrowed from a bare except so Ctrl-C still interrupts
    print("‚ö†Ô∏è No ngrok token found. Using free tier (may have limits)")

# Kill any existing ngrok tunnels
ngrok.kill()

print("‚úÖ ngrok configured")

## 7️⃣ Start FastAPI Server

In [None]:
# Run FastAPI with ngrok in background
import nest_asyncio
import uvicorn
from threading import Thread

# Allow nested event loops (required for Colab)
nest_asyncio.apply()

# Import FastAPI app
from phase2.api import app

# Start ngrok tunnel.
# Current pyngrok returns an NgrokTunnel object whose str() is a description
# of the tunnel rather than the bare URL, so the address is read from
# `.public_url`. The getattr fallback keeps a plain-string return value working.
tunnel = ngrok.connect(8000, bind_tls=True)
public_url = getattr(tunnel, 'public_url', tunnel)
print(f"\n{'='*60}")
print(f"üåê Public API URL: {public_url}")
print(f"üìö API Docs: {public_url}/docs")
print(f"‚ù§Ô∏è Health Check: {public_url}/health")
print(f"{'='*60}\n")

# Start FastAPI server in background thread
def run_server():
    """Serve `app` with uvicorn on port 8000 (blocks the calling thread)."""
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")

# daemon=True so the thread never keeps the kernel process alive on shutdown
server_thread = Thread(target=run_server, daemon=True)
server_thread.start()

print("‚úÖ FastAPI server started")
print("\n‚ö†Ô∏è Keep this cell running! The server will stop if you interrupt it.")

## 8️⃣ Test the API

In [None]:
# Look up the public ngrok address, remember it as API_URL for the following
# cells, and probe the /health endpoint once.
import time
import requests

# Give the server a few seconds to come up before probing it
time.sleep(3)

tunnels = ngrok.get_tunnels()
if not tunnels:
    print("‚ö†Ô∏è No ngrok tunnel found. Make sure the server is running.")
else:
    # First tunnel's address is the one used by the rest of the notebook
    API_URL = tunnels[0].public_url
    print(f"API URL: {API_URL}")

    health_url = f"{API_URL}/health"
    try:
        response = requests.get(health_url, timeout=10)
        print(f"\n‚úÖ Health Check: {response.json()}")
    except Exception as e:
        # Covers connection errors as well as a non-JSON response body
        print(f"‚ö†Ô∏è Health check failed: {e}")

## 9️⃣ Upload Dataset

**Important Workflow:**
1. Upload dataset → Get `job_id`
2. Use the same `job_id` when starting training
3. This ensures the training job can find your uploaded dataset

In [None]:
# POST example_data.jsonl to the /upload endpoint as multipart form data and,
# on success, keep the returned job_id in UPLOAD_JOB_ID for the training cell.
with open('example_data.jsonl', 'rb') as f:
    files = {'file': ('example_data.jsonl', f, 'application/json')}
    response = requests.post(f"{API_URL}/upload", files=files)

if response.status_code != 200:
    print(f"‚ùå Upload failed: {response.text}")
else:
    result = response.json()
    print("‚úÖ Dataset uploaded successfully!")
    print(f"Job ID: {result['job_id']}")
    print(f"Filename: {result['filename']}")

    # Remembered so the training request can reference the same upload
    UPLOAD_JOB_ID = result['job_id']

## 🔟 Start Training Job

In [None]:
# Start a training job using the uploaded job_id
train_payload = {
    "dataset_filename": "example_data.jsonl",
    "job_id": UPLOAD_JOB_ID,  # IMPORTANT: use the job_id returned by the upload step!
    "config": {
        "base_model": "unsloth/llama-3-8b-bnb-4bit",
        "max_steps": 20,  # Small number for testing
        "batch_size": 2,
        "learning_rate": 0.0002,
        "lora_r": 16
    }
}

response = requests.post(f"{API_URL}/train", json=train_payload)

if response.status_code == 200:
    result = response.json()
    print("‚úÖ Training started successfully!")
    print(f"Job ID: {result['job_id']}")
    print(f"Status: {result['status']}")

    # Save for monitoring
    TRAIN_JOB_ID = result['job_id']
else:
    # Use the raw body: an error response is not guaranteed to be JSON, and
    # response.json() would then raise and hide the actual failure.
    print(f"‚ùå Training failed to start: {response.text}")

## 1️⃣1️⃣ Monitor Training Progress

In [None]:
# Monitor training progress: poll /status/<job id> every 5 seconds until the
# job reports 'completed' or 'failed', or until the cell is interrupted.
import time
from IPython.display import clear_output

print(f"Monitoring job: {TRAIN_JOB_ID}\n")

while True:
    try:
        # A timeout keeps a stalled connection from hanging the cell forever
        response = requests.get(f"{API_URL}/status/{TRAIN_JOB_ID}", timeout=30)

        if response.status_code == 200:
            status = response.json()

            clear_output(wait=True)
            print(f"Job ID: {TRAIN_JOB_ID}")
            print(f"Status: {status['status']}")

            # Compare against None so that a legitimate value of 0 is shown
            progress = status.get('progress')
            if progress is not None:
                print(f"Progress: {progress}%")

            current_step = status.get('current_step')
            total_steps = status.get('total_steps')
            if current_step is not None and total_steps:
                print(f"Steps: {current_step}/{total_steps}")

            # Check if completed or failed
            if status['status'] == 'completed':
                print("\n‚úÖ Training completed!")
                if status.get('result'):
                    result = status['result']
                    print(f"Duration: {result.get('training_duration_seconds', 0):.2f}s")
                    print(f"Output: {result.get('adapter_dir', 'N/A')}")
                break

            elif status['status'] == 'failed':
                print("\n‚ùå Training failed!")
                print(f"Error: {status.get('error', 'Unknown error')}")
                break
        else:
            # Previously a non-200 answer was polled again without any output
            print(f"Status request returned HTTP {response.status_code}; retrying...")

        time.sleep(5)  # Poll every 5 seconds

    except KeyboardInterrupt:
        print("\n‚ö†Ô∏è Monitoring stopped (job still running)")
        break
    except Exception as e:
        print(f"Error checking status: {e}")
        break

## 1️⃣2️⃣ Download Trained Model

In [None]:
# Download the trained model
import subprocess

response = requests.get(f"{API_URL}/download/{TRAIN_JOB_ID}")

if response.status_code == 200:
    result = response.json()
    print("‚úÖ Model ready for download!")
    print(f"Location: {result['model_path']}")
    print(f"Files: {result['files']}")

    # Zip and download.
    # model_path comes from the API response, so it is passed to zip as a
    # single argument instead of being interpolated into a shell line: spaces
    # or shell metacharacters in the path can no longer break the command.
    # check=True stops here if zip fails, rather than downloading a stale file.
    model_path = result['model_path']
    subprocess.run(['zip', '-r', '-q', 'trained_adapter.zip', model_path], check=True)

    from google.colab import files
    files.download('trained_adapter.zip')
    print("\nüì• Download started!")
else:
    print(f"‚ùå Download failed: {response.text}")

## 1️⃣3️⃣ API Information & Commands

In [None]:
# Print the public endpoints followed by ready-to-paste cURL commands.
rule = "=" * 60

header_lines = (
    rule,
    "üåê Your API is Running!",
    rule,
    f"\nPublic URL: {API_URL}",
    f"API Docs: {API_URL}/docs",
    f"Health: {API_URL}/health",
    "\n" + rule,
    "üì° cURL Examples:",
    rule,
)
for line in header_lines:
    print(line)

# Doubled braces and backslashes are f-string / string escapes; they print as
# single characters in the shell commands shown to the user.
curl_examples = f"""
# Upload dataset
curl -X POST "{API_URL}/upload" \\
  -F "file=@your_data.jsonl"

# Start training
curl -X POST "{API_URL}/train" \\
  -H "Content-Type: application/json" \\
  -d '{{
    "dataset_filename": "your_data.jsonl",
    "config": {{
      "max_steps": 100,
      "batch_size": 2
    }}
  }}'

# Check status
curl "{API_URL}/status/YOUR_JOB_ID"

# List all jobs
curl "{API_URL}/jobs"
"""
print(curl_examples)
print(rule)

## 🛠️ Utility: Check Worker Status

In [None]:
# Check Celery worker status: send an `inspect active` request for the
# phase2.celery_config app (the same app the worker was started with) and
# print the reply. This reports worker state, not the worker's log output.
!celery -A phase2.celery_config inspect active

## 🛠️ Utility: Stop Services

In [None]:
# Stop all services
import subprocess

# The worker runs as `celery -A phase2.celery_config worker ...`, so the literal
# pattern "celery worker" never matched it and the worker kept running. The
# regex below allows for the arguments in between. pkill exits non-zero when
# nothing matched, which is not an error here.
subprocess.run(['pkill', '-f', 'celery.*worker'], check=False)
ngrok.kill()
print("‚úÖ All services stopped")

---

## 📝 Notes

### Important:
- ⚠️ **Keep the runtime alive** - the FastAPI server started in section 7 runs in a background thread and stops when the kernel restarts
- 🔒 **ngrok URL changes** each time you restart
- 💾 **Colab storage is temporary** - download your models!
- ⏱️ **Colab timeout** - Free tier disconnects after ~12 hours

### Tips:
1. **Save your work**: Download trained models immediately
2. **Use secrets**: Add tokens to Colab Secrets (🔑 icon)
3. **Monitor resources**: Check GPU usage with `!nvidia-smi`
4. **Longer training**: Increase `max_steps` for better results

### Troubleshooting:
- **Worker not starting**: Check Redis with `!redis-cli ping`
- **API not accessible**: Check ngrok tunnel in cell 7
- **Out of memory**: Reduce `batch_size` or `max_seq_length`
- **Import errors**: Rerun cell 2 to reinstall dependencies

### Resources:
- [Project Repository](https://github.com/YOUR_USERNAME/LLM_Fine-Tuning_Platform)
- [Phase 2 Guide](phase2/PHASE2_GUIDE.md)
- [Unsloth Documentation](https://github.com/unslothai/unsloth)

---

**🎉 Enjoy training your models in the cloud!**