> **This notebook sets up our entire project environment in Google Colab with GPU support**
## Before Running This Notebook

1. **Enable GPU**: Go to `Runtime` → `Change runtime type` → `Hardware accelerator` → `T4 GPU`
2. **Connect Drive** (optional): To save large model files between sessions

---

## Step 1: Download the GitHub Repo

In [None]:
import os

# Remove existing directory if it exists (for re-running)
if os.path.exists('LLM-interface-optimization'):
    !rm -rf LLM-interface-optimization

# Clone the repo
!git clone https://github.com/Bekmukhamed/LLM-interface-optimization.git

# Change to project directory
os.chdir('LLM-interface-optimization')

# List files to verify download
print("Project files downloaded:")
!ls -la

## Step 2: Install Python Dependencies

In [None]:
# Installing specific requirements
print("Installing project dependencies...")
!pip install -r requirements.txt

# Installing additional Colab-specific packages
print("\nInstalling Colab-specific optimizations...")
!pip install accelerate>=0.24.0  # GPU acceleration
!pip install xformers>=0.0.20    # Memory-efficient transformers (when available)

print("\nPackage installation complete!")

## Step 3: Verify GPU Setup

In [None]:
import torch
import subprocess

print("GPU SETUP VERIFICATION")
print("=" * 40)

# Check if CUDA is available
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    current_gpu = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(current_gpu)
    gpu_memory = torch.cuda.get_device_properties(current_gpu).total_memory / 1e9
    
    print(f"GPU Status: AVAILABLE")
    print(f"GPU Name: {gpu_name}")
    print(f"GPU Count: {gpu_count}")
    print(f"GPU Memory: {gpu_memory:.1f} GB")
    print(f"CUDA Version: {torch.version.cuda}")
    
    # Test GPU computation
    test_tensor = torch.randn(1000, 1000).cuda()
    result = torch.matmul(test_tensor, test_tensor)
    print(f"GPU computation test: SUCCESS")
    
else:
    print("GPU Status: NOT AVAILABLE")
    print("Hot to fix: Go to Runtime → Change runtime type → Hardware accelerator → GPU")

# Show detailed GPU info
print("\n📋 Detailed GPU Information:")
!nvidia-smi

## Step 4: Run Environment Tests

In [None]:
# Run our environment verification script
print("Running comprehensive environment tests...")
print()

!python test_environment.py

## Step 5: Load Your First LLM Model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

print("LOADING FIRST LLM MODEL")
print("=" * 40)

# Decide on the execution device up front: CUDA when present, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Small fast model
model_name = "distilgpt2"  # Small version of GPT-2 (82M parameters)
print(f"Downloading model: {model_name}")
print("(This may take 1-2 minutes on first run)")

# Tokenizer first; if it ships without a padding token, reuse the
# end-of-sequence token (padding is required for some operations)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the weights and place them on the chosen device in one step
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

print(f"Model loaded successfully!")
print(f"Running on: {device}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

## Step 6: First Text Generation Test

In [None]:
def generate_text(prompt, max_length=50):
    """Generate a sampled continuation of ``prompt`` and report timing metrics.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device`` objects
    created in the model-loading cell, and prints the prompt, the generated
    text and the measured metrics as a side effect.

    Args:
        prompt: Text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``. The
            prompt's own tokens count towards this limit, which is why the
            prompt length is subtracted when counting generated tokens.

    Returns:
        dict with keys ``'text'`` (decoded output), ``'time'`` (seconds),
        ``'tokens'`` (number of newly generated tokens) and ``'speed'``
        (tokens per second; 0.0 if no measurable time elapsed).
    """
    print(f"Input prompt: '{prompt}'")

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # CUDA kernels run asynchronously: drain pending work before starting the
    # clock and again before stopping it, so the interval covers exactly the
    # generation call.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.synchronize()

    # perf_counter is a monotonic, high-resolution clock meant for intervals
    start_time = time.perf_counter()

    # Generate text
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            # Pass the mask explicitly: pad and EOS share an id here, so the
            # model cannot reliably infer it from the input ids.
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    if use_cuda:
        torch.cuda.synchronize()
    end_time = time.perf_counter()

    # Decode output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Calculate metrics
    generation_time = end_time - start_time
    tokens_generated = len(outputs[0]) - len(inputs.input_ids[0])
    # Guard against a zero-length interval instead of raising ZeroDivisionError
    tokens_per_second = tokens_generated / generation_time if generation_time > 0 else 0.0

    print(f"Generated text: '{generated_text}'")
    print(f"Generation time: {generation_time:.3f} seconds")
    print(f"Tokens generated: {tokens_generated}")
    print(f"Speed: {tokens_per_second:.1f} tokens/second")
    print()

    return {
        'text': generated_text,
        'time': generation_time,
        'tokens': tokens_generated,
        'speed': tokens_per_second
    }

# Test with different prompts
print("BASELINE PERFORMANCE TESTING")
print("=" * 50)

test_prompts = [
    "Hello, ",
    "The future of artificial intelligence is",
    "The best way to optimize neural networks"
]

# Run every prompt once and keep the per-prompt metrics dicts
results = []
for i, prompt in enumerate(test_prompts, 1):
    # Derive the total from the list so the progress line stays correct
    # when prompts are added or removed.
    print(f"\nTest {i}/{len(test_prompts)}:")
    result = generate_text(prompt)
    results.append(result)

# Summary statistics (unweighted means over the prompts)
avg_speed = sum(r['speed'] for r in results) / len(results)
avg_time = sum(r['time'] for r in results) / len(results)

print("\nBASELINE PERFORMANCE SUMMARY")
print("=" * 40)
print(f"Average speed: {avg_speed:.1f} tokens/second")
print(f"Average time: {avg_time:.3f} seconds")
print(f"Device used: {device}")
print()
print("NEXT STEPS: Now we'll work on optimizing this speed!")

## Step 7: Save Environment State

In [None]:
import json
from datetime import datetime

# Snapshot of the runtime this session is using
env_report = dict(
    timestamp=datetime.now().isoformat(),
    platform='Google Colab',
    device=str(device),
    cuda_available=torch.cuda.is_available(),
    torch_version=torch.__version__,
)

# GPU details are only recorded when CUDA is present (device index 0)
if torch.cuda.is_available():
    env_report['gpu_name'] = torch.cuda.get_device_name(0)
    env_report['gpu_memory_gb'] = torch.cuda.get_device_properties(0).total_memory / 1e9
    env_report['cuda_version'] = torch.version.cuda

# Persist the report as indented JSON in the current working directory
with open('colab_environment.json', 'w') as report_file:
    report_file.write(json.dumps(env_report, indent=2))

print("Environment state saved to 'colab_environment.json'")
print("Environment summary:")
# Echo every recorded field, one per line
for key, value in env_report.items():
    print(f"   {key}: {value}")

## Setup Complete

Your Google Colab environment is now ready for LLM optimization experiments.

- Downloaded project from GitHub
- Installed all dependencies
- Verified GPU access
- Loaded your first LLM model
- Measured baseline performance

### Notebooks
- `02_model_optimization.ipynb` - TensorRT optimization
- `03_custom_kernels.ipynb` - CUDA kernel development
- `04_benchmarking.ipynb` - Performance measurement

---