## Chapter 2

```bash
# --- Step 1: Complete pyenv setup in Codespaces ---
# Check if the configuration was added
cat ~/.bashrc | grep pyenv
# Restart your shell session
exec bash
# OR reload the configuration
source ~/.bashrc
# Now check pyenv
pyenv --version

# --- Step 2: Install Python 3.11.8 and Poetry ---
# Install Python 3.11.8
pyenv install 3.11.8
# List available versions
pyenv versions

# In the new terminal, check if Poetry is installed
poetry --version
# Install Poetry (this doesn't require sudo)
curl -sSL https://install.python-poetry.org | python3 -
# Add Poetry to PATH
echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc
source ~/.bashrc
# Check if Poetry works
poetry --version
# Set Python 3.11.8 as the global default
pyenv global 3.11.8
# Verify it's now active
python --version
python3 --version

# --- Step 3: Navigate to your project and set the local Python version ---
# Go to your project directory
cd /workspaces/llm-twin-replicate
# To create the .python-version file, you must run
pyenv local 3.11.8
# Verify
python --version

# --- Step 4: Install Poe the Poet ---
# Install Poe the Poet
poetry self add 'poethepoet[poetry_plugin]'
```

## What is pyproject.toml?
`pyproject.toml` is a configuration file that defines:

- **Project metadata** (name, version, description, author)
- **Dependencies** (what Python packages your project needs)
- **Development dependencies** (tools for testing, formatting, etc.)
- **Build system** (how to package your project)
- **Tool configurations** (settings for various development tools)

It's the modern Python standard for project configuration (replacing older files like `setup.py` and `requirements.txt`).

You need to create the `pyproject.toml` file yourself - it's a crucial part of any Python project that uses Poetry for dependency management.

### Understanding the `pyproject.toml` file for the llm-twin project

```toml
[tool.poetry]
name = "llm-engineering"
version = "0.1.0"
description = ""
authors = ["iusztinpaul <p.b.iusztin@gmail.com>"]
license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
python = "~3.11"
zenml = { version = "0.74.0", extras = ["server"] }
pymongo = "^4.6.2"
click = "^8.0.1"
loguru = "^0.7.2"
rich = "^13.7.1"
numpy = "^1.26.4"
poethepoet = "0.29.0"
datasets = "^3.0.1"
torch = "2.2.2"

# Digital data ETL
selenium = "^4.21.0"
webdriver-manager = "^4.0.1"
beautifulsoup4 = "^4.12.3"
html2text = "^2024.2.26"
jmespath = "^1.0.1"
chromedriver-autoinstaller = "^0.6.4"

# Feature engineering
qdrant-client = "^1.8.0"
langchain = "^0.2.11"
sentence-transformers = "^3.0.0"

# RAG
langchain-openai = "^0.1.3"
jinja2 = "^3.1.4"
tiktoken = "^0.7.0"
fake-useragent = "^1.5.1"
langchain-community = "^0.2.11"

# Inference
fastapi = ">=0.100,<=0.110"
uvicorn = "^0.30.6"
opik = "^0.2.2"


[tool.poetry.group.dev.dependencies]
ruff = "^0.4.9"
pre-commit = "^3.7.1"
pytest = "^8.2.2"


[tool.poetry.group.aws.dependencies]
sagemaker = ">=2.232.2"
s3fs = ">2022.3.0"
aws-profile-manager = "^0.7.3"
kubernetes = "^30.1.0"
sagemaker-huggingface-inference-toolkit = "^2.4.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

# ----------------------------------
# --- Poe the Poet Configuration ---
# ----------------------------------

[tool.poe.tasks]
# Data pipelines
run-digital-data-etl-alex = "echo 'It is not supported anymore.'"
run-digital-data-etl-maxime = "poetry run python -m tools.run --run-etl --no-cache --etl-config-filename digital_data_etl_maxime_labonne.yaml"
run-digital-data-etl-paul = "poetry run python -m tools.run --run-etl --no-cache --etl-config-filename digital_data_etl_paul_iusztin.yaml"
run-digital-data-etl = [
    "run-digital-data-etl-maxime",
    "run-digital-data-etl-paul",
]
run-feature-engineering-pipeline = "poetry run python -m tools.run --no-cache --run-feature-engineering"
run-generate-instruct-datasets-pipeline = "poetry run python -m tools.run --no-cache --run-generate-instruct-datasets"
run-generate-preference-datasets-pipeline = "poetry run python -m tools.run --no-cache --run-generate-preference-datasets"
run-end-to-end-data-pipeline = "poetry run python -m tools.run --no-cache --run-end-to-end-data"

# Utility pipelines
run-export-artifact-to-json-pipeline = "poetry run python -m tools.run --no-cache --run-export-artifact-to-json"
run-export-data-warehouse-to-json = "poetry run python -m tools.data_warehouse --export-raw-data"
run-import-data-warehouse-from-json = "poetry run python -m tools.data_warehouse --import-raw-data"

# Training pipelines
run-training-pipeline = "poetry run python -m tools.run --no-cache --run-training"
run-evaluation-pipeline = "poetry run python -m tools.run --no-cache --run-evaluation"

# Inference
call-rag-retrieval-module = "poetry run python -m tools.rag"

run-inference-ml-service = "poetry run uvicorn tools.ml_service:app --host 0.0.0.0 --port 8000 --reload"
call-inference-ml-service = "curl -X POST 'http://127.0.0.1:8000/rag' -H 'Content-Type: application/json' -d '{\"query\": \"My name is Paul Iusztin. Could you draft a LinkedIn post discussing RAG systems? I am particularly interested in how RAG works and how it is integrated with vector DBs and LLMs.\"}'"

# Infrastructure
## Local infrastructure
local-docker-infrastructure-up = "docker compose up -d"
local-docker-infrastructure-down = "docker compose stop"
local-zenml-server-down = "poetry run zenml logout --local"
local-infrastructure-up = [
    "local-docker-infrastructure-up",
    "local-zenml-server-down",
    "local-zenml-server-up",
]
local-infrastructure-down = [
    "local-docker-infrastructure-down",
    "local-zenml-server-down",
]
set-local-stack = "poetry run zenml stack set default"
set-aws-stack = "poetry run zenml stack set aws-stack"
set-asynchronous-runs = "poetry run zenml orchestrator update aws-stack --synchronous=False"
zenml-server-disconnect = "poetry run zenml disconnect"

## Settings
export-settings-to-zenml = "poetry run python -m tools.run --export-settings"
delete-settings-zenml = "poetry run zenml secret delete settings"

## SageMaker
create-sagemaker-role = "poetry run python -m llm_engineering.infrastructure.aws.roles.create_sagemaker_role"
create-sagemaker-execution-role = "poetry run python -m llm_engineering.infrastructure.aws.roles.create_execution_role"
deploy-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.huggingface.run"
test-sagemaker-endpoint = "poetry run python -m llm_engineering.model.inference.test"
delete-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.delete_sagemaker_endpoint"

## Docker
build-docker-image = "docker buildx build --platform linux/amd64 -t llmtwin -f Dockerfile ."
run-docker-end-to-end-data-pipeline = "docker run --rm --network host --shm-size=2g --env-file .env llmtwin poetry poe --no-cache --run-end-to-end-data"
bash-docker-container = "docker run --rm -it --network host --env-file .env llmtwin bash"

# QA
lint-check = "poetry run ruff check ."
format-check = "poetry run ruff format --check ."
lint-check-docker = "sh -c 'docker run --rm -i hadolint/hadolint < Dockerfile'"
gitleaks-check = "docker run -v .:/src zricethezav/gitleaks:latest dir /src/llm_engineering"
lint-fix = "poetry run ruff check --fix ."
format-fix = "poetry run ruff format ."

[tool.poe.tasks.local-zenml-server-up]
control.expr = "sys.platform"

[[tool.poe.tasks.local-zenml-server-up.switch]]
case = "darwin"
env = { OBJC_DISABLE_INITIALIZE_FORK_SAFETY = "YES" }
cmd = "poetry run zenml login --local"

[[tool.poe.tasks.local-zenml-server-up.switch]]
case = "win32"
cmd = "poetry run zenml login --local --blocking"

[[tool.poe.tasks.local-zenml-server-up.switch]]
cmd = "poetry run zenml login --local"

# Tests
[tool.poe.tasks.test]
cmd = "poetry run pytest tests/"
env = { ENV_FILE = ".env.testing" }
```

### Build System Configuration
```toml
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
```

**What this does**:
- Tells Python how to **build/package** your project when you want to distribute it
- `poetry-core` is the tool that handles the building process
- `poetry.core.masonry.api` is the specific backend that creates wheel files (.whl) and source distributions
- This is required by Python's PEP 517/518 standards for modern packaging

**When you need it**:
- When you want to publish your package to PyPI
- When someone wants to install your project with pip install
- For proper project structure and standards compliance

### Tool Configurations
```toml
[tool.black]
line-length = 88
target-version = ['py311']

[tool.isort]
profile = "black"
line_length = 88

[tool.mypy]
python_version = "3.11"
warn_return_any = true
warn_unused_configs = true
```

**What these do**:
- **[tool.black]**: Configures the Black code formatter
    - `line-length = 88`: Maximum characters per line
    - `target-version = ['py311']`: Format code for Python 3.11
- **[tool.isort]**: Configures import statement sorting
    - `profile = "black"`: Make it compatible with Black formatting
    - `line_length = 88`: Match Black's line length
- **[tool.mypy]**: Configures type checking
    - `python_version = "3.11"`: Target Python version for type checking
    - `warn_return_any = true`: Warn about functions returning Any type
    - `warn_unused_configs = true`: Warn about unused mypy settings

**When you need them**:
- Only when you actually use these tools
- They provide consistent settings across your team
- You can delete these sections if you don't use the tools

### Poetry Add vs Pre-defining Dependencies
We have two approaches:

1. **Approach 1: Start minimal and add as needed**
```toml
# Start with just basic info
[tool.poetry]
name = "llm-twin-replicate"
version = "0.1.0"
description = "..."
authors = ["..."]

[tool.poetry.dependencies]
python = "^3.11"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
```

Then add packages one by one:
```bash
poetry add fastapi
poetry add uvicorn
poetry add openai
poetry add --group dev pytest
poetry add --group dev black
```

2. **Approach 2: Pre-define common dependencies**
- List packages you know you'll need
- Faster initial setup
- Good for following tutorials/courses where dependencies are known

### Key components explained:
- **[tool.poetry]**: Basic project info
- **[tool.poetry.dependencies]**: Packages needed to run your app
- **[tool.poetry.group.dev.dependencies]**: Development tools (testing, formatting)
- **[tool.poetry.group.aws.dependencies]**: AWS-specific dependencies, kept in their own group
- **[tool.poe.tasks]**: Custom commands you can run with `poe <task-name>`

Let's break down each section and explain what it does:

### 📋 Basic Project Information
```bash
[tool.poetry]
name = "llm-engineering"
version = "0.1.0"
description = ""
authors = ["iusztinpaul <p.b.iusztin@gmail.com>"]
license = "MIT"
readme = "README.md"
```

- **Project metadata**: Name, version, author info
- **MIT license**: Open source license allowing commercial use
- This is the foundation of any Python project

### 🐍 Core Dependencies
```bash
[tool.poetry.dependencies]
python = "~3.11"  # Exactly Python 3.11.x (not 3.12+)
```

### MLOps & Orchestration
```bash
zenml = { version = "0.74.0", extras = ["server"] }  # ML pipeline orchestration
```

- **ZenML**: Manages ML pipelines, tracks experiments, handles model deployment

### Data Processing & ML
```bash
datasets = "^3.0.1"      # Hugging Face datasets
torch = "2.2.2"          # PyTorch for deep learning
numpy = "^1.26.4"        # Numerical computing
```

### Web Scraping & Data Collection
```bash
selenium = "^4.21.0"                    # Browser automation
webdriver-manager = "^4.0.1"           # Manages browser drivers
beautifulsoup4 = "^4.12.3"             # HTML parsing
html2text = "^2024.2.26"               # HTML to text conversion
chromedriver-autoinstaller = "^0.6.4"  # Auto Chrome driver setup
```

- These tools scrape digital content (LinkedIn posts, articles, etc.) to build your personal data

### Vector Database & Embeddings
```bash
qdrant-client = "^1.8.0"           # Vector database client
sentence-transformers = "^3.0.0"   # Text embeddings
```

- **Qdrant**: Stores vector embeddings of your content
- **Sentence Transformers**: Converts text to numerical vectors

### LLM & RAG (Retrieval-Augmented Generation)
```bash
langchain = "^0.2.11"           # LLM application framework
langchain-openai = "^0.1.3"     # OpenAI integration
langchain-community = "^0.2.11" # Community extensions
tiktoken = "^0.7.0"             # OpenAI tokenizer
jinja2 = "^3.1.4"               # Template engine
```

- **LangChain**: Framework for building LLM applications
- **RAG**: Combines your personal data with LLM responses

### API & Web Service
```bash
fastapi = ">=0.100,<=0.110"  # Modern web API framework
uvicorn = "^0.30.6"          # ASGI server
```

### Utilities
```bash
click = "^8.0.1"        # Command-line interface
loguru = "^0.7.2"       # Advanced logging
rich = "^13.7.1"        # Beautiful terminal output
poethepoet = "0.29.0"   # Task runner
opik = "^0.2.2"         # ML observability
```

### 🛠️ Development Dependencies
```bash
[tool.poetry.group.dev.dependencies]
ruff = "^0.4.9"        # Fast Python linter & formatter
pre-commit = "^3.7.1"  # Git hooks for code quality
pytest = "^8.2.2"      # Testing framework
```

- **Ruff**: Super fast replacement for Black, isort, flake8
- **Pre-commit**: Runs checks before Git commits
- **Pytest**: Industry-standard testing

### ☁️ AWS Cloud Dependencies
```bash
[tool.poetry.group.aws.dependencies]
sagemaker = ">=2.232.2"  # AWS ML platform
s3fs = ">2022.3.0"       # S3 filesystem interface
kubernetes = "^30.1.0"   # Container orchestration
```

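- **Separate group**: Only needed when deploying to AWS. Because the group is not marked `optional = true`, `poetry install` includes it by default; skip it with `poetry install --without aws`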
- **SageMaker**: AWS's managed ML platform for training and inference

### 🎯 Poe the Poet Tasks (The Magic!)
This is where the real power lies - **automated workflows:**

### Data Collection Pipelines
```bash
run-digital-data-etl-maxime = "poetry run python -m tools.run --run-etl --no-cache --etl-config-filename digital_data_etl_maxime_labonne.yaml"
```

- Scrapes data from specific people (Maxime Labonne, Paul Iusztin)
- Creates personalized datasets

### Feature Engineering
```bash
run-feature-engineering-pipeline = "poetry run python -m tools.run --no-cache --run-feature-engineering"
```

- Processes raw data into features for ML training
- Creates embeddings and vector representations

### Training Pipelines
```bash
run-training-pipeline = "poetry run python -m tools.run --no-cache --run-training"
run-evaluation-pipeline = "poetry run python -m tools.run --no-cache --run-evaluation"
```

- Trains your personalized LLM
- Evaluates model performance

### Inference & API
```bash
run-inference-ml-service = "poetry run uvicorn tools.ml_service:app --host 0.0.0.0 --port 8000 --reload"
call-inference-ml-service = "curl -X POST 'http://127.0.0.1:8000/rag' ..."
```

- Starts the API server
- Tests the RAG system with sample queries

### Infrastructure Management
```bash
local-infrastructure-up = [
    "local-docker-infrastructure-up",
    "local-zenml-server-down", 
    "local-zenml-server-up",
]
```

- **Composite tasks**: Runs multiple commands in sequence
- Manages Docker containers, ZenML server, databases

### Cloud Deployment
```bash
deploy-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.huggingface.run"
delete-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.delete_sagemaker_endpoint"
```

- Deploys your model to AWS SageMaker
- Manages cloud infrastructure

### 🔍 Key Insights
This project structure shows:
1. **`Complete MLOps pipeline`**: Data → Processing → Training → Deployment
2. **`Production-ready`**: Proper testing, linting, CI/CD setup
3. **`Multi-environment`**: Local development + AWS cloud deployment
4. **`Automated workflows`**: Everything scriptable with `poe <task-name>`
5. **`Modern Python practices`**: Poetry, Ruff, proper dependency management

### 🚀 How You'd Use This
```bash
# Set up the project
poetry install

# Collect your digital data
poe run-digital-data-etl-paul

# Process the data
poe run-feature-engineering-pipeline

# Train your LLM twin
poe run-training-pipeline

# Start the API
poe run-inference-ml-service

# Deploy to cloud
poe deploy-inference-endpoint
```

## `ruff.toml` replaces the tool configurations in `pyproject.toml`
**What's happening here**:
1. This project uses Ruff instead of `Black/isort/flake8`
2. `Ruff` is a modern, all-in-one tool that replaces multiple separate tools
3. The configuration is in `ruff.toml` instead of `pyproject.toml`

**Old approach (what I showed earlier):**
```bash
# pyproject.toml
[tool.black]         # Code formatter
[tool.isort]         # Import sorter  
[tool.flake8]        # Linter
[tool.mypy]          # Type checker
```

- 4 separate tools
- Slower (each tool runs separately)
- More configuration needed

**Modern approach (this project):**
```bash
# ruff.toml  
line-length = 120    # Replaces Black
[lint]               # Replaces flake8
[lint.isort]         # Replaces isort
```

- **1 tool (Ruff)** does formatting + linting + import sorting
- **Much faster** (written in `Rust`)
- **Simpler configuration**

### Current `ruff.toml` file
```toml
line-length = 120
target-version = "py311"
extend-exclude = [
    ".github",
    "graphql_client",
    "graphql_schemas"
]

[lint]
extend-select = [
  "I",
  "B",
  "G",
  "T20",
  "PTH",
  "RUF"
]

[lint.isort]
case-sensitive = true

[lint.pydocstyle]
convention = "google"
```

**Basic Configuration:**
```bash
line-length = 120        # Max characters per line (like Black)
target-version = "py311" # Target Python version
extend-exclude = [...]   # Folders to ignore
```

**Linting Rules:**
```bash
[lint]
extend-select = [
  "I",     # isort - import sorting
  "B",     # flake8-bugbear - bug detection
  "G",     # flake8-logging-format - logging best practices
  "T20",   # flake8-print - detect print statements
  "PTH",   # flake8-use-pathlib - use pathlib instead of os.path
  "RUF"    # Ruff-specific rules
]
```

**Import Sorting (replaces isort):**
```bash
[lint.isort]
case-sensitive = true  # Sort imports case-sensitively
```

**Documentation Style:**
```bash
[lint.pydocstyle]
convention = "google"  # Use Google-style docstrings
```

**Why this project doesn't need the pyproject.toml tool configs:**
Looking back at the dependencies:
```bash
[tool.poetry.group.dev.dependencies]
ruff = "^0.4.9"        # ✅ Only Ruff (no Black, isort, flake8)
pre-commit = "^3.7.1"  
pytest = "^8.2.2"
```

- Notice: No Black, isort, or flake8 - just Ruff!

**Running the tools:**
```bash
# With the old approach:
black .
isort .
flake8 .
mypy .

# With Ruff (this project):
ruff format .    # Replaces Black
ruff check .     # Replaces flake8 + isort
# Still need mypy separately for type checking
```

**Poe tasks that use these:**
```bash
lint-check = "poetry run ruff check ."      # Check for issues
format-check = "poetry run ruff format --check ."  # Check formatting
lint-fix = "poetry run ruff check --fix ."   # Auto-fix issues  
format-fix = "poetry run ruff format ."     # Auto-format code
```


**Check the installed `Poe the Poet` version:**
```bash
####### Method 1: Check installed version with Poetry #######
# If installed as a Poetry plugin
poetry self show poethepoet

# Or check all Poetry plugins
poetry self show

####### Method 2: Direct command line check #######
# Check version directly
poe --version

# Or
poetry poe --version

####### Method 3: List the plugins registered with Poetry #######
# List Poetry plugins
poetry self show plugins

# Check specific plugin info
poetry self show poethepoet
```

Let me walk you through creating the ruff.toml file step by step:

## Step 1: Create the ruff.toml file
```bash
# In your project root directory (same level as pyproject.toml)
touch ruff.toml
```

## Step 2: Add the configuration content
Open the `ruff.toml` file in VS Code and paste the following content:
```toml
# Ruff configuration for LLM Twin project
line-length = 120
target-version = "py311"

# Exclude common directories that don't need linting
extend-exclude = [
    ".github",
    ".venv",
    "__pycache__",
    "*.egg-info",
    ".mypy_cache",
    ".pytest_cache"
]

[lint]
# Enable specific rule sets
extend-select = [
  "I",    # isort - import sorting
  "B",    # flake8-bugbear - bug detection  
  "G",    # flake8-logging-format - logging best practices
  "T20",  # flake8-print - detect print statements (avoid print in production)
  "PTH",  # flake8-use-pathlib - use pathlib instead of os.path
  "RUF"   # Ruff-specific rules
]

[lint.isort]
case-sensitive = true

[lint.pydocstyle]
convention = "google"  # Use Google-style docstrings
```

## Step 3: Ensure Ruff is in your pyproject.toml
Make sure your pyproject.toml has Ruff in the dev dependencies:
```toml
[tool.poetry.group.dev.dependencies]
ruff = "^0.4.9"
pytest = "^8.2.2"
pre-commit = "^3.7.1"
```

## Step 4: Install Ruff
```bash
# Install all dependencies including dev group
poetry install

# Or specifically add Ruff if it's not in your pyproject.toml yet
poetry add --group dev ruff
```

## Step 5: Test Ruff installation
```bash
# Check if Ruff is installed
poetry run ruff --version

# Check your configuration
poetry run ruff check --show-settings
```

## Step 6: Add Poe the Poet tasks to pyproject.toml
Add these task configurations to your `pyproject.toml`:
```toml
[tool.poe.tasks]
# Code quality tasks
lint-check = "poetry run ruff check ."
format-check = "poetry run ruff format --check ."
lint-fix = "poetry run ruff check --fix ."
format-fix = "poetry run ruff format ."

# Combined tasks
lint = ["lint-check", "format-check"]
fix = ["lint-fix", "format-fix"]

# Development setup
install = "poetry install"
install-dev = "poetry install --with dev"
```

## Step 7: Create a sample Python file to test
```bash
# Create a test file
touch test_ruff.py
```

Then add some intentionally "bad" code to `test_ruff.py`:
```python
import os
import sys
import json


def hello_world():
    print("Hello World")
    x=1+2
    return x
```

## Step 8: Run Ruff on the test file
```bash
# Check for issues
poe lint-check

# Fix formatting
poe format-fix

# Check the file after formatting
cat test_ruff.py
```

## File structure should look like:
```text
your-project/
├── pyproject.toml
├── ruff.toml          # ← New file you just created
├── README.md
├── test_ruff.py       # ← Test file
└── .gitignore
```

## Pro tip: Add to your workflow
```bash
# Before committing code, always run:
poe fix        # Auto-fix and format
poe lint       # Final check

# Or create a pre-commit hook (advanced)
poetry add --group dev pre-commit
```