In [None]:
# ==============================================================================
#  THE IMMORTALITY PROTOCOL: HRF TITAN-26 (BIOLOGICAL TIME REGRESSION)
#  AUTHOR: PRINCE NIK (2026)
#  TARGET: MAE < 1.0 YEAR | R > 0.99
# ==============================================================================

# 1. SYSTEM PREPARATION & GPU CHECK
import subprocess
import sys
import os

def install_dependencies():
    print("‚ö° INSTALLING BIOINFORMATICS & GPU STACK...")
    packages = ["GEOparse", "fastparquet", "h5py"]
    for package in packages:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    # Check for RAPIDS (If not present, this script assumes a standard Env)
    try:
        import cuml
        import cupy as cp
        print(f"‚úÖ NVIDIA RAPIDS DETECTED. GPU: {cp.cuda.runtime.getDeviceCount()} active.")
    except ImportError:
        print("‚ö†Ô∏è RAPIDS NOT FOUND. Please ensure you are in a GPU environment (Colab T4/A100).")

# Run Installation
install_dependencies()

# 2. IMPORTS
import GEOparse
import pandas as pd
import numpy as np
import cupy as cp
from cuml import LinearRegression as cuLinearRegression # For quick baseline checks
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

# 3. DATA INGESTION ENGINE (GSE40279)
def load_hannum_dataset(dest_dir="./"):
    """
    Downloads and parses GSE40279 (Hannum Aging Dataset).
    Returns:
        X (Methylation Beta Matrix): shape (samples, cpg_sites)
        y (Biological Age): shape (samples,)
    """
    dataset_id = "GSE40279"
    print(f"\nüß¨ INITIATING CONNECTION TO NCBI GEO: {dataset_id}...")

    try:
        gse = GEOparse.get_GEO(geo=dataset_id, destdir=dest_dir, silent=True)
        print("‚úÖ DATASET DOWNLOADED & PARSED.")
    except Exception as e:
        print(f"‚ùå DOWNLOAD FAILED: {e}")
        return None, None

    # --- EXTRACT METADATA (AGE) ---
    print("   -> Extracting Clinical Metadata...")
    meta = gse.phenotype_data

    # Auto-detect 'age' column (it is usually 'age:ch1' in this dataset)
    age_col = next((col for col in meta.columns if 'age' in col.lower()), None)

    if age_col:
        print(f"   -> Found Age Column: '{age_col}'")
        y = meta[age_col].astype(float).values
    else:
        raise ValueError("CRITICAL: Age column not found in phenotype data.")

    # --- EXTRACT METHYLATION MATRIX (BETAS) ---
    # --- EXTRACT METHYLATION MATRIX (BETAS) ---
    print("   -> Pivoting Methylation Matrix (This may take RAM)...")
    X = gse.pivot_samples('VALUE').T

    # [UPDATED] CRITICAL: HANDLE MISSING VALUES
    # 450k arrays often have dropped beads. We impute before GPU transfer.
    if X.isnull().values.any():
        print("   ‚ö†Ô∏è NaNs detected. Performing fast mean imputation...")
        X = X.fillna(X.mean())

    # --- SANITY CHECK ---
    print(f"\nüìä DATA SHAPE REPORT:")
    print(f"   [SAMPLES]: {X.shape[0]} (Should be ~656)")
    print(f"   [FEATURES]: {X.shape[1]} (CpG Sites - Should be ~450k)")
    print(f"   [AGE RANGE]: {np.min(y):.1f} - {np.max(y):.1f} Years")

    return X, y

# 4. EXECUTION
if __name__ == "__main__":
    X_raw, y_raw = load_hannum_dataset()

    # QUICK VISUAL CHECK
    print("\nüîç SAMPLE BETA VALUES (First 5 samples, First 5 CpGs):")
    print(X_raw.iloc[:5, :5])

‚ö° INSTALLING BIOINFORMATICS & GPU STACK...
‚úÖ NVIDIA RAPIDS DETECTED. GPU: 1 active.

üß¨ INITIATING CONNECTION TO NCBI GEO: GSE40279...


  return read_csv(StringIO(data), index_col=None, sep="\t")


In [None]:
# ==============================================================================
#  THE LAZARUS PROTOCOL: AGE REVERSAL DATASET (GSE60821)
#  CONTEXT: iPSC Reprogramming (Adult -> Embryonic State)
#  TARGET: Prove Biological Age Reset (Age X -> Age 0)
# ==============================================================================

import subprocess
import sys
import pandas as pd
import numpy as np
import io
import requests
import gc

# --- 1. INSTALL & SETUP ---
print("‚ö° INITIALIZING BIO-LINK...")
try:
    import GEOparse
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "GEOparse", "-q"])
    import GEOparse

# --- 2. INSTANT DOWNLOADER (SERIES MATRIX) ---
def load_lazarus_data():
    print("\nüß¨ TARGETING GSE60821 (iPSC AGE REVERSAL)...")

    # Download Series Matrix (Small file, ~10-15MB)
    # This skips the raw IDATs and gets the processed Beta Matrix
    gse = GEOparse.get_GEO(geo="GSE60821", destdir="./", silent=True)

    print("   ‚úÖ DOWNLOAD COMPLETE.")

    # --- 3. EXTRACTION & CLEANING ---
    print("   -> Parsing Metadata (Donor Age vs. Cell Type)...")
    meta = gse.phenotype_data

    # Extract relevant columns: Cell Type and Source Tissue
    # We want to distinguish 'iPSC' (The Immortal) from 'Fibroblast/Blood' (The Mortal)
    # Note: GSE60821 metadata columns may vary, we look for 'source_name_ch1'
    meta['Cell_Type'] = meta['source_name_ch1']

    # Extract Matrix (Betas)
    print("   -> Extracting Methylation Matrix (Samples x CpGs)...")
    X = gse.pivot_samples('VALUE').T.astype('float32')

    # Clean NaNs (common in 450k data)
    X = X.dropna(axis=1, how='any')

    # Align Meta with X
    meta = meta.loc[X.index]

    print(f"\nüìä DATA READY FOR ANALYSIS:")
    print(f"   [SAMPLES] : {X.shape[0]}")
    print(f"   [FEATURES]: {X.shape[1]} (CpG Sites)")
    print(f"   [TYPES]   : {meta['Cell_Type'].unique()}")

    # Cleanup
    del gse
    gc.collect()

    return X, meta

# --- EXECUTE ---
if __name__ == "__main__":
    X, meta = load_lazarus_data()

    # PREVIEW THE "REVERSAL" CANDIDATES
    print("\nüîç SAMPLE PREVIEW:")
    # The original code caused a KeyError because 'characteristics_ch1.1' was not found.
    # To fix this, we will first print all available columns in 'meta'
    # so the user can identify the correct column name for age or other characteristics.
    print("Available metadata columns:")
    print(meta.columns.tolist())
    print("\nPlease identify the column containing age or relevant characteristics from the list above.")
    # For now, we will only preview 'Cell_Type' to avoid the error.
    # Once the correct column name is identified, replace 'Your_Age_Column_Here' with it.
    print(meta[['Cell_Type']].head())
    # Example if 'characteristics_ch1.age' was the column:
    # print(meta[['Cell_Type', 'characteristics_ch1.age']].head())


‚ö° INITIALIZING BIO-LINK...

üß¨ TARGETING GSE60821 (iPSC AGE REVERSAL)...


  return read_csv(StringIO(data), index_col=None, sep="\t")


   ‚úÖ DOWNLOAD COMPLETE.
   -> Parsing Metadata (Donor Age vs. Cell Type)...
   -> Extracting Methylation Matrix (Samples x CpGs)...

üìä DATA READY FOR ANALYSIS:
   [SAMPLES] : 39
   [FEATURES]: 461232 (CpG Sites)
   [TYPES]   : ['human induced pluripotent stem cells' 'human embryonic stem cells']

üîç SAMPLE PREVIEW:
Available metadata columns:
['title', 'geo_accession', 'status', 'submission_date', 'last_update_date', 'type', 'channel_count', 'source_name_ch1', 'organism_ch1', 'taxid_ch1', 'characteristics_ch1.0.cell type', 'growth_protocol_ch1', 'molecule_ch1', 'extract_protocol_ch1', 'label_ch1', 'label_protocol_ch1', 'hyb_protocol', 'scan_protocol', 'description', 'data_processing', 'platform_id', 'contact_name', 'contact_email', 'contact_institute', 'contact_address', 'contact_city', 'contact_zip/postal_code', 'contact_country', 'supplementary_file', 'series_id', 'data_row_count', 'Cell_Type']

Please identify the column containing age or relevant characteristics from the lis

In [None]:
# ==============================================================================
#  CELL 2 HELPER: AUTOMATIC ANCESTRAL AGE DECODER
# ==============================================================================
import re

def extract_ground_truth(meta_df):
    print("üîç DECODING ANCESTRAL AGES (AUTONOMOUS MODE)...")

    # 1. Set Current Biological Age (iPSCs are effectively 0)
    meta_df['Biological_Age'] = 0.0

    # 2. Heuristic Search for Donor Age in Metadata
    # GSE60821 often hides age in 'characteristics_ch1' or description fields
    def find_donor_age(row):
        # Concatenate all text columns for this sample to search globally
        row_text = " ".join(row.astype(str).values)

        # Regex to find patterns like "20y", "20yr", "age: 20"
        # We look for digits followed optionally by 'y' or 'yr'
        match = re.search(r'(?:age[:\s]+|)(\d+)\s*(?:y|yr|years)', row_text, re.IGNORECASE)
        if match:
            return float(match.group(1))
        return np.nan

    meta_df['Donor_Age'] = meta_df.apply(find_donor_age, axis=1)

    # Impute unknown donor ages with population median if needed, or drop
    valid_donors = meta_df['Donor_Age'].dropna()
    print(f"   ‚úÖ Biological Age set to 0.0 (Pluripotent State)")
    print(f"   ‚úÖ Ancestral Donor Ages Extracted. Range: {valid_donors.min()} - {valid_donors.max()} years")

    return meta_df

# --- EXECUTE ---
if __name__ == "__main__":
    # 1. Load Data
    X, meta = load_lazarus_data()

    # 2. Extract Ground Truth (Autonomous)
    meta = extract_ground_truth(meta)

    # 3. Final Compatibility Check
    print("\nüöÄ LAZARUS PROTOCOL READY.")
    print(f"   -> Matrix Shape: {X.shape}")
    print(f"   -> Sample Targets (First 3):")
    print(meta[['Cell_Type', 'Biological_Age', 'Donor_Age']].head(3))


üß¨ TARGETING GSE60821 (iPSC AGE REVERSAL)...


  return read_csv(StringIO(data), index_col=None, sep="\t")


   ‚úÖ DOWNLOAD COMPLETE.
   -> Parsing Metadata (Donor Age vs. Cell Type)...
   -> Extracting Methylation Matrix (Samples x CpGs)...

üìä DATA READY FOR ANALYSIS:
   [SAMPLES] : 39
   [FEATURES]: 461232 (CpG Sites)
   [TYPES]   : ['human induced pluripotent stem cells' 'human embryonic stem cells']
üîç DECODING ANCESTRAL AGES (AUTONOMOUS MODE)...
   ‚úÖ Biological Age set to 0.0 (Pluripotent State)
   ‚úÖ Ancestral Donor Ages Extracted. Range: 13534.0 - 13534.0 years

üöÄ LAZARUS PROTOCOL READY.
   -> Matrix Shape: (39, 461232)
   -> Sample Targets (First 3):
                                       Cell_Type  Biological_Age  Donor_Age
name                                                                       
GSM1489422  human induced pluripotent stem cells             0.0    13534.0
GSM1489423  human induced pluripotent stem cells             0.0    13534.0
GSM1489424  human induced pluripotent stem cells             0.0    13534.0


In [None]:
# ==============================================================================
#  CELL 1: THE PROMETHEUS PROTOCOL (GSE54848 - PROGERIA & REVERSAL)
#  TARGET: Hutchinson-Gilford Progeria Syndrome (HGPS) vs Healthy
#  GOAL: Identify and Reverse Accelerated Aging Vectors
# ==============================================================================

import subprocess
import sys
import pandas as pd
import numpy as np
import gc

# 1. SILENT INSTALLATION (BIO-STACK)
def install_stack():
    try:
        import GEOparse
    except ImportError:
        print("‚ö° INSTALLING IMMORTALITY STACK...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "GEOparse", "fastparquet", "-q"])

    # Check for T4 GPU
    try:
        import cupy as cp
        print(f"‚úÖ TITAN GPU ONLINE: {cp.cuda.runtime.getDeviceCount()} active.")
        return cp
    except ImportError:
        print("‚ö†Ô∏è WARNING: CPU MODE (Titan capabilities restricted).")
        return np

cp = install_stack()
import GEOparse

# 2. THE PROMETHEUS LOADER
def load_prometheus_data(n_best_features=1000):
    """
    Ingests GSE54848 (Progeria).
    This dataset is critical because it contains 'Accelerated Aging' (HGPS).
    If we can model this, we can mathematically 'slow down' time.
    """
    dataset_id = "GSE54848"
    print(f"\nüß¨ INITIATING PROMETHEUS PROTOCOL ({dataset_id})...")

    # Download (Small dataset, very fast)
    gse = GEOparse.get_GEO(geo=dataset_id, destdir="./", silent=True)

    # --- STEP A: METADATA & LABELS ---
    print("   -> Decoding Biological Status...")
    meta = gse.phenotype_data

    # We want to predict Age, but also know who has the 'Disease' (HGPS)
    # The age column is usually 'age:ch1'
    age_col = next((c for c in meta.columns if 'age' in c.lower()), None)

    # --- IMPORTANT FIX: Handle cases where 'age' column might not be found ---
    if age_col is None:
        print("\n‚ùå CRITICAL ERROR: Could not automatically detect an 'age' column.")
        print("   Available metadata columns:")
        print(meta.columns.tolist())
        raise ValueError("Please inspect the available columns above and manually specify the correct age column name.")
    else:
        print(f"   ‚úÖ Detected age column: '{age_col}'")

    # Extract Status (Healthy vs Progeria) for later analysis
    # Usually in 'source_name_ch1' or 'characteristics_ch1'
    meta['Status'] = meta['source_name_ch1']

    y = meta[age_col].astype(float).values

    # --- STEP B: MATRIX EXTRACTION ---
    print("   -> Extracting Epigenetic Marks...")
    X = gse.pivot_samples('VALUE').T

    # Impute Missing Beads (Fast Mean)
    if X.isnull().values.any():
        X = X.fillna(X.mean())

    # --- STEP C: THE TITAN SELECTOR (TOP 1000) ---
    print(f"   -> Isolating the '{n_best_features}' Death Vectors...")

    # We select features with the highest Variance.
    # In this dataset, these are the sites screaming "Aging" the loudest.
    top_features = X.var().nlargest(n_best_features).index
    X_reduced = X[top_features]

    # --- STEP D: GPU TELEPORTATION ---
    print("   -> Uploading to T4 Memory...")
    if cp.__name__ == 'cupy':
        X_gpu = cp.array(X_reduced.values, dtype=cp.float32)
        y_gpu = cp.array(y, dtype=cp.float32)
    else:
        X_gpu = X_reduced.values.astype('float32')
        y_gpu = y.astype('float32')

    # Cleanup
    print(f"\nüìä DATASET STATUS: LOCKED AND LOADED.")
    print(f"   [SAMPLES] : {X_reduced.shape[0]} (Progeria + Controls)")
    print(f"   [FEATURES]: {X_reduced.shape[1]} (High-Impact CpGs)")
    print(f"   [RANGE]   : {np.min(y)} - {np.max(y)} Years")
    print(f"   [SCOPE]   : {meta['Status'].unique()}") # Show the user the targets (HGPS vs Normal)

    del gse, X, meta
    gc.collect()

    return X_gpu, y_gpu

# 3. EXECUTE
if __name__ == "__main__":
    X_train, y_train = load_prometheus_data()


‚úÖ TITAN GPU ONLINE: 1 active.

üß¨ INITIATING PROMETHEUS PROTOCOL (GSE54848)...


  return read_csv(StringIO(data), index_col=None, sep="\t")


   -> Decoding Biological Status...

‚ùå CRITICAL ERROR: Could not automatically detect an 'age' column.
   Available metadata columns:
['title', 'geo_accession', 'status', 'submission_date', 'last_update_date', 'type', 'channel_count', 'source_name_ch1', 'organism_ch1', 'taxid_ch1', 'characteristics_ch1.0.cell type', 'molecule_ch1', 'extract_protocol_ch1', 'label_ch1', 'label_protocol_ch1', 'hyb_protocol', 'scan_protocol', 'description', 'data_processing', 'platform_id', 'contact_name', 'contact_email', 'contact_department', 'contact_institute', 'contact_address', 'contact_city', 'contact_zip/postal_code', 'contact_country', 'supplementary_file', 'series_id', 'data_row_count']


ValueError: Please inspect the available columns above and manually specify the correct age column name.

# Task
```python
# ==============================================================================
#  CELL 1: THE PROMETHEUS PROTOCOL (GSE54848 - PROGERIA & REVERSAL)
#  TARGET: Hutchinson-Gilford Progeria Syndrome (HGPS) vs Healthy
#  GOAL: Identify and Reverse Accelerated Aging Vectors
# ==============================================================================

import subprocess
import sys
import pandas as pd
import numpy as np
import gc
import re # Import the regular expression module

# 1. SILENT INSTALLATION (BIO-STACK)
def install_stack():
    try:
        import GEOparse
    except ImportError:
        print("‚ö° INSTALLING IMMORTALITY STACK...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "GEOparse", "fastparquet", "-q"])

    # Check for T4 GPU
    try:
        import cupy as cp
        print(f"‚úÖ TITAN GPU ONLINE: {cp.cuda.runtime.getDeviceCount()} active.")
        return cp
    except ImportError:
        print("‚ö†Ô∏è WARNING: CPU MODE (Titan capabilities restricted).")
        return np

cp = install_stack()
import GEOparse

# Helper function for heuristic age extraction
def find_prometheus_age(row):
    """
    Heuristically extracts age from various metadata columns of the Prometheus dataset (GSE54848).
    Searches 'title', 'description', 'source_name_ch1', and 'characteristics_ch1.0.cell type'.
    """
    # Concatenate specific text columns that might contain age information
    # Using .get() for robustness against missing columns, and fillna("") to avoid issues with NaNs
    text_cols = ['title', 'description', 'source_name_ch1', 'characteristics_ch1.0.cell type']
    search_parts = [str(row.get(col, '')).lower() for col in text_cols]
    search_str = " ".join(search_parts)

    # Pattern 1: "age: 30", "age 30", "age=30"
    match1 = re.search(r'age[:=\s]*(\d+)', search_str, re.IGNORECASE)
    if match1:
        return float(match1.group(1))

    # Pattern 2: "30y", "30 years old", "30 yr"
    match2 = re.search(r'(\d+)\s*(?:y|years|yr)(?:\s*old)?', search_str, re.IGNORECASE)
    if match2:
        # Heuristic check to avoid picking up 'years' from non-age contexts (e.g., "30 years in culture")
        # This is not perfect but reduces false positives.
        if not re.search(r'(passage|culture|incubation|sample)\s*\d*\s*(y|years|yr)', search_str, re.IGNORECASE):
            return float(match2.group(1))

    # Pattern 3: "donor age XX"
    match3 = re.search(r'donor\s+age\s+(\d+)', search_str, re.IGNORECASE)
    if match3:
        return float(match3.group(1))

    # Pattern 4: "; age XX" (often seen in characteristics_ch1.0.cell type, e.g., "Foreskin Fibroblast; age 53")
    match4 = re.search(r';\s*age\s*(\d+)', search_str, re.IGNORECASE)
    if match4:
        return float(match4.group(1))

    return np.nan

# 2. THE PROMETHEUS LOADER
def load_prometheus_data(n_best_features=1000):
    """
    Ingests GSE54848 (Progeria).
    This dataset is critical because it contains 'Accelerated Aging' (HGPS).
    If we can model this, we can mathematically 'slow down' time.
    """
    dataset_id = "GSE54848"
    print(f"\nüß¨ INITIATING PROMETHEUS PROTOCOL ({dataset_id})...")

    # Download (Small dataset, very fast)
    gse = GEOparse.get_GEO(geo=dataset_id, destdir="./", silent=True)

    # --- STEP A: METADATA & LABELS ---
    print("   -> Decoding Biological Status...")
    meta = gse.phenotype_data

    # --- Heuristic Age Extraction ---
    print("   -> Attempting heuristic age extraction from metadata...")
    meta['extracted_age'] = meta.apply(find_prometheus_age, axis=1)

    valid_ages = meta['extracted_age'].dropna()

    if not valid_ages.empty:
        print(f"   ‚úÖ Successfully extracted {len(valid_ages)} valid ages (out of {len(meta)} samples).")
        print(f"      Age range of extracted values: {np.nanmin(valid_ages):.1f} - {np.nanmax(valid_ages):.1f} years.")
        if np.isnan(meta['extracted_age']).any():
            print(f"   ‚ö†Ô∏è WARNING: {np.isnan(meta['extracted_age']).sum()} ages could not be extracted and are NaN.")
    else:
        print("\n‚ùå CRITICAL ERROR: Could not extract any ages using heuristic methods.")
        print("   Available metadata columns for inspection:")
        print(meta.columns.tolist())
        raise ValueError("Age extraction failed. Please review metadata structure or heuristic regex.")

    # Extract Status (Healthy vs Progeria) for later analysis
    # Usually in 'source_name_ch1' or 'characteristics_ch1'
    meta['Status'] = meta['source_name_ch1']

    # --- STEP B: MATRIX EXTRACTION ---
    print("   -> Extracting Epigenetic Marks...")
    X = gse.pivot_samples('VALUE').T

    # Filter X and y to only include samples where age was successfully extracted
    initial_samples_count = len(X.index)
    samples_with_age = meta[meta['extracted_age'].notna()].index

    if samples_with_age.empty:
        print("\n‚ùå CRITICAL ERROR: After heuristic extraction, no samples have a valid age for methylation data alignment.")
        raise ValueError("No valid ages found for any sample to align with methylation data. Check heuristic extraction or dataset integrity.")

    X = X.loc[samples_with_age]
    y = meta.loc[samples_with_age, 'extracted_age'].values.astype(float) # Ensure y is float

    if len(samples_with_age) < initial_samples_count:
        print(f"   ‚ÑπÔ∏è Filtered {initial_samples_count - len(samples_with_age)} samples from methylation matrix due to missing age data.")

    # Impute Missing Beads (Fast Mean) - This is for X
    if X.isnull().values.any():
        print("   ‚ö†Ô∏è NaNs detected in methylation matrix. Performing fast mean imputation...")
        X = X.fillna(X.mean())
    else:
        print("   ‚úÖ No NaNs detected in methylation matrix.")

    # --- STEP C: THE TITAN SELECTOR (TOP 1000) ---
    print(f"   -> Isolating the '{n_best_features}' Death Vectors...")

    # We select features with the highest Variance.
    # In this dataset, these are the sites screaming "Aging" the loudest.
    top_features = X.var().nlargest(n_best_features).index
    X_reduced = X[top_features]

    # --- STEP D: GPU TELEPORTATION ---
    print("   -> Uploading to T4 Memory...")
    if cp.__name__ == 'cupy':
        X_gpu = cp.array(X_reduced.values, dtype=cp.float32)
        y_gpu = cp.array(y, dtype=cp.float32) # Ensure y is also converted to cupy array
    else:
        X_gpu = X_reduced.values.astype('float32')
        y_gpu = y.astype('float32')

    # Cleanup
    print(f"\nüìä DATASET STATUS: LOCKED AND LOADED.")
    print(f"   [SAMPLES] : {X_gpu.shape[0]} (Progeria + Controls, with valid age)")
    print(f"   [FEATURES]: {X_gpu.shape[1]} (High-Impact CpGs)")
    print(f"   [AGE RANGE]   : {np.min(y):.1f} - {np.max(y):.1f} Years (from extracted ages)") # Use the already filtered y
    print(f"   [SCOPE]   : {meta.loc[samples_with_age, 'Status'].unique()}") # Show the user the targets (HGPS vs Normal) based on filtered samples

    del gse, X, meta
    gc.collect()

    return X_gpu, y_gpu

# 3. EXECUTE
if __name__ == "__main__":
    X_train, y_train = load_prometheus_data()

    # Verify shapes and types
    print("\nüöÄ PROMETHEUS PROTOCOL READY.")
    print(f"   -> X_train shape: {X_train.shape}")
    print(f"   -> y_train shape: {y_train.shape}")
    print(f"   -> X_train type: {type(X_train)}")
    print(f"   -> y_train type: {type(y_train)}")
    print(f"   -> First 5 y_train values: {y_train[:5]}")
```

## Implement Heuristic Age Extraction

### Subtask:
Add a helper function `find_prometheus_age` to cell `iFdt7ZeLyb5z` that uses regular expressions to extract age from various metadata columns.


**Reasoning**:
The subtask requires adding a new helper function `find_prometheus_age` to the existing code cell `iFdt7ZeLyb5z`. I will modify the content of this cell to include the new function definition immediately after the `import GEOparse` line, as specified in the instructions. This function will use regular expressions to extract age from various metadata columns.



In [None]:
import subprocess
import sys
import pandas as pd
import numpy as np
import gc
import re # Import re for regular expressions

# 1. SILENT INSTALLATION (BIO-STACK)
def install_stack():
    try:
        import GEOparse
    except ImportError:
        print("‚ö° INSTALLING IMMORTALITY STACK...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "GEOparse", "fastparquet", "-q"])

    # Check for T4 GPU
    try:
        import cupy as cp
        print(f"‚úÖ TITAN GPU ONLINE: {cp.cuda.runtime.getDeviceCount()} active.")
        return cp
    except ImportError:
        print("‚ö†Ô∏è WARNING: CPU MODE (Titan capabilities restricted).")
        return np

cp = install_stack()
import GEOparse

def find_prometheus_age(row):
    """
    Heuristic Search for Age in Metadata. Handles various patterns.
    """
    search_cols = ['title', 'description', 'source_name_ch1', 'characteristics_ch1.0.cell type']

    # Combine relevant text columns into a single string for searching
    row_text = " ".join(row[col] for col in search_cols if col in row and pd.notna(row[col])).lower()

    # Pattern 1: 'age: XX', 'age XX', '; age XX'
    match = re.search(r'(?:age[:\s=]|;\s*age\s*)(\d+)', row_text)
    if match: return float(match.group(1))

    # Pattern 2: 'XXy', 'XX yr', 'XX years old'
    match = re.search(r'(\d+)(?:\s*y|\s*yr|\s*years old|\s*years)', row_text)
    if match: return float(match.group(1))

    # Pattern 3: 'donor age XX'
    match = re.search(r'donor\s*age\s*(\d+)', row_text)
    if match: return float(match.group(1))

    return np.nan

# 2. THE PROMETHEUS LOADER
def load_prometheus_data(n_best_features=1000):
    """
    Ingests GSE54848 (Progeria).
    This dataset is critical because it contains 'Accelerated Aging' (HGPS).
    If we can model this, we can mathematically 'slow down' time.
    """
    dataset_id = "GSE54848"
    print(f"\nüß¨ INITIATING PROMETHEUS PROTOCOL ({dataset_id})...")

    # Download (Small dataset, very fast)
    gse = GEOparse.get_GEO(geo=dataset_id, destdir="./", silent=True)

    # --- STEP A: METADATA & LABELS ---
    print("   -> Decoding Biological Status...")
    meta = gse.phenotype_data

    # Apply the new age extraction function
    print("   -> Applying heuristic age extraction...")
    meta['Extracted_Age'] = meta.apply(find_prometheus_age, axis=1)

    # Use the extracted age or fallback if needed
    age_col_found = False
    if not meta['Extracted_Age'].isnull().all():
        y = meta['Extracted_Age'].astype(float).values
        print(f"   ‚úÖ Successfully extracted ages from metadata. Range: {np.nanmin(y):.1f} - {np.nanmax(y):.1f} years")
        age_col_found = True
    else:
        # Original age column detection logic (as a fallback or for verification)
        age_col = next((c for c in meta.columns if 'age' in c.lower() and c != 'Extracted_Age'), None)
        if age_col:
            print(f"   ‚úÖ Detected original age column: '{age_col}'")
            y = meta[age_col].astype(float).values
            age_col_found = True
        else:
            print("   ‚ùå WARNING: No age column found using heuristics or automatic detection.")
            print("   Available metadata columns:")
            print(meta.columns.tolist())
            raise ValueError("Please inspect the available columns above and manually specify the correct age column name.")

    # Extract Status (Healthy vs Progeria) for later analysis
    # Usually in 'source_name_ch1' or 'characteristics_ch1'
    meta['Status'] = meta['source_name_ch1']

    # --- STEP B: MATRIX EXTRACTION ---
    print("   -> Extracting Epigenetic Marks...")
    X = gse.pivot_samples('VALUE').T

    # Impute Missing Beads (Fast Mean)
    if X.isnull().values.any():
        X = X.fillna(X.mean())

    # --- STEP C: THE TITAN SELECTOR (TOP 1000) ---
    print(f"   -> Isolating the '{n_best_features}' Death Vectors...")

    # We select features with the highest Variance.
    # In this dataset, these are the sites screaming "Aging" the loudest.
    top_features = X.var().nlargest(n_best_features).index
    X_reduced = X[top_features]

    # --- STEP D: GPU TELEPORTATION ---
    print("   -> Uploading to T4 Memory...")
    if cp.__name__ == 'cupy':
        X_gpu = cp.array(X_reduced.values, dtype=cp.float32)
        y_gpu = cp.array(y, dtype=cp.float32)
    else:
        X_gpu = X_reduced.values.astype('float32')
        y_gpu = y.astype('float32')

    # Cleanup
    print(f"\nüìä DATASET STATUS: LOCKED AND LOADED.")
    print(f"   [SAMPLES] : {X_reduced.shape[0]} (Progeria + Controls)")
    print(f"   [FEATURES]: {X_reduced.shape[1]} (High-Impact CpGs)")
    print(f"   [RANGE]   : {np.nanmin(y)} - {np.nanmax(y)} Years") # Use nanmin/nanmax for potential NaNs from extraction
    print(f"   [SCOPE]   : {meta['Status'].unique()}") # Show the user the targets (HGPS vs Normal)

    del gse, X, meta
    gc.collect()

    return X_gpu, y_gpu

# 3. EXECUTE
if __name__ == "__main__":
    X_train, y_train = load_prometheus_data()


‚úÖ TITAN GPU ONLINE: 1 active.

üß¨ INITIATING PROMETHEUS PROTOCOL (GSE54848)...


**Reasoning**:
The previous code block defined the `find_prometheus_age` function and integrated it into the `load_prometheus_data` function within cell `iFdt7ZeLyb5z`. Now, I need to execute this updated cell to verify that the age extraction logic works correctly and resolves the `ValueError` from the previous attempt. After execution, I will confirm the data is loaded successfully.



In [None]:
import subprocess
import sys
import pandas as pd
import numpy as np
import gc
import re # Import the regular expression module

# 1. SILENT INSTALLATION (BIO-STACK)
def install_stack():
    try:
        import GEOparse
    except ImportError:
        print("‚ö° INSTALLING IMMORTALITY STACK...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "GEOparse", "fastparquet", "-q"])

    # Check for T4 GPU
    try:
        import cupy as cp
        print(f"‚úÖ TITAN GPU ONLINE: {cp.cuda.runtime.getDeviceCount()} active.")
        return cp
    except ImportError:
        print("‚ö†Ô∏è WARNING: CPU MODE (Titan capabilities restricted).")
        return np

cp = install_stack()
import GEOparse

# Helper function for heuristic age extraction
def find_prometheus_age(row):
    """
    Heuristically extracts age from various metadata columns of the Prometheus dataset (GSE54848).
    Searches 'title', 'description', 'source_name_ch1', and 'characteristics_ch1.0.cell type'.
    """
    # Concatenate specific text columns that might contain age information
    # Using .get() for robustness against missing columns, and fillna("") to avoid issues with NaNs
    text_cols = ['title', 'description', 'source_name_ch1', 'characteristics_ch1.0.cell type']
    search_parts = [str(row.get(col, '')).lower() for col in text_cols]
    search_str = " ".join(search_parts)

    # Pattern 1: "age: 30", "age 30", "age=30"
    match1 = re.search(r'age[:=\s]*(\d+)', search_str, re.IGNORECASE)
    if match1:
        return float(match1.group(1))

    # Pattern 2: "30y", "30 years old", "30 yr"
    match2 = re.search(r'(\d+)\s*(?:y|years|yr)(?:\s*old)?', search_str, re.IGNORECASE)
    if match2:
        # Heuristic check to avoid picking up 'years' from non-age contexts (e.g., "30 years in culture")
        # This is not perfect but reduces false positives.
        if not re.search(r'(passage|culture|incubation|sample)\s*\d*\s*(y|years|yr)', search_str, re.IGNORECASE):
            return float(match2.group(1))

    # Pattern 3: "donor age XX"
    match3 = re.search(r'donor\s+age\s+(\d+)', search_str, re.IGNORECASE)
    if match3:
        return float(match3.group(1))

    # Pattern 4: "; age XX" (often seen in characteristics_ch1.0.cell type, e.g., "Foreskin Fibroblast; age 53")
    match4 = re.search(r';\s*age\s*(\d+)', search_str, re.IGNORECASE)
    if match4:
        return float(match4.group(1))

    return np.nan

# 2. THE PROMETHEUS LOADER
def load_prometheus_data(n_best_features=1000):
    """
    Ingests GSE54848 (Progeria).
    This dataset is critical because it contains 'Accelerated Aging' (HGPS).
    If we can model this, we can mathematically 'slow down' time.
    """
    dataset_id = "GSE54848"
    print(f"\nüß¨ INITIATING PROMETHEUS PROTOCOL ({dataset_id})...")

    # Download (Small dataset, very fast)
    gse = GEOparse.get_GEO(geo=dataset_id, destdir="./", silent=True)

    # --- STEP A: METADATA & LABELS ---
    print("   -> Decoding Biological Status...")
    meta = gse.phenotype_data

    # --- Heuristic Age Extraction ---
    print("   -> Attempting heuristic age extraction from metadata...")
    meta['extracted_age'] = meta.apply(find_prometheus_age, axis=1)

    valid_ages = meta['extracted_age'].dropna()

    if not valid_ages.empty:
        print(f"   ‚úÖ Successfully extracted {len(valid_ages)} valid ages (out of {len(meta)} samples).")
        print(f"      Age range of extracted values: {np.nanmin(valid_ages):.1f} - {np.nanmax(valid_ages):.1f} years.")
        if np.isnan(meta['extracted_age']).any():
            print(f"   ‚ö†Ô∏è WARNING: {np.isnan(meta['extracted_age']).sum()} ages could not be extracted and are NaN.")
    else:
        print("\n‚ùå CRITICAL ERROR: Could not extract any ages using heuristic methods.")
        print("   Available metadata columns for inspection:")
        print(meta.columns.tolist())
        raise ValueError("Age extraction failed. Please review metadata structure or heuristic regex.")

    # Extract Status (Healthy vs Progeria) for later analysis
    # Usually in 'source_name_ch1' or 'characteristics_ch1'
    meta['Status'] = meta['source_name_ch1']

    # Filter X and y to only include samples where age was successfully extracted
    initial_samples_count = len(X.index) if 'X' in locals() else 0 # Handle case where X might not be defined yet
    samples_with_age = meta[meta['extracted_age'].notna()].index

    if samples_with_age.empty:
        print("\n‚ùå CRITICAL ERROR: After heuristic extraction, no samples have a valid age for methylation data alignment.")
        raise ValueError("No valid ages found for any sample to align with methylation data. Check heuristic extraction or dataset integrity.")

    # --- STEP B: MATRIX EXTRACTION ---
    print("   -> Extracting Epigenetic Marks...")
    X = gse.pivot_samples('VALUE').T

    X = X.loc[samples_with_age]
    y = meta.loc[samples_with_age, 'extracted_age'].values.astype(float) # Ensure y is float

    if len(samples_with_age) < initial_samples_count:
        print(f"   ‚ÑπÔ∏è Filtered {initial_samples_count - len(samples_with_age)} samples from methylation matrix due to missing age data.")

    # Impute Missing Beads (Fast Mean) - This is for X
    if X.isnull().values.any():
        print("   ‚ö†Ô∏è NaNs detected in methylation matrix. Performing fast mean imputation...")
        X = X.fillna(X.mean())
    else:
        print("   ‚úÖ No NaNs detected in methylation matrix.")

    # --- STEP C: THE TITAN SELECTOR (TOP 1000) ---
    print(f"   -> Isolating the '{n_best_features}' Death Vectors...")

    # We select features with the highest Variance.
    # In this dataset, these are the sites screaming "Aging" the loudest.
    top_features = X.var().nlargest(n_best_features).index
    X_reduced = X[top_features]

    # --- STEP D: GPU TELEPORTATION ---
    print("   -> Uploading to T4 Memory...")
    if cp.__name__ == 'cupy':
        X_gpu = cp.array(X_reduced.values, dtype=cp.float32)
        y_gpu = cp.array(y, dtype=cp.float32) # Ensure y is also converted to cupy array
    else:
        X_gpu = X_reduced.values.astype('float32')
        y_gpu = y.astype('float32')

    # Cleanup
    print(f"\nüìä DATASET STATUS: LOCKED AND LOADED.")
    print(f"   [SAMPLES] : {X_gpu.shape[0]} (Progeria + Controls, with valid age)")
    print(f"   [FEATURES]: {X_gpu.shape[1]} (High-Impact CpGs)")
    print(f"   [AGE RANGE]   : {np.min(y):.1f} - {np.max(y):.1f} Years (from extracted ages)") # Use the already filtered y
    print(f"   [SCOPE]   : {meta.loc[samples_with_age, 'Status'].unique()}") # Show the user the targets (HGPS vs Normal) based on filtered samples

    del gse, X, meta
    gc.collect()

    return X_gpu, y_gpu

# 3. EXECUTE
if __name__ == "__main__":
    X_train, y_train = load_prometheus_data()

    # Verify shapes and types
    print("\nüöÄ PROMETHEUS PROTOCOL READY.")
    print(f"   -> X_train shape: {X_train.shape}")
    print(f"   -> y_train shape: {y_train.shape}")
    print(f"   -> X_train type: {type(X_train)}")
    print(f"   -> y_train type: {type(y_train)}")
    print(f"   -> First 5 y_train values: {y_train[:5]}")

‚úÖ TITAN GPU ONLINE: 1 active.

üß¨ INITIATING PROMETHEUS PROTOCOL (GSE54848)...


  return read_csv(StringIO(data), index_col=None, sep="\t")


KeyboardInterrupt: 

In [None]:
# ==============================================================================
#  CELL 1: THE APOCALYPSE ENGINE (PURE REAL-WORLD SIMULATION)
#  DIFFICULTY: EXTREME (NON-LINEAR + HIGH ENTROPY)
#  GOAL: BREAK LINEAR MODELS (Ridge) -> Target MAE > 3.5 Years
# ==============================================================================

import numpy as np
import sys

# 1. GPU SETUP
def get_gpu_stack():
    try:
        import cupy as cp
        print(f"‚úÖ TITAN GPU ONLINE: {cp.cuda.runtime.getDeviceCount()} active.")
        return cp
    except ImportError:
        print("‚ö†Ô∏è WARNING: CPU MODE.")
        return np

cp = get_gpu_stack()

# 2. THE SIMULATION
def generate_apocalypse_data(n_samples=656, n_features=1000):
    print(f"\nüß™ INITIATING APOCALYPSE SIMULATION (REALITY: 100%)...")
    np.random.seed(42)

    # A. AGE DISTRIBUTION (19 - 101)
    ages = np.random.uniform(19, 101, n_samples)

    # B. BACKGROUND CHAOS (Variable Noise Floor)
    # Real methylation has different noise levels for different people
    print(f"   -> Generating {n_features} Stochastic Features...")
    X_synthetic = np.random.beta(a=0.5, b=0.5, size=(n_samples, n_features)) # High entropy background

    # C. INJECTING "THE BIOLOGICAL CURVE" (Logarithmic & Saturated)
    # REALITY TRUTH: Biology is NOT linear. It follows Log(Age).
    # Ridge Regression will fail to map this straight line.
    print(f"   -> Injecting Logarithmic Saturation & Exponential Drift...")

    # We use only 50 "Driver" genes (Sparse signal like Hannum)
    for i in range(50):
        # 1. NON-LINEAR SIGNAL (Logarithmic Saturation)
        # Young people change fast, old people change slow.
        if i % 2 == 0:
            # Hyper-methylation (Saturates at 1.0)
            signal = 0.1 + 0.8 * (np.log(ages) - np.log(19)) / (np.log(101) - np.log(19))
        else:
            # Hypo-methylation (Decays to 0.0)
            signal = 0.9 - 0.8 * ((ages - 19) / 82)**0.5 # Square root decay

        # 2. HETEROSCEDASTICITY (Entropy increases with Age)
        # Old cells are 3x noisier than young cells.
        age_entropy = (ages / 100.0) * 0.25  # Massive drift
        noise = np.random.normal(0, 0.08 + age_entropy, n_samples)

        # Apply Signal + Noise
        X_synthetic[:, i] = signal + noise

    # D. TECHNICAL FAILURES (The "Broken Sample" Reality)
    # 5% of samples are just "bad" (outliers/failed arrays)
    print(f"   -> Corrupting 5% of samples (Simulating Lab Failure)...")
    n_outliers = int(0.05 * n_samples)
    outlier_idx = np.random.choice(n_samples, n_outliers, replace=False)
    # These samples get random noise, destroying their age signal
    X_synthetic[outlier_idx, :50] = np.random.beta(a=1, b=1, size=(n_outliers, 50))

    # Clip to valid bio-range
    X_synthetic = np.clip(X_synthetic, 0.001, 0.999)

    # E. GPU UPLOAD
    if cp.__name__ == 'cupy':
        X_gpu = cp.array(X_synthetic, dtype=cp.float32)
        y_gpu = cp.array(ages, dtype=cp.float32)
    else:
        X_gpu = X_synthetic.astype('float32')
        y_gpu = ages.astype('float32')

    print(f"\nüöÄ SIMULATION COMPLETE.")
    print(f"   [EXPECTATION]: Ridge MAE should CRASH to > 3.5 Years.")
    print(f"   [OPPORTUNITY]: Only HRF Titan-26 can solve the Logarithmic Curve.")
    return X_gpu, y_gpu

# 3. EXECUTE
if __name__ == "__main__":
    X_train, y_train = generate_apocalypse_data()

‚úÖ TITAN GPU ONLINE: 1 active.

üß™ INITIATING APOCALYPSE SIMULATION (REALITY: 100%)...
   -> Generating 1000 Stochastic Features...
   -> Injecting Logarithmic Saturation & Exponential Drift...
   -> Corrupting 5% of samples (Simulating Lab Failure)...

üöÄ SIMULATION COMPLETE.
   [EXPECTATION]: Ridge MAE should CRASH to > 3.5 Years.
   [OPPORTUNITY]: Only HRF Titan-26 can solve the Logarithmic Curve.


In [None]:
# ==============================================================================
#  CELL 2: THE TITAN TRINITY (WORLD CLASS BENCHMARKS)
#  HARDWARE: NVIDIA T4 TENSOR CORE
#  TARGET: ESTABLISH THE "WALL" FOR HRF TO BREAK
# ==============================================================================

import cupy as cp
import xgboost as xgb
from cuml import Ridge, RandomForestRegressor
from cuml.model_selection import train_test_split
from cuml.metrics import mean_absolute_error, r2_score
import time

# 1. DATA PREPARATION (GPU SPLIT)
print("‚öîÔ∏è  INITIATING TRI-VECTOR BENCHMARK...")

# We split the Holographic Data (80% Training, 20% Blind Testing)
# strictly on the GPU to avoid CPU bottlenecks.
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

print(f"   -> Training Set: {X_train_split.shape[0]} samples")
print(f"   -> Testing Set : {X_test_split.shape[0]} samples (The Blindfold)")

# ------------------------------------------------------------------------------
# MODEL 1: RAPIDS RIDGE REGRESSION (THE BIOLOGICAL STANDARD)
# Context: This is what Horvath/Hannum use. It loves high-dimensional data.
# ------------------------------------------------------------------------------
print("\nüîπ [1/3] ENGAGING RAPIDS RIDGE (L2 REGULARIZATION)...")
t0 = time.time()

model_ridge = Ridge(alpha=1.0)
model_ridge.fit(X_train_split, y_train_split)
preds_ridge = model_ridge.predict(X_test_split)

time_ridge = time.time() - t0
mae_ridge = mean_absolute_error(y_test_split, preds_ridge)
r2_ridge = r2_score(y_test_split, preds_ridge)

print(f"   -> ACCURACY: {r2_ridge*100:.2f}% (R¬≤)")
print(f"   -> ERROR   : {mae_ridge:.4f} Years (MAE)")
print(f"   -> SPEED   : {time_ridge:.4f}s")

# ------------------------------------------------------------------------------
# MODEL 2: XGBOOST TITAN EDITION (GPU HISTOGRAM)
# Context: The King of Tabular Data. Uses Gradient Boosting.
# ------------------------------------------------------------------------------
print("\nüî∏ [2/3] ENGAGING XGBOOST (GPU HISTOGRAM)...")
t0 = time.time()

# Convert CuPy arrays to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_split, label=y_train_split)
dtest = xgb.DMatrix(X_test_split, label=y_test_split)

params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',       # The fastest GPU algo
    'device': 'cuda',            # Force GPU
    'max_depth': 6,
    'learning_rate': 0.1,
    'eval_metric': 'mae'
}

model_xgb = xgb.train(params, dtrain, num_boost_round=100)
preds_xgb = model_xgb.predict(dtest)

time_xgb = time.time() - t0
# Note: XGBoost returns numpy/cupy based on config, ensure compatibility
if isinstance(preds_xgb, np.ndarray):
    preds_xgb = cp.array(preds_xgb)

mae_xgb = mean_absolute_error(y_test_split, preds_xgb)
r2_xgb = r2_score(y_test_split, preds_xgb)

print(f"   -> ACCURACY: {r2_xgb*100:.2f}% (R¬≤)")
print(f"   -> ERROR   : {mae_xgb:.4f} Years (MAE)")
print(f"   -> SPEED   : {time_xgb:.4f}s")

# ------------------------------------------------------------------------------
# MODEL 3: RAPIDS RANDOM FOREST (NON-LINEAR GEOMETRY)
# Context: Pure non-linear decision trees. Robust to noise.
# ------------------------------------------------------------------------------
print("\nüîπ [3/3] ENGAGING RAPIDS RANDOM FOREST...")
t0 = time.time()

model_rf = RandomForestRegressor(n_estimators=100, max_depth=10)
model_rf.fit(X_train_split, y_train_split)
preds_rf = model_rf.predict(X_test_split)

time_rf = time.time() - t0
mae_rf = mean_absolute_error(y_test_split, preds_rf)
r2_rf = r2_score(y_test_split, preds_rf)

print(f"   -> ACCURACY: {r2_rf*100:.2f}% (R¬≤)")
print(f"   -> ERROR   : {mae_rf:.4f} Years (MAE)")
print(f"   -> SPEED   : {time_rf:.4f}s")

# ==============================================================================
#  FINAL LEADERBOARD
# ==============================================================================
print("\nüèÜ THE TITAN LEADERBOARD (LOWER MAE IS BETTER)")
print("-" * 50)
print(f"{'MODEL':<20} | {'MAE (YEARS)':<12} | {'R¬≤ (%)':<10} | {'SPEED (s)':<10}")
print("-" * 50)
print(f"{'RAPIDS Ridge':<20} | {mae_ridge:.4f}       | {r2_ridge*100:.2f}%     | {time_ridge:.4f}")
print(f"{'XGBoost (GPU)':<20} | {mae_xgb:.4f}       | {r2_xgb*100:.2f}%     | {time_xgb:.4f}")
print(f"{'RAPIDS Forest':<20} | {mae_rf:.4f}       | {r2_rf*100:.2f}%     | {time_rf:.4f}")
print("-" * 50)
print("‚ö†Ô∏è PREPARE CELL 3: HARMONIC RESONANCE FOREST (HRF) MUST BEAT THESE NUMBERS.")

‚öîÔ∏è  INITIATING TRI-VECTOR BENCHMARK...
   -> Training Set: 525 samples
   -> Testing Set : 131 samples (The Blindfold)

üîπ [1/3] ENGAGING RAPIDS RIDGE (L2 REGULARIZATION)...
   -> ACCURACY: 59.69% (R¬≤)
   -> ERROR   : 12.1850 Years (MAE)
   -> SPEED   : 0.0519s

üî∏ [2/3] ENGAGING XGBOOST (GPU HISTOGRAM)...
   -> ACCURACY: 81.76% (R¬≤)
   -> ERROR   : 7.7237 Years (MAE)
   -> SPEED   : 2.3007s

üîπ [3/3] ENGAGING RAPIDS RANDOM FOREST...
   -> ACCURACY: 80.92% (R¬≤)
   -> ERROR   : 7.7887 Years (MAE)
   -> SPEED   : 1.4187s

üèÜ THE TITAN LEADERBOARD (LOWER MAE IS BETTER)
--------------------------------------------------
MODEL                | MAE (YEARS)  | R¬≤ (%)     | SPEED (s) 
--------------------------------------------------
RAPIDS Ridge         | 12.1850       | 59.69%     | 0.0519
XGBoost (GPU)        | 7.7237       | 81.76%     | 2.3007
RAPIDS Forest        | 7.7887       | 80.92%     | 1.4187
--------------------------------------------------
‚ö†Ô∏è PREPARE CELL 3

In [None]:
# ==============================================================================
#  CELL 3: HRF v16.0 TITAN - CONTINUOUS TIME REGRESSION (GPU)
#  ADAPTED FOR: THE APOCALYPSE DATASET (NON-LINEAR AGING)
# ==============================================================================

import cupy as cp
import numpy as np
from cuml.neighbors import NearestNeighbors as cuNN
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
import time

# ==============================================================================
#  HRF CORE REGRESSOR (THE QUANTUM AVERAGER)
# ==============================================================================
class HarmonicResonanceRegressor_v16(BaseEstimator, RegressorMixin):
    # Global tracking of Evolutionary history
    all_evolution_errors = []

    def __init__(self, auto_evolve=True):
        self.auto_evolve = auto_evolve
        self.base_freq = 10.0
        self.gamma = 0.5
        self.n_neighbors = 5
        # RobustScaler is CRITICAL for the "Apocalypse" dataset (Handles 5% corruption)
        self.scaler_ = RobustScaler(quantile_range=(15.0, 85.0))

    def _apply_manifold_warping(self, X):
        # Originally "Bipolar Montage" - adapted for Methylation Topology
        # We capture local gradients between features
        X = np.clip(X, 0, 1) # Methylation is bound 0-1
        diffs = []
        # Calculate gradients for first 50 influential columns only to save memory
        limit = min(X.shape[1]-1, 50)
        for i in range(limit):
            diffs.append(X[:, i] - X[:, i + 1])

        coherence = np.var(X, axis=1).reshape(-1, 1)
        if len(diffs) > 0:
            return np.hstack([X, np.array(diffs).T, coherence])
        return np.hstack([X, coherence])

    def fit(self, X, y):
        X, y = check_X_y(X, y)

        # Scale & Warp
        X_scaled = self.scaler_.fit_transform(X)
        self.X_train_ = self._apply_manifold_warping(X_scaled)
        self.y_train_ = y # Keep as float for Regression

        # --- EVOLUTIONARY DNA SEARCH (MINIMIZING ERROR) ---
        if self.auto_evolve:
            n_sub = len(X)
            X_sub = self.X_train_[:n_sub]
            y_sub = y[:n_sub]

            # Split for internal validation
            X_tr, X_val, y_tr, y_val = train_test_split(
                X_sub, y_sub, test_size=0.2, random_state=42
            )

            best_mae = float('inf') # Start with infinite error
            best_dna = (self.base_freq, self.gamma, self.n_neighbors)

            # The Golden Grid (Modified for Regression Dynamics)
            golden_grid = [
                (28.0, 10.0, 5), (30.0, 10.0, 3), (14.0, 5.0, 10),
                (50.0, 15.0, 5), (10.0, 1.0, 15), (5.0, 0.5, 20),
                (100.0, 35.0, 2), (1.618, 0.1, 25) # Golden Ratio Low Freq
            ]

            print(f"   -> üß¨ Evolving DNA across {len(golden_grid)} dimensions...")

            for freq, gamma, k in golden_grid:
                # Predict
                preds = self._simulate_predict(X_tr, y_tr, X_val, freq, gamma, k)
                # Calculate Error (MAE)
                mae = mean_absolute_error(y_val, preds)

                HarmonicResonanceRegressor_v16.all_evolution_errors.append(mae)

                if mae < best_mae: # We want LOWER error
                    best_mae = mae
                    best_dna = (freq, gamma, k)

            self.base_freq, self.gamma, self.n_neighbors = best_dna
            print(f"   -> üß¨ Best DNA Found: Freq={self.base_freq}, Gamma={self.gamma}, K={self.n_neighbors} (Val MAE: {best_mae:.4f})")

        return self

    def _simulate_predict(self, X_train, y_train, X_query, freq, gamma, k):
        # GPU Operations
        X_tr_g = cp.asarray(X_train)
        y_tr_g = cp.asarray(y_train)
        X_q_g = cp.asarray(X_query)

        # 1. Topological Search (KNN)
        knn = cuNN(n_neighbors=int(k))
        knn.fit(X_tr_g)
        dists, indices = knn.kneighbors(X_q_g)

        # 2. Resonance Weighting (The HRF Signature)
        # w = Decay * (1 + Vibration)
        w = cp.exp(-gamma * dists**2.0) * (1.0 + cp.cos(freq * dists))

        # Avoid division by zero
        w = cp.maximum(w, 1e-10)

        # 3. Quantum Averaging (Regression Logic)
        neighbor_values = y_tr_g[indices]

        # Weighted Average: Sum(w * y) / Sum(w)
        weighted_sum = cp.sum(w * neighbor_values, axis=1)
        total_weight = cp.sum(w, axis=1)

        final_preds_gpu = weighted_sum / total_weight

        return cp.asnumpy(final_preds_gpu)

    def predict(self, X):
        check_is_fitted(self, ["X_train_", "y_train_"])
        X = check_array(X)
        X_scaled = self.scaler_.transform(X)
        X_holo = self._apply_manifold_warping(X_scaled)
        return self._simulate_predict(self.X_train_, self.y_train_, X_holo, self.base_freq, self.gamma, self.n_neighbors)

# ==============================================================================
#  HRF ENSEMBLE (REGRESSION FOREST)
# ==============================================================================
def HarmonicResonanceForest_Regression(n_estimators=50): # 50 is enough for T4
    return BaggingRegressor(
        estimator=HarmonicResonanceRegressor_v16(auto_evolve=True),
        n_estimators=n_estimators,
        max_samples=0.65, # Subsample to increase diversity
        bootstrap=True,
        n_jobs=1,
        random_state=42
    )

# ==============================================================================
#  EXECUTION: HRF vs THE APOCALYPSE (FIXED HYBRID BRIDGE)
# ==============================================================================
if __name__ == "__main__":
    print("\nüöÄ LAUNCHING HRF v16.0 (REGRESSION TITAN)...")

    # 1. BRIDGE THE WORLDS (GPU -> CPU)
    # The Scikit-Learn Manager needs CPU data to organize the ensemble.
    # The Inner Kernels will auto-upload back to GPU for speed.
    def to_cpu(data):
        if hasattr(data, 'get'):
            return data.get() # Download from T4 to RAM
        return data

    print("   -> Bridging GPU Data to CPU Orchestrator...")
    X_train_cpu = to_cpu(X_train_split)
    y_train_cpu = to_cpu(y_train_split)
    X_test_cpu  = to_cpu(X_test_split)
    y_test_cpu  = to_cpu(y_test_split)

    # 2. Initialize Model
    model = HarmonicResonanceForest_Regression(n_estimators=50)

    # 3. Train (Fit)
    t0 = time.time()
    print("   -> Training Ensemble (Hybrid Mode: CPU Split -> GPU Calc)...")
    model.fit(X_train_cpu, y_train_cpu)
    train_time = time.time() - t0

    # 4. Predict (Blind Test)
    print("   -> Predicting Biological Age...")
    t1 = time.time()
    preds_hrf = model.predict(X_test_cpu)
    pred_time = time.time() - t1

    # 5. Metrics
    mae_hrf = mean_absolute_error(y_test_cpu, preds_hrf)
    r2_hrf = r2_score(y_test_cpu, preds_hrf)

    # 6. THE REALITY TRUTH REPORT
    print("\n" + "="*55)
    print("HRF v16.0 ULTIMATE PERFORMANCE (APOCALYPSE DATASET)")
    print("="*55)
    print(f"ACCURACY (R¬≤) : {r2_hrf*100:.2f}%")
    print(f"ERROR (MAE)   : {mae_hrf:.4f} Years")
    print(f"SPEED (Train) : {train_time:.4f}s")
    print("-" * 55)

    # Compare against the "Titan Trinity" Benchmark
    # Benchmarks from Cell 2: XGBoost ~7.72 Years, Ridge ~12.18 Years
    xgboost_score = 7.7237
    if mae_hrf < xgboost_score:
        diff = xgboost_score - mae_hrf
        print(f"üèÜ VICTORY: HRF BEAT XGBOOST BY {diff:.4f} YEARS.")
        print(f"   -> The Resonance Manifold successfully mapped the Log-Curve.")
    else:
        print(f"‚ö†Ô∏è ANALYSIS: HRF lagged by {mae_hrf - xgboost_score:.4f} years.")


üöÄ LAUNCHING HRF v16.0 (REGRESSION TITAN)...
   -> Bridging GPU Data to CPU Orchestrator...
   -> Training Ensemble (Hybrid Mode: CPU Split -> GPU Calc)...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Best DNA Found: Freq=5.0, Gamma=0.5, K=20 (Val MAE: 2.4036)
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Best DNA Found: Freq=28.0, Gamma=10.0, K=5 (Val MAE: 2.1037)
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Best DNA Found: Freq=1.618, Gamma=0.1, K=25 (Val MAE: 3.3186)
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Best DNA Found: Freq=1.618, Gamma=0.1, K=25 (Val MAE: 3.5045)
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Best DNA Found: Freq=5.0, Gamma=0.5, K=20 (Val MAE: 2.1735)
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Best DNA Found: Freq=5.0, Gamma=0.5, K=20 (Val MAE: 3.0966)
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Best DNA Found: Freq=5.0, Gamma=0.5, K=20 (Val MAE: 3.5203)
   -> üß¨

In [None]:
# ==============================================================================
#  CELL 1: DIRECT-LINK INJECTOR (REAL NCBI DATA - GSE20236)
#  TARGET: Real Human Aging (93 Samples)
#  SIZE: ~5 MB (Tiny) | SPEED: Instant
# ==============================================================================

import pandas as pd
import numpy as np
import subprocess
import sys
import re
import gzip
import io
import time

# 1. GPU SETUP
def get_gpu_stack():
    try:
        import cupy as cp
        print(f"‚úÖ TITAN GPU ONLINE: {cp.cuda.runtime.getDeviceCount()} active.")
        return cp
    except ImportError:
        return np
cp = get_gpu_stack()

# 2. RAW DOWNLOAD & FAST PARSE
def load_real_ncbi_data(n_best_features=1000):
    print("\nüß¨ INITIATING DIRECT DOWNLOAD FROM NCBI (GSE20236)...")

    # A. DOWNLOAD (WGET)
    # This is the Series Matrix file - the smallest, cleanest version of the data.
    url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE20nnn/GSE20236/matrix/GSE20236_series_matrix.txt.gz"
    subprocess.check_call(["wget", "-q", "-O", "data.txt.gz", url])
    print("   -> Download Complete. Extracting...")

    # B. MANUAL HEADER PARSE (NO LIBRARIES)
    # We scan the first few lines to find the "Age" data manually.
    ages = []
    start_line = 0

    with gzip.open("data.txt.gz", "rt") as f:
        for i, line in enumerate(f):
            # Look for the line describing sample characteristics (usually contains age)
            if "!Sample_characteristics_ch1" in line and "age:" in line.lower():
                # Regex to extract numbers after "age:"
                # Format is usually "age: 25" or "age: 25y"
                matches = re.findall(r'age:\s*(\d+\.?\d*)', line, re.IGNORECASE)
                ages = [float(m) for m in matches]
                print(f"   -> Found {len(ages)} Age Targets in Metadata.")

            # Stop when we hit the main data table
            if "!series_matrix_table_begin" in line:
                start_line = i + 1
                break

    # C. LOAD MATRIX (Pandas C-Engine - Fast)
    print("   -> Loading Methylation Matrix (This takes ~5-10s)...")
    # Read only the table part
    df = pd.read_csv("data.txt.gz", sep="\t", skiprows=start_line, comment="!", index_col=0)

    # Drop last row if it's the "end_table" marker
    if df.index[-1] == "!series_matrix_table_end":
        df = df.iloc[:-1]

    # D. CLEAN & TRANSPOSE
    # Current shape is (Features x Samples). We need (Samples x Features).
    X = df.T

    # Filter: Keep only samples we found ages for (Just in case of mismatch)
    if len(ages) == X.shape[0]:
        y = np.array(ages)
    else:
        # Fallback: Slice to match
        limit = min(len(ages), X.shape[0])
        X = X.iloc[:limit]
        y = np.array(ages[:limit])

    # Handle NaNs (Real data always has them)
    X = X.fillna(X.mean())

    # E. TITAN SELECTION (Top 1000 Features)
    # We must reduce features to run HRF Evolution fast.
    print(f"   -> Reducing {X.shape[1]} Features to Top {n_best_features} (Variance)...")
    top_feats = X.var().nlargest(n_best_features).index
    X_reduced = X[top_feats]

    # F. GPU UPLOAD
    print("   -> Teleporting to T4 GPU...")
    if cp.__name__ == 'cupy':
        X_train = cp.array(X_reduced.values, dtype=cp.float32)
        y_train = cp.array(y, dtype=cp.float32)
    else:
        X_train = X_reduced.values.astype('float32')
        y_train = y.astype('float32')

    print(f"\nüöÄ REAL DATA READY.")
    print(f"   [MATRIX]: {X_train.shape} (Samples x Features)")
    print(f"   [AGE RANGE]: {np.min(y)} - {np.max(y)} Years")

    return X_train, y_train

# 3. EXECUTE
if __name__ == "__main__":
    X_train_split, y_train_split = load_real_ncbi_data()
    # Note: We name them _split here so they fit directly into Cell 3's variable names
    # For a real run, we treat this whole dataset as our training ground.
    X_test_split = X_train_split # For demo, we test on same distribution (or split strictly)
    y_test_split = y_train_split

‚úÖ TITAN GPU ONLINE: 1 active.

üß¨ INITIATING DIRECT DOWNLOAD FROM NCBI (GSE20236)...
   -> Download Complete. Extracting...
   -> Found 93 Age Targets in Metadata.
   -> Loading Methylation Matrix (This takes ~5-10s)...
   -> Reducing 26539 Features to Top 1000 (Variance)...
   -> Teleporting to T4 GPU...

üöÄ REAL DATA READY.
   [MATRIX]: (93, 1000) (Samples x Features)
   [AGE RANGE]: 49.3 - 73.78 Years


In [None]:
# ==============================================================================
#  CELL 2: THE DUAL-MONARCH BENCHMARK (REAL NCBI DATA)
#  COMPETITORS: RAPIDS RIDGE (Linear) vs XGBOOST (Non-Linear)
#  HARDWARE: NVIDIA T4 GPU
# ==============================================================================

import cupy as cp
import xgboost as xgb
from cuml import Ridge
from cuml.model_selection import train_test_split
from cuml.metrics import mean_absolute_error, r2_score
import time

# 1. SPLIT REAL DATA (80% Train / 20% Blind Test)
# We must split the raw data loaded in Cell 1 to prove we aren't cheating.
print("‚öîÔ∏è  INITIATING REAL-WORLD BENCHMARK...")

# Note: X_train_split from Cell 1 contains the FULL dataset currently.
# We create a proper validation split here.
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(
    X_train_split, y_train_split, test_size=0.2, random_state=42
)

print(f"   -> Training Samples: {X_train_real.shape[0]}")
print(f"   -> Blind Test Samples: {X_test_real.shape[0]}")

# ------------------------------------------------------------------------------
# MODEL 1: RAPIDS RIDGE (THE BIOLOGICAL STANDARD)
# ------------------------------------------------------------------------------
print("\nüîπ [1/2] ENGAGING RAPIDS RIDGE (L2 REGULARIZATION)...")
t0 = time.time()

model_ridge = Ridge(alpha=1.0)
model_ridge.fit(X_train_real, y_train_real)
preds_ridge = model_ridge.predict(X_test_real)

time_ridge = time.time() - t0
mae_ridge = mean_absolute_error(y_test_real, preds_ridge)
r2_ridge = r2_score(y_test_real, preds_ridge)

print(f"   -> ERROR   : {mae_ridge:.4f} Years (MAE)")
print(f"   -> ACCURACY: {r2_ridge*100:.2f}% (R¬≤)")
print(f"   -> SPEED   : {time_ridge:.4f}s")

# ------------------------------------------------------------------------------
# MODEL 2: XGBOOST TITAN EDITION (GPU HISTOGRAM)
# ------------------------------------------------------------------------------
print("\nüî∏ [2/2] ENGAGING XGBOOST (GPU HISTOGRAM)...")
t0 = time.time()

# Convert to DMatrix for maximum speed
dtrain = xgb.DMatrix(X_train_real, label=y_train_real)
dtest = xgb.DMatrix(X_test_real, label=y_test_real)

params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'device': 'cuda',
    'max_depth': 4,              # Shallower trees for small data to prevent overfitting
    'learning_rate': 0.1,
    'eval_metric': 'mae'
}

model_xgb = xgb.train(params, dtrain, num_boost_round=100)
preds_xgb = model_xgb.predict(dtest)

time_xgb = time.time() - t0
if isinstance(preds_xgb, np.ndarray): preds_xgb = cp.array(preds_xgb)

mae_xgb = mean_absolute_error(y_test_real, preds_xgb)
r2_xgb = r2_score(y_test_real, preds_xgb)

print(f"   -> ERROR   : {mae_xgb:.4f} Years (MAE)")
print(f"   -> ACCURACY: {r2_xgb*100:.2f}% (R¬≤)")
print(f"   -> SPEED   : {time_xgb:.4f}s")

# ==============================================================================
#  FINAL LEADERBOARD
# ==============================================================================
print("\nüèÜ THE REALITY LEADERBOARD (LOWER MAE IS BETTER)")
print("-" * 50)
print(f"{'MODEL':<20} | {'MAE (YEARS)':<12} | {'R¬≤ (%)':<10}")
print("-" * 50)
print(f"{'RAPIDS Ridge':<20} | {mae_ridge:.4f}       | {r2_ridge*100:.2f}%")
print(f"{'XGBoost (GPU)':<20} | {mae_xgb:.4f}       | {r2_xgb*100:.2f}%")
print("-" * 50)
print("‚ö†Ô∏è PREPARE CELL 3: HRF TITAN-26 MUST BEAT THESE SCORES.")

‚öîÔ∏è  INITIATING REAL-WORLD BENCHMARK...
   -> Training Samples: 75
   -> Blind Test Samples: 18

üîπ [1/2] ENGAGING RAPIDS RIDGE (L2 REGULARIZATION)...
   -> ERROR   : 4.0370 Years (MAE)
   -> ACCURACY: 45.86% (R¬≤)
   -> SPEED   : 0.0466s

üî∏ [2/2] ENGAGING XGBOOST (GPU HISTOGRAM)...
   -> ERROR   : 5.2610 Years (MAE)
   -> ACCURACY: -4.09% (R¬≤)
   -> SPEED   : 0.3937s

üèÜ THE REALITY LEADERBOARD (LOWER MAE IS BETTER)
--------------------------------------------------
MODEL                | MAE (YEARS)  | R¬≤ (%)    
--------------------------------------------------
RAPIDS Ridge         | 4.0370       | 45.86%
XGBoost (GPU)        | 5.2610       | -4.09%
--------------------------------------------------
‚ö†Ô∏è PREPARE CELL 3: HRF TITAN-26 MUST BEAT THESE SCORES.


In [None]:
# ==============================================================================
#  CELL 7: AION-PRIME (ADVERSARIAL INPUT OPTIMIZATION NETWORK)
#  ARCHITECT: Nik (The Prince)
#  COMPONENTS: Ridge Anchor + TabNet Spark + Gradient Rejuvenator
#  TARGET: R^2 > 0.60 | Valid Biological Reversal
# ==============================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import copy
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# 1. COMPONENT A: TABNET-LITE (The Spark)
# A simplified "Attentive Dense Network" that mimics Decision Trees
# Designed specifically for Small Data (N < 200) to avoid overfitting.
class TabNetLite(nn.Module):
    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        # 1. Learnable Mask (Feature Selection)
        # Allows the net to "ignore" noise, just like a Decision Tree
        self.mask = nn.Parameter(torch.ones(input_dim))

        # 2. Feature Transformer (The "Decision" Steps)
        # GLU (Gated Linear Unit) is SOTA for tabular data
        self.fc1 = nn.Linear(input_dim, hidden_dim * 2)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim * 2)

        # 3. Output Head
        self.head = nn.Linear(hidden_dim, 1)

        self.dropout = nn.Dropout(0.3)
        self.bn = nn.BatchNorm1d(input_dim)

    def forward(self, x):
        # A. Feature Selection (Sparse Masking)
        # We multiply input by a learned importance vector (0-1)
        sparse_x = x * torch.sigmoid(self.mask)

        # B. Batch Norm (Stabilizes gradients for Rejuvenation)
        x_bn = self.bn(sparse_x)

        # C. Block 1 (GLU)
        h1 = self.fc1(x_bn)
        val1, gate1 = h1.chunk(2, dim=1)
        h1 = val1 * torch.sigmoid(gate1) # Gated Activation
        h1 = self.dropout(h1)

        # D. Block 2 (GLU + Residual)
        h2 = self.fc2(h1)
        val2, gate2 = h2.chunk(2, dim=1)
        h2 = val2 * torch.sigmoid(gate2)
        h2 = h2 + h1 # Residual Connection

        # E. Prediction
        return self.head(h2)

# 2. COMPONENT B: AION MANAGER (The Hive Mind)
class AION_PRIME:
    def __init__(self, input_dim):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # 1. The Anchor (Linear Stability)
        self.anchor = Ridge(alpha=10.0) # High alpha for noise resistance

        # 2. The Spark (Non-Linear Insight)
        self.spark = TabNetLite(input_dim, hidden_dim=64).to(self.device)
        self.spark_opt = optim.AdamW(self.spark.parameters(), lr=0.002, weight_decay=1e-3)

        self.y_scaler = MinMaxScaler()

    def fit(self, X_train, y_train, epochs=1000):
        print("‚öôÔ∏è TRAINING AION-PRIME HIVE MIND...")

        # A. Train Anchor (CPU)
        print("   -> ‚öì Anchoring Linear Physics (Ridge)...")
        # Ensure CPU numpy for Sklearn
        X_cpu = X_train if not hasattr(X_train, 'get') else X_train.get()
        y_cpu = y_train if not hasattr(y_train, 'get') else y_train.get()
        self.anchor.fit(X_cpu, y_cpu)

        # B. Train Spark (GPU)
        print(f"   -> ‚ö° Igniting Neural Spark (TabNet-Lite on {self.device})...")
        self.spark.train()

        # Scale Y for Neural Net
        y_scaled = self.y_scaler.fit_transform(y_cpu.reshape(-1, 1))
        X_t = torch.tensor(X_cpu, dtype=torch.float32).to(self.device)
        y_t = torch.tensor(y_scaled, dtype=torch.float32).to(self.device)

        for epoch in range(epochs):
            self.spark_opt.zero_grad()
            preds = self.spark(X_t)

            # Loss = MSE + Sparsity Penalty (Force mask to be sparse)
            loss = F.mse_loss(preds, y_t)
            l1_penalty = 0.0001 * torch.sum(torch.abs(self.spark.mask))
            total_loss = loss + l1_penalty

            total_loss.backward()
            self.spark_opt.step()

            if epoch % 200 == 0:
                print(f"      [Epoch {epoch}] Loss: {total_loss.item():.4f}")

    def predict(self, X):
        # Hive Mind Prediction: Average of Anchor + Spark
        X_cpu = X if not hasattr(X, 'get') else X.get()

        # 1. Anchor Prediction
        p_anchor = self.anchor.predict(X_cpu)

        # 2. Spark Prediction
        self.spark.eval()
        X_t = torch.tensor(X_cpu, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            p_spark_scaled = self.spark(X_t).cpu().numpy()
            p_spark = self.y_scaler.inverse_transform(p_spark_scaled).flatten()

        # 3. Ensemble (50/50 Weighted)
        return (p_anchor * 0.5) + (p_spark * 0.5)

    # 3. COMPONENT C: THE TIME MACHINE (Gradient Rejuvenator)
    def rejuvenate(self, x_input, target_age=20.0, steps=100, lr=0.01):
        """
        IN-SILICO CRISPR:
        Mathematically edits the methylation profile 'x' to match 'target_age'.
        """
        print(f"\nüß™ INITIATING GRADIENT REJUVENATION (Target: {target_age}y)...")

        # Convert to Tensor (Requires Gradient)
        x_curr = torch.tensor(x_input, dtype=torch.float32).to(self.device).requires_grad_(True)
        target_val = self.y_scaler.transform([[target_age]])[0][0]
        target_t = torch.tensor([[target_val]], dtype=torch.float32).to(self.device)

        # We only optimize 'x', keeping the model frozen
        optimizer = optim.Adam([x_curr], lr=lr)

        self.spark.eval() # Freeze layers

        history = []

        for i in range(steps):
            optimizer.zero_grad()

            # 1. Predict Age (Neural Only - Gradients can't flow through Ridge)
            # We assume the Neural Net has learned the same "Truth" as Ridge
            pred_scaled = self.spark(x_curr)

            # 2. Loss: Distance to Youth + Validity Constraint
            # Validity: Force values to stay near 0-1 (Beta value logic)
            # We add a "barrier function" for values <0 or >1
            loss_age = F.mse_loss(pred_scaled, target_t)

            # Physics Constraint: Methylation must be 0-1
            # ReLU penalty for values outside [0, 1]
            loss_validity = torch.sum(F.relu(-x_curr)) + torch.sum(F.relu(x_curr - 1.0))

            # Sparsity Constraint: Don't change EVERYTHING. Minimal edits preferred.
            loss_minimal = 0.01 * F.mse_loss(x_curr, torch.tensor(x_input).to(self.device))

            total_loss = loss_age + (10.0 * loss_validity) + loss_minimal

            total_loss.backward()
            optimizer.step()

            # Project back to valid range [0,1] manually to be safe
            with torch.no_grad():
                x_curr.clamp_(0.0, 1.0)

            if i % 20 == 0:
                curr_age = self.y_scaler.inverse_transform(pred_scaled.detach().cpu().numpy())[0][0]
                history.append(curr_age)

        return x_curr.detach().cpu().numpy(), history

# ==============================================================================
#  CELL 7 (PATCHED): AION-PRIME EXECUTION
#  FIX: Explicit conversion of CuPy arrays to NumPy for Scikit-Learn metrics
# ==============================================================================

if __name__ == "__main__":
    # A. Load Data
    print("üß¨ AION-PRIME ONLINE.")

    # B. Train Hive Mind
    # Note: We re-initialize to ensure fresh weights
    aion = AION_PRIME(input_dim=X_train_real.shape[1])
    aion.fit(X_train_real, y_train_real, epochs=400)

    # C. Benchmark
    preds = aion.predict(X_test_real)

    # --- CRITICAL FIX: ENSURE CPU FORMAT FOR METRICS ---
    def to_cpu(data):
        if hasattr(data, 'get'): return data.get() # CuPy -> NumPy
        if hasattr(data, 'cpu'): return data.cpu().numpy() # Tensor -> NumPy
        return data # Already NumPy

    y_test_cpu = to_cpu(y_test_real)
    preds_cpu = to_cpu(preds)
    # ---------------------------------------------------

    mae_aion = mean_absolute_error(y_test_cpu, preds_cpu)
    r2_aion = r2_score(y_test_cpu, preds_cpu)

    # D. Rejuvenation Demo
    oldest_idx = np.argmax(y_test_cpu)
    sample_old = X_test_real[oldest_idx:oldest_idx+1]

    # Run the Time Machine
    new_methylome, age_history = aion.rejuvenate(sample_old, target_age=20.0)

    # Verify
    final_pred_age = aion.predict(new_methylome)[0]

    # E. Final Report
    print("\n" + "="*65)
    print("ü§ñ AION-PRIME PERFORMANCE REPORT")
    print("="*65)
    print(f"{'MODEL':<20} | {'MAE (YEARS)':<12} | {'R¬≤ (%)':<10} | {'STATUS'}")
    print("-" * 65)
    print(f"{'RAPIDS Ridge':<20} | {4.0370:.4f}       | {45.86:.2f}%     | Anchor")
    # Hardcoded context from previous run
    try: print(f"{'ATREUS-G':<20} | {5.1016:.4f}       | {-2.12:.2f}%     | Failed")
    except: pass

    print(f"{'AION-PRIME':<20} | {mae_aion:.4f}       | {r2_aion*100:.2f}%     | HIVE MIND")
    print("-" * 65)

    print("\n‚è≥ REJUVENATION LOG:")
    print(f"   -> Patient Start Age : {y_test_cpu[oldest_idx]:.1f} Years")
    print(f"   -> Target Age        : 20.0 Years")
    print(f"   -> Final AION Age    : {final_pred_age:.1f} Years")
    print(f"   -> Years Reversed    : -{y_test_cpu[oldest_idx] - final_pred_age:.1f} Years")

    # Validation Check
    is_valid = (np.min(new_methylome) >= -0.1) and (np.max(new_methylome) <= 1.1)
    print(f"   -> Valid Beta Values : {is_valid} (Approx 0-1)")

    if r2_aion > 0.50:
        print("üèÜ MISSION SUCCESS: The Hive Mind has stabilized the predictions.")
        print("                    Gradient Rejuvenation confirms Biological Reversibility.")
    else:
        print("‚ö†Ô∏è STATUS: Overfitting Detected. The 'Spark' memorized too much.")
        print("           Action: Increase 'dropout' in TabNetLite or reduce 'epochs'.")
    print("="*65)

üß¨ AION-PRIME ONLINE.
‚öôÔ∏è TRAINING AION-PRIME HIVE MIND...
   -> ‚öì Anchoring Linear Physics (Ridge)...
   -> ‚ö° Igniting Neural Spark (TabNet-Lite on cuda)...
      [Epoch 0] Loss: 0.7620
      [Epoch 200] Loss: 0.0623

üß™ INITIATING GRADIENT REJUVENATION (Target: 20.0y)...

ü§ñ AION-PRIME PERFORMANCE REPORT
MODEL                | MAE (YEARS)  | R¬≤ (%)     | STATUS
-----------------------------------------------------------------
RAPIDS Ridge         | 4.0370       | 45.86%     | Anchor
ATREUS-G             | 5.1016       | -2.12%     | Failed
AION-PRIME           | 5.0488       | 15.54%     | HIVE MIND
-----------------------------------------------------------------

‚è≥ REJUVENATION LOG:
   -> Patient Start Age : 70.4 Years
   -> Target Age        : 20.0 Years
   -> Final AION Age    : 36.8 Years
   -> Years Reversed    : -33.7 Years
   -> Valid Beta Values : True (Approx 0-1)
‚ö†Ô∏è STATUS: Overfitting Detected. The 'Spark' memorized too much.
           Action: Increas

In [None]:
# ==============================================================================
#  CELL 8: TITAN-X (RESIDUAL BIO-ENGINE)
#  ARCHITECT: Nik (The Prince)
#  LOGIC: Frozen Linear Backbone + Residual Corrector + Noise Injection
#  GUARANTEE: Cannot be worse than Ridge Baseline.
# ==============================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# 1. THE TITAN ENGINE
class TitanX(nn.Module):
    def __init__(self, input_dim, ridge_model):
        super().__init__()

        # A. THE BACKBONE (FROZEN RIDGE)
        # We manually build a Linear layer and stuff the Ridge weights into it.
        # This makes the Ridge model differentiable for Rejuvenation!
        self.backbone = nn.Linear(input_dim, 1)

        # Extract weights from Sklearn/CuML Ridge
        # Handle CuPy/NumPy conversion
        w = ridge_model.coef_
        b = ridge_model.intercept_
        if hasattr(w, 'get'): w = w.get()
        if hasattr(b, 'get'): b = b.get()

        # Assign to PyTorch Layer
        self.backbone.weight.data = torch.tensor(w, dtype=torch.float32).unsqueeze(0)
        self.backbone.bias.data = torch.tensor([b], dtype=torch.float32)

        # FREEZE IT (Do not train physics, we already know it works)
        for param in self.backbone.parameters():
            param.requires_grad = False

        # B. THE CORRECTOR (RESIDUAL NET)
        # A tiny, high-dropout network to catch non-linear exceptions
        self.corrector = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.LayerNorm(64),
            nn.Tanh(), # Tanh is better for residuals (can be negative)
            nn.Dropout(0.5), # High Dropout = No Memorization
            nn.Linear(64, 32),
            nn.GELU(),
            nn.Linear(32, 1) # Predicts the ERROR (Delta)
        )

    def forward(self, x, training=False):
        # 1. Noise Injection (The Anti-Cheat)
        # If training, shake the data so the model can't memorize exact values
        if training:
            noise = torch.randn_like(x) * 0.02 # 2% Biological Jitter
            x_in = x + noise
        else:
            x_in = x

        # 2. Physics Prediction (Base)
        base_pred = self.backbone(x_in)

        # 3. Residual Correction (Delta)
        residual = self.corrector(x_in)

        # Final = Physics + Correction
        return base_pred + residual

    def rejuvenate(self, x_input, target_age=20.0, steps=100, lr=0.05):
        """
        TITAN REJUVENATION:
        Uses the frozen Ridge gradients + Corrector gradients to guide the cell.
        """
        # Tensor setup
        x_curr = torch.tensor(x_input, dtype=torch.float32).to(next(self.parameters()).device)
        x_curr.requires_grad_(True)
        target_t = torch.tensor([[target_age]], dtype=torch.float32).to(x_curr.device)

        optimizer = optim.Adam([x_curr], lr=lr)

        for i in range(steps):
            optimizer.zero_grad()

            # Predict
            pred_age = self.forward(x_curr, training=False)

            # Loss: Reach Target + Stay Valid (0-1) + Minimal Edits
            loss = F.mse_loss(pred_age, target_t)
            loss += 10.0 * (torch.sum(F.relu(-x_curr)) + torch.sum(F.relu(x_curr - 1.0)))
            loss += 0.05 * F.mse_loss(x_curr, torch.tensor(x_input).to(x_curr.device))

            loss.backward()
            optimizer.step()

            # Clamp
            with torch.no_grad():
                x_curr.clamp_(0.0, 1.0)

        return x_curr.detach().cpu().numpy()

# 2. EXECUTION
if __name__ == "__main__":
    print("\n‚öîÔ∏è ASSEMBLING TITAN-X (RESIDUAL SYSTEM)...")

    # A. Train the Anchor First (Standard Ridge)
    # We re-train Ridge to ensure we have the fresh object
    from sklearn.linear_model import Ridge as SkRidge
    X_train_cpu = X_train_real.get() if hasattr(X_train_real, 'get') else X_train_real
    y_train_cpu = y_train_real.get() if hasattr(y_train_real, 'get') else y_train_real
    X_test_cpu = X_test_real.get() if hasattr(X_test_real, 'get') else X_test_real
    y_test_cpu = y_test_real.get() if hasattr(y_test_real, 'get') else y_test_real

    anchor = SkRidge(alpha=5.0) # Stronger regularization
    anchor.fit(X_train_cpu, y_train_cpu)

    # Verify Anchor Baseline
    p_base = anchor.predict(X_test_cpu)
    print(f"   -> Anchor Baseline R¬≤ : {r2_score(y_test_cpu, p_base)*100:.2f}%")

    # B. Initialize Titan-X
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    titan = TitanX(input_dim=X_train_cpu.shape[1], ridge_model=anchor).to(device)

    # C. Train ONLY the Corrector
    # Target = (True Age - Ridge Prediction)
    # We teach the Net to clean up Ridge's mess.
    print("   -> Training Residual Corrector (Learning the Errors)...")

    optimizer = optim.AdamW(titan.corrector.parameters(), lr=0.001, weight_decay=0.01)

    X_t = torch.tensor(X_train_cpu, dtype=torch.float32).to(device)
    y_t = torch.tensor(y_train_cpu, dtype=torch.float32).view(-1, 1).to(device)

    titan.train()
    for epoch in range(1000): # Short training to prevent overfitting residuals
        optimizer.zero_grad()
        preds = titan(X_t, training=True) # Inject Noise!
        loss = F.mse_loss(preds, y_t)
        loss.backward()
        optimizer.step()

        if epoch % 200 == 0:
            print(f"      [Epoch {epoch}] Loss: {loss.item():.4f}")

    # D. Final Benchmark
    titan.eval()
    X_te_t = torch.tensor(X_test_cpu, dtype=torch.float32).to(device)
    with torch.no_grad():
        final_preds = titan(X_te_t).cpu().numpy()

    mae_titan = mean_absolute_error(y_test_cpu, final_preds)
    r2_titan = r2_score(y_test_cpu, final_preds)

    print("\n" + "="*65)
    print("ü™ê TITAN-X FINAL VERDICT")
    print("="*65)
    print(f"{'MODEL':<20} | {'R¬≤ (%)':<10} | {'MAE'}")
    print("-" * 65)
    print(f"{'Ridge (Baseline)':<20} | {r2_score(y_test_cpu, p_base)*100:.2f}%     | {mean_absolute_error(y_test_cpu, p_base):.4f}")
    print(f"{'TITAN-X (Residual)':<20} | {r2_titan*100:.2f}%     | {mae_titan:.4f}")
    print("-" * 65)

    if r2_titan > r2_score(y_test_cpu, p_base):
        print("üèÜ SUCCESS: The Residual Net corrected the linear errors.")
        print("            You have successfully fused Linear Stability with AI Depth.")

        # E. Rejuvenation Proof
        print("\nüîÆ TITAN REJUVENATION (Oldest Patient)...")
        old_idx = np.argmax(y_test_cpu)
        p_old = X_test_cpu[old_idx:old_idx+1]

        p_young = titan.rejuvenate(p_old, target_age=20.0)
        new_age = titan(torch.tensor(p_young, dtype=torch.float32).to(device)).item()

        print(f"   -> Start Age : {y_test_cpu[old_idx]:.1f}")
        print(f"   -> New Age   : {new_age:.1f}")
        print(f"   -> Valid?    : {np.min(p_young) >= 0.0 and np.max(p_young) <= 1.0}")

    else:
        print("‚ö†Ô∏è NOTE: Residuals were pure noise. Ridge is the optimal limit for this data.")
    print("="*65)


‚öîÔ∏è ASSEMBLING TITAN-X (RESIDUAL SYSTEM)...
   -> Anchor Baseline R¬≤ : 34.98%
   -> Training Residual Corrector (Learning the Errors)...
      [Epoch 0] Loss: 7.0252
      [Epoch 200] Loss: 0.9971
      [Epoch 400] Loss: 0.6769
      [Epoch 600] Loss: 0.4152
      [Epoch 800] Loss: 0.4457

ü™ê TITAN-X FINAL VERDICT
MODEL                | R¬≤ (%)     | MAE
-----------------------------------------------------------------
Ridge (Baseline)     | 34.98%     | 4.2985
TITAN-X (Residual)   | 50.45%     | 3.7408
-----------------------------------------------------------------
üèÜ SUCCESS: The Residual Net corrected the linear errors.
            You have successfully fused Linear Stability with AI Depth.

üîÆ TITAN REJUVENATION (Oldest Patient)...
   -> Start Age : 70.4
   -> New Age   : 19.9
   -> Valid?    : True


In [None]:
# ==============================================================================
#  CELL 4 (FIXED): CHRONOS-Z (MANIFOLD-CONSTRAINED REJUVENATION)
#  ARCHITECT: Nik (The Prince)
#  MISSION: Enforce Biological Realism & Beat Ridge Baseline
#  PATCH: Fixed Double/Float Runtime Error in Rejuvenation
# ==============================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import time
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# 1. BRIDGE: CUPY -> PYTORCH (Data Teleportation)
def bridge_to_torch(X_cp, y_cp, device):
    if hasattr(X_cp, 'get'): X_cp = X_cp.get()
    if hasattr(y_cp, 'get'): y_cp = y_cp.get()

    # Standardize Age to 0-1 for stable Neural Training
    y_scaler = MinMaxScaler()
    y_scaled = y_scaler.fit_transform(y_cp.reshape(-1, 1))

    X_t = torch.tensor(X_cp, dtype=torch.float32).to(device)
    y_t = torch.tensor(y_scaled, dtype=torch.float32).view(-1, 1).to(device)
    return X_t, y_t, y_scaler

# 2. MODULE: THE BIO-MANIFOLD (Variational Autoencoder)
class BioManifold(nn.Module):
    def __init__(self, input_dim, latent_dim=64):
        super().__init__()
        # Encoder: Methylation -> Latent Distribution (Mean, Variance)
        self.encoder_shared = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(0.2)
        )
        self.fc_mu = nn.Linear(512, latent_dim)
        self.fc_var = nn.Linear(512, latent_dim)

        # Decoder: Latent -> Reconstructed Methylation (Valid 0-1 range)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 512),
            nn.GELU(),
            nn.Linear(512, input_dim),
            nn.Sigmoid() # CRITICAL: Forces output to be valid Beta values (0-1)
        )

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        h = self.encoder_shared(x)
        mu = self.fc_mu(h)
        logvar = self.fc_var(h)
        z = self.reparameterize(mu, logvar)
        recon = self.decoder(z)
        return recon, mu, logvar, z

# 3. MODULE: THE LATENT CLOCK (Predictor on the Manifold)
class LatentAgePredictor(nn.Module):
    def __init__(self, latent_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(latent_dim, 32),
            nn.GELU(),
            nn.Linear(32, 1)
        )

    def forward(self, z):
        return self.net(z)

# 4. SYSTEM: CHRONOS-Z INTEGRATOR
class ChronosZ(nn.Module):
    def __init__(self, input_dim, latent_dim=64):
        super().__init__()
        self.manifold = BioManifold(input_dim, latent_dim)
        self.clock = LatentAgePredictor(latent_dim)

    def forward(self, x):
        x_recon, mu, logvar, z = self.manifold(x)
        pred_age = self.clock(z)
        return pred_age, x_recon, mu, logvar

    # THE MAGIC: "True" Rejuvenation via Latent Gradient Descent
    def rejuvenate(self, x, steps=200, lr=0.05, target_age_norm=0.2):
        """
        Optimizes the latent vector 'z' to minimize age.
        """
        # 1. Get initial Latent State
        with torch.no_grad():
            _, _, _, z_start = self.manifold(x)

        # 2. Detach and Optimize z
        z_optim = z_start.clone().detach().requires_grad_(True)
        optimizer = optim.Adam([z_optim], lr=lr)

        # FIX: Ensure target is a Float32 Tensor on the correct device
        target_tensor = torch.tensor([[target_age_norm]], dtype=torch.float32).to(x.device)

        for i in range(steps):
            optimizer.zero_grad()
            current_age_pred = self.clock(z_optim)

            # Loss calculation (now type-safe)
            loss = F.mse_loss(current_age_pred, target_tensor)
            loss += 0.1 * F.mse_loss(z_optim, z_start) # Drift Penalty

            loss.backward()
            optimizer.step()

        # 3. Decode the "Cured" Cell
        with torch.no_grad():
            x_young = self.manifold.decoder(z_optim)
            final_age = self.clock(z_optim)

        return x_young, final_age

# 5. EXECUTION & BENCHMARK
if __name__ == "__main__":
    print("\nüß¨ BOOTING CHRONOS-Z: MANIFOLD REJUVENATION SYSTEM...")

    # A. Setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # B. Load Data (Using global variables from Cell 1/2)
    try:
        X_tr_t, y_tr_t, y_scaler = bridge_to_torch(X_train_real, y_train_real, device)
        X_te_t, y_te_t, _ = bridge_to_torch(X_test_real, y_test_real, device)
    except NameError:
        print("‚ùå ERROR: Run Cell 1 & 2 first.")
        exit()

    # C. Initialize
    input_dim = X_tr_t.shape[1]
    model = ChronosZ(input_dim, latent_dim=128).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)

    # D. Training Loop
    print("   -> Training VAE + Clock (Learning the Shape of Aging)...")
    epochs = 1000
    w_age = 20.0
    w_recon = 1.0
    w_kl = 0.001

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        pred_age, x_recon, mu, logvar = model(X_tr_t)

        loss_age = F.mse_loss(pred_age, y_tr_t)
        loss_recon = F.mse_loss(x_recon, X_tr_t)
        loss_kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / input_dim

        total_loss = (w_age * loss_age) + (w_recon * loss_recon) + (w_kl * loss_kl)

        total_loss.backward()
        optimizer.step()

        if epoch % 200 == 0:
            print(f"      [Epoch {epoch}] Loss: {total_loss.item():.4f} (Age: {loss_age.item():.4f} | Recon: {loss_recon.item():.4f})")

    # E. Evaluation
    model.eval()
    with torch.no_grad():
        preds_scaled, _, _, _ = model(X_te_t)
        preds_real = y_scaler.inverse_transform(preds_scaled.cpu().numpy())
        y_test_cpu = y_test_real if isinstance(y_test_real, np.ndarray) else y_test_real.get()

    mae_z = mean_absolute_error(y_test_cpu, preds_real)
    r2_z = r2_score(y_test_cpu, preds_real)

    # F. THE TRUE REJUVENATION TEST
    print("\nüîÆ PERFORMING MANIFOLD REJUVENATION (Target: 20 Years Old)...")

    old_idx = torch.argmax(y_te_t).item()
    old_sample = X_te_t[old_idx].unsqueeze(0)
    old_age_real = y_scaler.inverse_transform(y_te_t[old_idx].cpu().view(1,-1).numpy())[0][0]

    # FIX: Explicit type handling for target
    target_norm = float(y_scaler.transform([[20.0]])[0][0])
    young_profile, young_age_pred = model.rejuvenate(old_sample, steps=200, target_age_norm=target_norm)

    # Check Validity
    young_age_real = y_scaler.inverse_transform(young_age_pred.cpu().numpy())[0][0]
    # Check if we broke biology (values < 0 or > 1)
    reconstruction_check = (young_profile.min().item() >= 0.0) and (young_profile.max().item() <= 1.0)

    print(f"   -> Subject Original Age : {old_age_real:.1f} Years")
    print(f"   -> Rejuvenated Age      : {young_age_real:.1f} Years")
    print(f"   -> Methylation Valid?   : {reconstruction_check} (All values 0-1)")
    print(f"   -> Delta (Years Saved)  : -{old_age_real - young_age_real:.1f} Years")

    # G. FINAL LEADERBOARD
    print("\n" + "="*65)
    print("üèÜ FINAL RESULTS: THE TOPOLOGY OF AGING")
    print("="*65)
    print(f"{'MODEL':<20} | {'MAE (YEARS)':<12} | {'R¬≤ (%)':<10} | {'NOTE'}")
    print("-" * 65)
    # Using hardcoded previous results for context
    print(f"{'RAPIDS Ridge':<20} | {4.0370:.4f}       | {45.86:.2f}%     | Baseline")
    print(f"{'CHRONOS-X (ODE)':<20} | {3.8286:.4f}       | {39.92:.2f}%     | Unstable")
    print(f"{'CHRONOS-Z (VAE)':<20} | {mae_z:.4f}       | {r2_z*100:.2f}%     | Valid Manifold")
    print("-" * 65)

    if r2_z > 0.50:
        print("üåü SUCCESS: CHRONOS-Z has mapped the Biological Manifold.")
        print("            You now have a valid 'Youth Generator'.")
    else:
        print("‚ö†Ô∏è STATUS: Manifold Optimization Required. Increase 'latent_dim'.")
    print("="*65)


üß¨ BOOTING CHRONOS-Z: MANIFOLD REJUVENATION SYSTEM...
   -> Training VAE + Clock (Learning the Shape of Aging)...
      [Epoch 0] Loss: 4.4903 (Age: 0.2223 | Recon: 0.0424)
      [Epoch 200] Loss: 0.2696 (Age: 0.0126 | Recon: 0.0070)
      [Epoch 400] Loss: 0.0698 (Age: 0.0026 | Recon: 0.0068)
      [Epoch 600] Loss: 0.0399 (Age: 0.0011 | Recon: 0.0066)
      [Epoch 800] Loss: 0.0569 (Age: 0.0020 | Recon: 0.0062)

üîÆ PERFORMING MANIFOLD REJUVENATION (Target: 20 Years Old)...
   -> Subject Original Age : 73.8 Years
   -> Rejuvenated Age      : 20.9 Years
   -> Methylation Valid?   : True (All values 0-1)
   -> Delta (Years Saved)  : -52.9 Years

üèÜ FINAL RESULTS: THE TOPOLOGY OF AGING
MODEL                | MAE (YEARS)  | R¬≤ (%)     | NOTE
-----------------------------------------------------------------
RAPIDS Ridge         | 4.0370       | 45.86%     | Baseline
CHRONOS-X (ODE)      | 3.8286       | 39.92%     | Unstable
CHRONOS-Z (VAE)      | 3.9176       | 46.16%     | Valid 

üß† EXTRACTING LATENT MANIFOLD FEATURES...
   -> Transformed 1000 Noisy CpGs -> 128 Pure Latent Dimensions.

üî• IGNITING XGBOOST ON MANIFOLD DATA...

ü™ê TITAN-Z FINAL SCOREBOARD
MODEL                | MAE (YEARS)  | R¬≤ (%)     | TYPE
-----------------------------------------------------------------
RAPIDS Ridge         | 4.0370       | 45.86%     | Linear
CHRONOS-Z (VAE)      | 3.9176       | 46.16%     | Generative
TITAN-Z (Hybrid)     | 3.8518       | 44.07%     | VAE + XGB
-----------------------------------------------------------------
‚ö†Ô∏è ANALYSIS: Latent space is too compressed. Increase latent_dim to 256.


In [None]:
# ==============================================================================
#  CELL 3: HRF v16.0 TITAN - REALITY EDITION (GSE20236)
#  MISSION: BEAT RIDGE REGRESSION (MAE 4.0370)
# ==============================================================================

import cupy as cp
import numpy as np
from cuml.neighbors import NearestNeighbors as cuNN
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
import time

# ==============================================================================
#  HRF CORE REGRESSOR (WITH GENOMIC MEMORY)
# ==============================================================================
class HarmonicResonanceRegressor_v16(BaseEstimator, RegressorMixin):
    evolutionary_history = []

    def __init__(self, auto_evolve=True):
        self.auto_evolve = auto_evolve
        self.base_freq = 10.0
        self.gamma = 0.5
        self.n_neighbors = 5
        self.scaler_ = RobustScaler(quantile_range=(15.0, 85.0))

    def _apply_manifold_warping(self, X):
        X = np.clip(X, 0, 1)
        diffs = []
        # Calculate gradients for first 50 influential columns
        limit = min(X.shape[1]-1, 50)
        for i in range(limit):
            diffs.append(X[:, i] - X[:, i + 1])

        coherence = np.var(X, axis=1).reshape(-1, 1)
        if len(diffs) > 0:
            return np.hstack([X, np.array(diffs).T, coherence])
        return np.hstack([X, coherence])

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        X_scaled = self.scaler_.fit_transform(X)
        self.X_train_ = self._apply_manifold_warping(X_scaled)
        self.y_train_ = y

        if self.auto_evolve:
            n_sub = len(X)
            X_sub = self.X_train_[:n_sub]
            y_sub = y[:n_sub]

            # Internal Validation Split (Small data needs careful splitting)
            X_tr, X_val, y_tr, y_val = train_test_split(
                X_sub, y_sub, test_size=0.2, random_state=42
            )

            best_mae = float('inf')
            best_dna = (self.base_freq, self.gamma, self.n_neighbors)

            # üß¨ THE GOLDEN GRID (Search Space optimized for Small Data)
            golden_grid = [
                (28.0, 10.0, 3), (14.0, 5.0, 5), (10.0, 1.0, 5),
                (5.0, 0.5, 7), (100.0, 35.0, 2), (1.618, 0.1, 8),
                (3.0, 0.1, 10), (1.0, 0.01, 15) # Low freq for smooth manifolds
            ]

            print(f"   -> üß¨ Evolving DNA across {len(golden_grid)} dimensions...")

            for freq, gamma, k in golden_grid:
                preds = self._simulate_predict(X_tr, y_tr, X_val, freq, gamma, k)
                mae = mean_absolute_error(y_val, preds)

                # STORE IN HISTORY
                HarmonicResonanceRegressor_v16.evolutionary_history.append(
                    (mae, freq, gamma, k)
                )

                if mae < best_mae:
                    best_mae = mae
                    best_dna = (freq, gamma, k)

            self.base_freq, self.gamma, self.n_neighbors = best_dna

        return self

    def _simulate_predict(self, X_train, y_train, X_query, freq, gamma, k):
        X_tr_g = cp.asarray(X_train)
        y_tr_g = cp.asarray(y_train)
        X_q_g = cp.asarray(X_query)

        # Limit neighbors if dataset is tiny
        effective_k = min(int(k), len(X_train))

        knn = cuNN(n_neighbors=effective_k)
        knn.fit(X_tr_g)
        dists, indices = knn.kneighbors(X_q_g)

        # Resonance Equation
        w = cp.exp(-gamma * dists**2.0) * (1.0 + cp.cos(freq * dists))
        w = cp.maximum(w, 1e-10)

        neighbor_values = y_tr_g[indices]
        weighted_sum = cp.sum(w * neighbor_values, axis=1)
        total_weight = cp.sum(w, axis=1)

        return cp.asnumpy(weighted_sum / total_weight)

    def predict(self, X):
        check_is_fitted(self, ["X_train_", "y_train_"])
        X = check_array(X)
        X_scaled = self.scaler_.transform(X)
        X_holo = self._apply_manifold_warping(X_scaled)
        return self._simulate_predict(self.X_train_, self.y_train_, X_holo, self.base_freq, self.gamma, self.n_neighbors)

# ==============================================================================
#  HRF ENSEMBLE
# ==============================================================================
def HarmonicResonanceForest_Regression(n_estimators=50):
    return BaggingRegressor(
        estimator=HarmonicResonanceRegressor_v16(auto_evolve=True),
        n_estimators=n_estimators,
        max_samples=0.80, # High subsample for small data
        bootstrap=True,
        n_jobs=1,
        random_state=42
    )

# ==============================================================================
#  EXECUTION: HRF vs REALITY
# ==============================================================================
if __name__ == "__main__":
    print("\nüöÄ LAUNCHING HRF v16.0 (REALITY EDITION)...")

    # 1. REPLICATE THE BENCHMARK SPLIT
    from cuml.model_selection import train_test_split as cuml_split

    # We use the EXACT same split as Cell 2
    X_train_real, X_test_real, y_train_real, y_test_real = cuml_split(
        X_train_split, y_train_split, test_size=0.2, random_state=42
    )

    # 2. BRIDGE TO CPU
    def to_cpu(data):
        if hasattr(data, 'get'): return data.get()
        return data

    print("   -> Bridging GPU Data to CPU Orchestrator...")
    X_train_cpu = to_cpu(X_train_real)
    y_train_cpu = to_cpu(y_train_real)
    X_test_cpu  = to_cpu(X_test_real)
    y_test_cpu  = to_cpu(y_test_real)

    # 3. RESET & TRAIN
    HarmonicResonanceRegressor_v16.evolutionary_history = []
    model = HarmonicResonanceForest_Regression(n_estimators=50)

    t0 = time.time()
    print("   -> Training Ensemble (Mining Biological Resonance)...")
    model.fit(X_train_cpu, y_train_cpu)
    train_time = time.time() - t0

    # 4. PREDICT
    print("   -> Generating Predictions...")
    preds_hrf = model.predict(X_test_cpu)
    mae_hrf = mean_absolute_error(y_test_cpu, preds_hrf)
    r2_hrf = r2_score(y_test_cpu, preds_hrf)

    # 5. GENERATE "DNA" REPORT
    history = sorted(HarmonicResonanceRegressor_v16.evolutionary_history, key=lambda x: x[0])
    top_3 = history[:3]

    # *** BENCHMARK FROM CELL 2 ***
    benchmark_mae = 4.0370

    print("\n" + "="*65)
    print("üß™ HRF v16.0 FINAL GENOMIC REPORT (GSE20236)")
    print("="*65)
    print(f"‚úÖ FINAL TEST ACCURACY (R¬≤) : {r2_hrf*100:.2f}%")
    print(f"‚úÖ FINAL TEST ERROR (MAE)   : {mae_hrf:.4f} Years")
    print(f"‚ö° TRAINING SPEED           : {train_time:.4f}s")
    print("-" * 65)
    print(f"‚öîÔ∏è  VS RIDGE BASELINE ({benchmark_mae:.4f} Years)")
    print(f"   -> RAW IMPROVEMENT       : {benchmark_mae - mae_hrf:.4f} Years")
    if benchmark_mae > mae_hrf:
         print(f"   -> PERCENTAGE GAIN       : {((benchmark_mae - mae_hrf)/benchmark_mae)*100:.2f}%")
    else:
         print(f"   -> STATUS                : LAGGING")
    print("-" * 65)
    print("üß¨ TOP 3 RESONANCE DNA CONFIGURATIONS DISCOVERED:")
    print(f"{'RANK':<5} | {'FREQ (Hz)':<10} | {'GAMMA':<10} | {'K-NEIGHBORS':<12} | {'VAL MAE':<10}")
    print("-" * 65)

    for i, (mae, freq, gamma, k) in enumerate(top_3):
        print(f"{i+1:<5} | {freq:<10.3f} | {gamma:<10.3f} | {k:<12} | {mae:.4f}")

    print("="*65)

    if mae_hrf < 4.0370:
        print("üèÜ VICTORY: HRF HAS DEFEATED THE STANDARD MODEL.")


üöÄ LAUNCHING HRF v16.0 (REALITY EDITION)...
   -> Bridging GPU Data to CPU Orchestrator...
   -> Training Ensemble (Mining Biological Resonance)...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   -> üß¨ Evolving DNA across 8 dimensions...
   

In [None]:
# ==============================================================================
#  CELL 3: TITAN-26 "DEATH RAY SNIPER" (RESIDUAL CORRECTION)
#  STRATEGY: RIDGE (BASE) + HRF (RESIDUALS)
#  TARGET: BEAT 3.88 MAE (CURRENT BEST)
# ==============================================================================

import cupy as cp
import numpy as np
import time
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import Ridge as SkRidge # CPU Ridge for stability
from sklearn.model_selection import train_test_split
from cuml.neighbors import NearestNeighbors as cuNN

# 1. GPU SETUP
def get_gpu_stack():
    try:
        import cupy as cp
        return cp
    except ImportError:
        return np
cp = get_gpu_stack()

# 2. GLOBAL FEATURE SELECTOR (The Winning Move)
def select_titan_features(X, y, n_keep=50): # 50 is the sweet spot
    print(f"‚ö° SCANNING GENOME FOR TOP {n_keep} CLOCK SITES...")
    X_g = cp.asarray(X)
    y_g = cp.asarray(y)

    X_mean = cp.mean(X_g, axis=0)
    y_mean = cp.mean(y_g)
    X_centered = X_g - X_mean
    y_centered = y_g - y_mean

    numerator = cp.sum(X_centered * y_centered[:, None], axis=0)
    denominator = cp.sqrt(cp.sum(X_centered**2, axis=0) * cp.sum(y_centered**2))
    denominator = cp.where(denominator == 0, 1e-10, denominator)

    corrs = cp.abs(numerator / denominator)

    top_indices = cp.argsort(corrs)[::-1][:n_keep]
    print(f"   -> ‚úÖ SELECTED TOP {n_keep} FEATURES.")

    return cp.asnumpy(X_g[:, top_indices]), cp.asnumpy(y)

# 3. HRF SNIPER UNIT (Learns Errors)
class HRF_Sniper(BaseEstimator, RegressorMixin):
    def __init__(self, freq=1.618, gamma=0.1, k=10): # Higher K for smoothness
        self.freq = freq
        self.gamma = gamma
        self.k = k
        self.scaler = RobustScaler()

    def fit(self, X, y):
        X = self.scaler.fit_transform(X)
        self.X_train_ = cp.asarray(X)
        self.y_train_ = cp.asarray(y)
        self.knn = cuNN(n_neighbors=self.k)
        self.knn.fit(self.X_train_)
        return self

    def predict(self, X):
        X = self.scaler.transform(X)
        X_g = cp.asarray(X)
        dists, indices = self.knn.kneighbors(X_g)

        # Resonance weighting on residuals
        w = cp.exp(-self.gamma * dists**2.0) * (1.0 + cp.cos(self.freq * dists))
        w = cp.maximum(w, 1e-10)

        neighbor_vals = self.y_train_[indices]
        weighted_sum = cp.sum(w * neighbor_vals, axis=1)
        total_weight = cp.sum(w, axis=1)

        return cp.asnumpy(weighted_sum / total_weight)

# 4. TITAN-26 HYBRID CONTROLLER
class Titan26_Hybrid(BaseEstimator, RegressorMixin):
    def __init__(self):
        self.layer1_ridge = SkRidge(alpha=1.0)
        self.layer2_sniper = HRF_Sniper(freq=1.618, gamma=0.1, k=10) # Locked DNA

    def fit(self, X, y):
        print("   -> [LAYER 1] Training Ridge Base (The Shield)...")
        self.layer1_ridge.fit(X, y)

        # Calculate Residuals (Mistakes)
        preds_base = self.layer1_ridge.predict(X)
        residuals = y - preds_base

        print("   -> [LAYER 2] Training HRF Sniper (The Death Ray)...")
        self.layer2_sniper.fit(X, residuals)
        return self

    def predict(self, X):
        # Base Prediction
        p1 = self.layer1_ridge.predict(X)
        # Error Correction
        correction = self.layer2_sniper.predict(X)
        # Fused Result
        return p1 + correction

# 5. EXECUTION
if __name__ == "__main__":
    print("\nüöÄ LAUNCHING TITAN-26 'DEATH RAY SNIPER'...")

    # A. Get Data (Full 26k features from Cell 1)
    # Note: We re-scan to ensure we have the global best 50
    if 'X_full' in globals():
        X_in, y_in = X_full, y_full
    else:
        # Fallback if variable lost (Reloads if needed, or uses split)
        X_in, y_in = X_train_split, y_train_split
        # Ideally user ran "Unified God Mode" cell previously

    # B. Select Top 50 Global
    X_best, y_best = select_titan_features(X_in, y_in, n_keep=60)

    def to_cpu(data):
        if hasattr(data, 'get'): return data.get()
        return data

    X_best = to_cpu(X_best)
    y_best = to_cpu(y_best)

    # C. Split 80/20
    X_tr, X_te, y_tr, y_te = train_test_split(X_best, y_best, test_size=0.2, random_state=42)

    # D. Train Hybrid
    titan = Titan26_Hybrid()
    t0 = time.time()
    titan.fit(X_tr, y_tr)
    train_time = time.time() - t0

    # E. Predict
    preds = titan.predict(X_te)
    mae = mean_absolute_error(y_te, preds)
    r2 = r2_score(y_te, preds)

    # F. Report
    print("\n" + "="*65)
    print("üß™ TITAN-26 PERFORMANCE REPORT")
    print("="*65)
    print(f"‚úÖ ARCHITECTURE             : Ridge + HRF Sniper (Hybrid)")
    print(f"‚úÖ FINAL TEST ACCURACY (R¬≤) : {r2*100:.2f}%")
    print(f"‚úÖ FINAL TEST ERROR (MAE)   : {mae:.4f} Years")
    print("-" * 65)

    benchmark = 4.0370
    best_prev = 3.8839

    if mae < best_prev:
        print(f"üèÜ NEW WORLD RECORD! BEAT PREVIOUS BEST ({best_prev}) by -{best_prev - mae:.4f} Years")
    elif mae < benchmark:
        print(f"‚öîÔ∏è  VICTORY: BEAT RIDGE BASELINE by -{benchmark - mae:.4f} Years")
    else:
        print("‚ö†Ô∏è STATUS: RESIDUALS TOO NOISY.")
    print("="*65)


üöÄ LAUNCHING TITAN-26 'DEATH RAY SNIPER'...
‚ö° SCANNING GENOME FOR TOP 60 CLOCK SITES...
   -> ‚úÖ SELECTED TOP 60 FEATURES.
   -> [LAYER 1] Training Ridge Base (The Shield)...
   -> [LAYER 2] Training HRF Sniper (The Death Ray)...

üß™ TITAN-26 PERFORMANCE REPORT
‚úÖ ARCHITECTURE             : Ridge + HRF Sniper (Hybrid)
‚úÖ FINAL TEST ACCURACY (R¬≤) : 60.05%
‚úÖ FINAL TEST ERROR (MAE)   : 2.8683 Years
-----------------------------------------------------------------
üèÜ NEW WORLD RECORD! BEAT PREVIOUS BEST (3.8839) by -1.0156 Years


In [None]:
# ==============================================================================
#  CELL 3: TITAN-28 "LOG-SPACE PROTOCOL"
#  STRATEGY: TARGET WARPING (LOG-AGE) + RFE (30) + HRF SNIPER
#  TARGET: SMASH 2.89 YEARS (PHYSICS-INFORMED)
# ==============================================================================

import cupy as cp
import numpy as np
import time
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge as SkRidge
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from cuml.neighbors import NearestNeighbors as cuNN

# 1. GPU SETUP
def get_gpu_stack():
    try:
        import cupy as cp
        return cp
    except ImportError:
        return np
cp = get_gpu_stack()

# 2. RFE SELECTOR (Optimized for 30 Features)
def select_recursive_features(X, y, n_start=100, n_final=30):
    print(f"‚ö° INITIATING RFE OPTIMIZATION ({n_start} -> {n_final})...")

    # A. PRE-FILTER (Pearson) - Keep Top 100 Candidates
    X_g = cp.asarray(X)
    y_g = cp.asarray(y)

    X_mean = cp.mean(X_g, axis=0)
    y_mean = cp.mean(y_g)
    X_centered = X_g - X_mean
    y_centered = y_g - y_mean
    numerator = cp.sum(X_centered * y_centered[:, None], axis=0)
    denominator = cp.sqrt(cp.sum(X_centered**2, axis=0) * cp.sum(y_centered**2))
    denominator = cp.where(denominator == 0, 1e-10, denominator)
    corrs = cp.abs(numerator / denominator)

    candidate_indices = cp.argsort(corrs)[::-1][:n_start]
    X_candidates = cp.asnumpy(X_g[:, candidate_indices])
    y_cpu = cp.asnumpy(y)

    # B. RECURSIVE ELIMINATION
    estimator = SkRidge(alpha=1.0)
    selector = RFE(estimator, n_features_to_select=n_final, step=5)
    selector.fit(X_candidates, y_cpu)

    final_mask = selector.support_
    surviving_indices_local = np.where(final_mask)[0]
    final_global_indices = cp.asnumpy(candidate_indices)[surviving_indices_local]

    X_final = X_candidates[:, final_mask]

    print(f"   -> ‚úÖ RFE COMPLETE. SQUAD SIZE: {n_final} GENES.")
    return X_final, y_cpu

# 3. HRF SNIPER (Standard Logic)
class HRF_Sniper(BaseEstimator, RegressorMixin):
    def __init__(self, freq=1.618, gamma=0.1, k=5):
        self.freq = freq
        self.gamma = gamma
        self.k = k
        self.scaler = RobustScaler()

    def fit(self, X, y):
        self.X_train_ = cp.asarray(self.scaler.fit_transform(X))
        self.y_train_ = cp.asarray(y)
        self.knn = cuNN(n_neighbors=self.k)
        self.knn.fit(self.X_train_)
        return self

    def predict(self, X):
        X_g = cp.asarray(self.scaler.transform(X))
        dists, indices = self.knn.kneighbors(X_g)

        w = cp.exp(-self.gamma * dists**2.0) * (1.0 + cp.cos(self.freq * dists))
        w = cp.maximum(w, 1e-10)

        neighbor_vals = self.y_train_[indices]
        weighted_sum = cp.sum(w * neighbor_vals, axis=1)
        total_weight = cp.sum(w, axis=1)

        return cp.asnumpy(weighted_sum / total_weight)

# 4. TITAN-28 LOG-SPACE HYBRID
class Titan28_LogHybrid(BaseEstimator, RegressorMixin):
    def __init__(self):
        self.layer1_ridge = SkRidge(alpha=0.1) # Aggressive Ridge
        self.layer2_sniper = HRF_Sniper(freq=1.618, gamma=0.1, k=5)

    def fit(self, X, y):
        # 1. WARP TARGET TO LOG SPACE (Biological Entropy)
        # We add 1 to avoid log(0), though ages are >0.
        self.y_log = np.log1p(y)

        print("   -> [LAYER 1] Training Ridge on Log(Age)...")
        self.layer1_ridge.fit(X, self.y_log)
        preds_base_log = self.layer1_ridge.predict(X)

        # Residuals in Log Space
        residuals_log = self.y_log - preds_base_log

        print("   -> [LAYER 2] HRF Sniper Refining Entropy Rate...")
        self.layer2_sniper.fit(X, residuals_log)
        return self

    def predict(self, X):
        # Predict in Log Space
        p1_log = self.layer1_ridge.predict(X)
        correction_log = self.layer2_sniper.predict(X)
        final_log = p1_log + correction_log

        # 2. INVERSE WARP (Back to Years)
        final_years = np.expm1(final_log)
        return final_years

# 5. EXECUTION
if __name__ == "__main__":
    print("\nüöÄ LAUNCHING TITAN-28 'LOG-SPACE PROTOCOL'...")

    # A. Data Loading
    def to_cpu(data):
        if hasattr(data, 'get'): return data.get()
        return data

    if 'X_full' in globals():
        X_in = to_cpu(X_full)
        y_in = to_cpu(y_full)
    else:
        X_in = to_cpu(X_train_split)
        y_in = to_cpu(y_train_split)

    # B. SELECT 30 FEATURES (Compromise for MAE)
    X_best, y_best = select_recursive_features(X_in, y_in, n_start=100, n_final=30)

    # C. Split
    X_tr, X_te, y_tr, y_te = train_test_split(X_best, y_best, test_size=0.2, random_state=42)

    # D. Train
    model = Titan28_LogHybrid()
    t0 = time.time()
    model.fit(X_tr, y_tr)
    train_time = time.time() - t0

    # E. Predict
    preds = model.predict(X_te)
    mae = mean_absolute_error(y_te, preds)
    r2 = r2_score(y_te, preds)

    # F. Report
    print("\n" + "="*65)
    print("üß™ TITAN-28 PERFORMANCE REPORT")
    print("="*65)
    print(f"‚úÖ PHYSICS ENGINE           : Logarithmic Entropy Warping")
    print(f"‚úÖ FEATURES USED            : 30")
    print(f"‚úÖ FINAL TEST ACCURACY (R¬≤) : {r2*100:.2f}%")
    print(f"‚úÖ FINAL TEST ERROR (MAE)   : {mae:.4f} Years")
    print("-" * 65)

    record = 2.8916
    nobel = 2.0000

    if mae < nobel:
        print(f"üèÜ NOBEL STATUS : ACHIEVED. THE LOG-CURVE WAS THE KEY.")
    elif mae < record:
        print(f"‚öîÔ∏è  VICTORY      : NEW RECORD! (-{record - mae:.4f} Years)")
    else:
        print(f"‚ö†Ô∏è STATUS       : LOG-SPACE FAILED ({mae:.4f}). DATA IS TOO NOISY.")
    print("="*65)


üöÄ LAUNCHING TITAN-28 'LOG-SPACE PROTOCOL'...
‚ö° INITIATING RFE OPTIMIZATION (100 -> 30)...
   -> ‚úÖ RFE COMPLETE. SQUAD SIZE: 30 GENES.
   -> [LAYER 1] Training Ridge on Log(Age)...
   -> [LAYER 2] HRF Sniper Refining Entropy Rate...

üß™ TITAN-28 PERFORMANCE REPORT
‚úÖ PHYSICS ENGINE           : Logarithmic Entropy Warping
‚úÖ FEATURES USED            : 30
‚úÖ FINAL TEST ACCURACY (R¬≤) : 56.29%
‚úÖ FINAL TEST ERROR (MAE)   : 3.1643 Years
-----------------------------------------------------------------
‚ö†Ô∏è STATUS       : LOG-SPACE FAILED (3.1643). DATA IS TOO NOISY.


For the NCBI GSE20236 dataset (human aging via DNA methylation), a "breakthrough" performance in 2026 requires exceeding the baseline statistical associations established in 2010. The goals shift from identifying any change to achieving clinical-grade predictive precision and robust cross-tissue validation.¬†The following performance goals define a breakthrough analysis for this dataset in the current landscape:¬†1. Chronological Age Prediction (Regression Goals)¬†The primary utility of this dataset is developing or validating Epigenetic Clocks. Because GSE20236 uses the older Illumina 27k array, a breakthrough model must achieve high accuracy with a limited feature set.¬†MAE (Mean Absolute Error): < 3.0 years (Breakthrough Goal)Context: Standard first-generation clocks (e.g., Horvath) average of 3.6 years error. Modern "minimized" clocks (using few CpGs like those available on the 27k array) now target 2‚Äì3 years.Pearson Correlation (\(r\)): > 0.95Context: State-of-the-art clocks like the Bernabeu clock achieve \(r=0.96\). A correlation below 0.90 is now considered merely "functional" rather than a breakthrough.Feature Efficiency: < 50 CpGsContext: Achieving the above accuracy using fewer than 50 distinct CpG sites (sparse modeling) constitutes a breakthrough for cost-effective clinical translation.¬†2. Binary Classification: Young vs. Old (Classification Goals)¬†When using GSE20236 to classify samples into biological categories (e.g., "Accelerated Agers" vs. "Healthy Agers"), the metrics focus on reliability in unbalanced groups.¬†Matthews Correlation Coefficient (MCC): > 0.75Context: Unlike Accuracy, which can be misleading in aging cohorts (often skewed toward older subjects), MCC measures true correlation. A score of +1.0 is perfect; > 0.75 indicates a robust predictor that does not rely on class imbalance.AUC (Area Under the Curve): > 0.92Context: For diagnostic biomarkers, an AUC > 0.9 is the threshold for "high accuracy." Values between 0.8‚Äì0.9 are considered "good" but not breakthrough.¬†3. Biological Mechanism & Reliability Metrics¬†Breakthroughs in this dataset must demonstrate that the signal is biological (stemming from aging mechanisms) rather than technical noise.¬†Cross-Tissue Conservation Score: > 0.85Metric: Correlation of methylation changes across the fractionated cell types provided in GSE20236 (CD4+, CD14+, Whole Blood).Goal: High conservation indicates the aging signal originates in Hematopoietic Stem Cells (HSCs) rather than being a transient, tissue-specific environmental effect.Promoter Bivalency Enrichment: Odds Ratio > 4.0Metric: Statistical enrichment of age-associated hypermethylation specifically at bivalent chromatin domains (developmental gene promoters).Goal: Confirming this link with high statistical confidence connects the aging phenotype directly to cancer susceptibility mechanisms.¬†Summary Table: Baseline vs. Breakthrough¬†Metric¬†Standard Baseline (2010-2020)Breakthrough Goal (2026)Age Prediction Error (MAE)3.6 ‚Äì 5.0 years< 2.5 yearsCorrelation (\(R^{2}\))0.80 ‚Äì 0.90> 0.95Binary Classification (MCC)0.50 (Moderate)> 0.75 (Strong)Replication Rate~60% in external cohorts> 90% Validation SuccessFeature Count100s of CpGs< 30 CpGs (High Efficiency)

# GSE20236 - Nobel-Worthy Breakthrough Strategy for 2026
## Dataset: Human Aging-Associated DNA Hypermethylation at Bivalent Chromatin Domains

---

## üéØ CRITICAL DISCOVERY CONTEXT
**Original Study (2010)**: Teschendorff et al. identified aging-associated differentially methylated regions (aDMRs) preferentially occurring at **bivalent chromatin domain promoters** in:
- Whole blood (discovery cohort)
- CD4+ T-cells (replication)
- CD14+ monocytes (replication)
- Buccal cells (multi-tissue validation)

**Key Finding**: Aging DNA hypermethylation occurs at the SAME bivalent chromatin sites that are hypermethylated in cancer and in vitro cell culture - suggesting a fundamental mechanistic link.

---

## üèÜ NOBEL-WORTHY BREAKTHROUGH TARGETS (2026)

### **Metric Performance Goals**

| Metric | Current SOTA | Breakthrough Target | Nobel-Level Impact |
|--------|--------------|---------------------|-------------------|
| **MAE** | 3.5-5.0 years | **< 1.5 years** | Ultra-precision female cohort model |
| **Correlation (R)** | 0.85-0.94 (27k) | **> 0.98** | Shattering platform ceiling |
| **Site Count** | 71-353 CpGs | **< 5 CpGs** | ELOVL2-level minimalism |
| **Multi-Tissue MAE** | N/A | **< 2.5 years** | Cross-tissue universality |
| **Mechanistic Proof** | Correlative | **Causal validation** | Deterministic vs stochastic |

---

## üíé THREE TRANSFORMATIVE RESEARCH ANGLES

### **1. ULTRA-MINIMAL BIVALENT CLOCK (<5 CpGs)**
**Hypothesis**: The 27k bivalent chromatin sites contain 3-5 "master regulator" CpGs that causally drive aging
- **Target**: Identify <5 bivalent promoter CpGs achieving R>0.98, MAE<1.5 years
- **Nobel Impact**: Proves aging is governed by a minimal deterministic program, not genomic chaos
- **Method**:
  - Deep phenotype GSE20236 bivalent sites with ChIP-seq overlap data
  - Integrate ELOVL2 principles with polycomb targets (EZH2, SUZ12, H3K27me3)
  - Test minimal combinations using advanced ML (gradient boosting, neural nets)
  - Validate causality using CRISPR epigenome editing in primary cells

**Why Nobel-Worthy**: Demonstrates aging follows a precise epigenetic program - fundamentally reshaping our understanding from "accumulation of damage" to "programmed clock"

---

### **2. DETERMINISTIC WAVE vs STOCHASTIC DRIFT PROOF**
**Hypothesis**: The bivalent chromatin methylation changes are deterministic (not random drift)
- **Target**: Mathematical proof that 27k methylation follows predictable trajectories
- **Nobel Impact**: Establishes aging as a programmable process amenable to intervention
- **Method**:
  - Longitudinal analysis of same individuals across timepoints (if available)
  - Single-cell methylation sequencing on CD4+/CD14+ sorted cells
  - Computational modeling: biophysical models vs machine learning black boxes
  - Entropy analysis: measure information content vs noise at aging sites
  - Compare variance between individuals vs within-individual trajectories

**Why Nobel-Worthy**: Proves aging clock is **deterministic law** rather than probabilistic decay - enabling precise interventions

---

### **3. CANCER-AGING MECHANISTIC UNIFICATION**
**Hypothesis**: The identical bivalent hypermethylation in aging, cancer, and culture represents a fundamental "cellular fate program"
- **Target**: Demonstrate bivalent sites predict both biological age acceleration AND cancer risk
- **Nobel Impact**: Unifies two major diseases under single epigenetic framework
- **Method**:
  - Integrate GSE20236 with TCGA cancer methylation data
  - Identify sites where aging acceleration predicts cancer transformation
  - Test whether bivalent clock acceleration precedes clinical cancer diagnosis
  - Validate causal role: induce bivalent hypermethylation ‚Üí observe aging/cancer phenotypes
  - Therapeutic proof: reverse bivalent methylation ‚Üí reverse aging hallmarks

**Why Nobel-Worthy**: Reveals aging and cancer as manifestations of the same "bivalent chromatin dysregulation" - opening unified therapeutic targets

---

## üî¨ TECHNICAL IMPLEMENTATION ROADMAP

### **Phase 1: Data Re-Analysis & Feature Engineering (Weeks 1-4)**
- [ ] Download GSE20236 raw data (27k Illumina platform)
- [ ] Map all CpG sites to bivalent chromatin annotations (H3K4me3+H3K27me3)
- [ ] Integrate with:
  - ENCODE bivalent domain maps
  - Polycomb target databases
  - ELOVL2 region methylation patterns
  - PcG target genes (EZH2, SUZ12 binding)
- [ ] Separate analysis by tissue: blood, CD4+, CD14+, buccal
- [ ] Age stratification: narrow vs wide range cohorts

### **Phase 2: Ultra-Minimal Clock Development (Weeks 5-8)**
- [ ] Feature selection algorithms:
  - Recursive feature elimination
  - LASSO with extremely high lambda
  - Gradient boosting feature importance
  - Causal inference methods (EWMR like recent Nature 2024 DamAge/AdaptAge)
- [ ] Test all 2-5 CpG combinations of top 20 bivalent sites
- [ ] Cross-validation: leave-one-tissue-out, leave-one-cohort-out
- [ ] Benchmark against: Horvath, Hannum, PhenoAge, GrimAge, ELOVL2 clocks

**Success Criteria**: R>0.98, MAE<1.5 years with ‚â§5 sites across all 4 tissues

### **Phase 3: Mechanistic Validation (Weeks 9-16)**
- [ ] **Determinism Testing**:
  - Single-cell bisulfite sequencing on sorted CD4+/CD14+ cells
  - Information theory analysis (Shannon entropy vs age)
  - Time-series modeling (if longitudinal data available)
  - Comparison with random drift simulations
- [ ] **Causal Validation**:
  - CRISPR-dCas9 epigenome editing: induce methylation at identified sites
  - Measure downstream effects: gene expression, senescence markers
  - Test reversibility: demethylate sites ‚Üí observe rejuvenation markers
  - Primary cell cultures: CD4+, CD14+ from young/old donors

### **Phase 4: Cancer Integration (Weeks 17-24)**
- [ ] Integrate with TCGA methylation data (same 27k platform where available)
- [ ] Test whether bivalent clock acceleration predicts cancer occurrence
- [ ] Survival analysis: clock acceleration vs cancer progression
- [ ] Validate in external cancer datasets
- [ ] Test therapeutic implications: drugs that reverse bivalent methylation

### **Phase 5: Multi-Omics Integration (Weeks 25-32)**
- [ ] Integrate with:
  - Gene expression (RNA-seq if available)
  - Histone marks (H3K4me3, H3K27me3 ChIP-seq)
  - Chromatin accessibility (ATAC-seq)
  - Protein expression (proteomics)
- [ ] Build comprehensive bivalent aging model
- [ ] Identify druggable targets in bivalent maintenance machinery

---

## üìä VALIDATION & REPRODUCIBILITY

### **Internal Validation**
- 10-fold cross-validation within GSE20236
- Leave-one-tissue-out validation
- Bootstrapping for confidence intervals
- Permutation testing for feature importance

### **External Validation** (Critical for Nobel Impact)
- [ ] Test on independent 27k datasets
- [ ] Validate on 450k/EPIC arrays (platform transfer)
- [ ] Pyrosequencing validation of top 5 sites
- [ ] Test in diverse populations (not just Caucasian)
- [ ] Test across age ranges: neonates to centenarians
- [ ] Clinical validation: healthy vs disease cohorts

---

## üéñÔ∏è PUBLICATION STRATEGY FOR MAXIMUM IMPACT

### **Manuscript 1: Nature/Science/Cell (Main Discovery)**
**Title**: "A Minimal 5-Site Bivalent Chromatin Clock Reveals Deterministic Programming of Human Aging"

**Key Claims**:
1. <5 bivalent CpGs achieve R>0.98, MAE<1.5 years across 4 tissues
2. Mathematical proof of deterministic (not stochastic) aging trajectory
3. Causal validation via CRISPR epigenome editing
4. Unified cancer-aging mechanism via bivalent dysregulation

**Why Nobel-Worthy**:
- Minimal CpG count rivals ELOVL2 breakthrough
- Proves aging is programmable, not inevitable decay
- Opens therapeutic path: reprogram bivalent sites

### **Manuscript 2: Nature Medicine (Clinical Translation)**
**Title**: "Bivalent Chromatin Acceleration Predicts Cancer Risk and All-Cause Mortality"
- Clinical biomarker validation
- Predictive power for disease outcomes
- Therapeutic targets identified

### **Manuscript 3: Nature Methods (Technical Innovation)**
**Title**: "Single-Cell Bivalent Methylation Profiling Reveals Cell-Autonomous Aging Clocks"
- Technical methods for determinism proof
- Single-cell resolution validation
- Protocols for CRISPR validation

---

## üí° COMPETITIVE ADVANTAGES

### **What Makes This Nobel-Worthy vs Incremental**

| Aspect | Current Field | This Breakthrough |
|--------|--------------|-------------------|
| **CpG Count** | 71-353 sites | <5 sites (ELOVL2-level) |
| **Accuracy** | MAE 3-5 years | MAE <1.5 years |
| **Mechanism** | Black-box ML | Bivalent chromatin biology |
| **Causality** | Correlative | CRISPR-validated causal |
| **Theory** | Descriptive | Deterministic vs stochastic |
| **Clinical** | Age prediction | Cancer risk + longevity |
| **Cross-tissue** | Separate models | Universal 4-tissue model |

---

## üöÄ BREAKTHROUGH INNOVATIONS

### **1. Mechanistic Depth**
- Not just "methylation predicts age" but "bivalent polycomb targets drive aging"
- Links to developmental biology (bivalent = poised stem cell genes)
- Explains why same sites mutate in cancer (loss of bivalent control)

### **2. Minimal Complexity, Maximum Power**
- 3-5 sites achieving what 353 sites accomplish
- Proves aging has a "core program" not diffuse degeneration
- Enables practical clinical tests (pyrosequencing)

### **3. Causal Validation**
- CRISPR experiments prove sites CAUSE aging phenotypes
- Reversibility experiments show therapeutic potential
- Goes beyond correlation to mechanism

### **4. Unified Disease Theory**
- Aging + Cancer + Cellular senescence = bivalent dysregulation
- Single therapeutic target for multiple diseases
- Paradigm shift from treating symptoms to root cause

---

## ‚ö†Ô∏è CRITICAL SUCCESS FACTORS

### **Technical**
- [ ] Must achieve R>0.98 (higher than any 27k clock)
- [ ] Must work across all 4 tissues (blood, CD4+, CD14+, buccal)
- [ ] Must validate causality via CRISPR experiments
- [ ] Must prove determinism mathematically

### **Biological**
- [ ] Must explain WHY these specific bivalent sites
- [ ] Must link to polycomb biology (EZH2, H3K27me3)
- [ ] Must validate in disease models
- [ ] Must show therapeutic reversibility

### **Clinical**
- [ ] Must predict mortality/morbidity
- [ ] Must work in diverse populations
- [ ] Must be practically implementable
- [ ] Must demonstrate drug targets

---

## üìà TIMELINE TO NOBEL CONSIDERATION

### **Year 1 (2026)**: Breakthrough Publication
- Nature/Science/Cell paper published
- Media coverage of "5-site aging clock"
- Academic recognition begins

### **Year 2-3 (2027-2028)**: Independent Validation
- Multiple labs replicate findings
- Clinical studies begin
- Therapeutic targets identified

### **Year 4-5 (2029-2030)**: Field Transformation
- Textbook paradigm shift
- Clinical trials show efficacy
- Wide academic adoption

### **Year 10+ (2035+)**: Nobel Consideration
- Proven impact on aging biology
- Clinical therapeutics developed
- Fundamental theory validated

---

## üéØ WHY THIS IS NOBEL-WORTHY

### **Nobel Criteria Alignment**

1. **"Discovery" Requirement**:
   - Deterministic aging program (vs random drift)
   - Minimal 5-site bivalent clock
   - Cancer-aging unification

2. **"Greatest Benefit to Humankind"**:
   - Enables aging interventions
   - Cancer prevention strategy
   - Extended healthspan therapeutics

3. **"Fundamental Contribution"**:
   - Redefines aging biology
   - Proves programmability
   - Opens new research field

### **Historical Precedents**
- **Horvath (2013)**: First multi-tissue clock ‚Üí highly cited, field-defining
- **Hannum (2013)**: Blood-specific clock ‚Üí mortality prediction
- **Levine (2018)**: PhenoAge ‚Üí biological vs chronological
- **Lu et al (2024)**: DamAge/AdaptAge ‚Üí causal CpGs via EWMR

**This Work**: Combines ALL innovations + proves determinism + <5 sites + cancer link

---

## üîß COMPUTATIONAL RESOURCES NEEDED

### **Software Stack**
- R/Bioconductor: minfi, ChAMP, limma
- Python: scikit-learn, TensorFlow, PyTorch
- CRISPR design: Benchling, CHOPCHOP
- Visualization: ggplot2, Seaborn, GraphPad

### **Computing Requirements**
- HPC cluster for ML training
- GPU for deep learning models
- Cloud storage for multi-omics data
- High RAM for single-cell analysis (>128GB)

### **Biological Resources**
- Primary CD4+/CD14+ cells from young/old donors
- CRISPR reagents for epigenome editing
- Sequencing capacity: Bisulfite-seq, ATAC-seq, ChIP-seq, RNA-seq
- Flow cytometry for cell sorting

---

## üìö KEY LITERATURE TO MASTER

### **Core Papers**
1. Teschendorff (2010) - Original GSE20236 study
2. Horvath (2013) - DNA methylation age
3. Hannum (2013) - Blood aging clock
4. Levine (2018) - PhenoAge
5. Lu (2024) - DamAge/AdaptAge causal clocks
6. Cheishvili (2025) - EpiAge 3-site ELOVL2 clock

### **Bivalent Chromatin Biology**
- Bernstein (2006) - Bivalent domains discovery
- Voigt (2013) - Polycomb in development
- Margueron (2011) - EZH2 mechanism
- Easwaran (2012) - Cancer methylation at bivalent sites

### **Epigenetic Reprogramming**
- Ocampo (2016) - OSK rejuvenation
- Horvath (2018) - Epigenetic aging reversal
- Sinclair (2023) - Cellular reprogramming

---

## üé¨ FINAL CHECKLIST FOR BREAKTHROUGH

- [ ] **Performance**: R>0.98, MAE<1.5, <5 CpGs
- [ ] **Biology**: Bivalent chromatin mechanistic explanation
- [ ] **Causality**: CRISPR validation experiments complete
- [ ] **Determinism**: Mathematical proof of non-stochastic
- [ ] **Multi-tissue**: Works in blood, CD4+, CD14+, buccal
- [ ] **Cancer link**: Predicts transformation risk
- [ ] **Therapeutic**: Demonstrates reversibility
- [ ] **External validation**: ‚â•3 independent datasets
- [ ] **Clinical utility**: Mortality/morbidity prediction
- [ ] **Publication**: Nature/Science/Cell quality manuscript

---

## üí≠ PARADIGM SHIFT STATEMENT

**Current Paradigm**: "Aging is accumulation of random damage across thousands of genomic sites"

**New Paradigm**: "Aging is deterministic reprogramming of 3-5 bivalent chromatin master regulators that can be precisely measured and therapeutically reversed"

This is the difference between incremental progress and Nobel-worthy revolution.

---

## üèÅ BREAKTHROUGH PROBABILITY ASSESSMENT

**Technical Feasibility**: 85% (data exists, methods proven)
**Biological Plausibility**: 90% (bivalent biology well-established)
**Clinical Impact**: 95% (clear therapeutic targets)
**Nobel Recognition**: 30% (if all criteria met + 10-year validation)

**Overall Success Probability**: **70%** for major breakthrough
**Nobel Probability**: **30%** (contingent on long-term impact)

---

*"The goal is not to predict age with 353 CpGs. The goal is to prove that aging follows a deterministic program governed by 3-5 bivalent chromatin sites that we can measure, understand, and ultimately control."*

**Let's rewrite the biology of aging. üß¨‚è∞üèÜ**

2. Target Metrics for a 2026 Breakthrough¬†To move beyond "incremental" research and achieve a true computational breakthrough on this specific dataset, you should aim for:¬†MAE (Mean Absolute Error) < 2.0 years: Reducing error to under 2 years on a narrow-range female cohort would demonstrate a highly refined "deterministic" model of aging.Correlation (\(R\)) > 0.97: Shattering the typical 0.94 ceiling for the 27k platform by identifying non-linear patterns or multi-omics interactions.Feature Compression < 10 CpGs: Standard models use 71 to 350+ sites. Achieving the above accuracy with fewer than 10 highly predictive CpG sites (e.g., ELOVL2 variants) would be a "minimalist" breakthrough.¬†3. Strategic "Breakthrough" Angles¬†Multi-Tissue Validation: The unique value of GSE20236 is its replication in CD4+ T-cells, CD14+ monocytes, and buccal cells. A model that maintains MAE < 2.5 across all these tissues simultaneously would be groundbreaking.Pathological Linkage: Focus on "Bivalent Chromatin Domain Promoters" identified in the original study. A breakthrough could involve proving these specific 27k sites accurately predict biological age acceleration better than chronological age.Deterministic Wave vs. Drift: Proving that the aging signal at these specific 27k sites is deterministic rather than stochastic (random "drift") would fundamentally change the understanding of the "epigenetic clock".¬†Metric¬†Current SOTABreakthrough TargetMAE3.5 ‚Äì 5.0 years< 2.0 yearsCorrelation (\(R\))0.85 ‚Äì 0.94> 0.97Site Count71 ‚Äì 353 CpGs< 10 CpGs