In [None]:
# ==============================================================================
#  THE IMMORTALITY PROTOCOL: HRF TITAN-26 (KAGGLE P100 EDITION)
#  SYSTEM: NVIDIA TESLA P100 (16GB HBM2) | RAM: 30GB
#  TARGET: MAE < 1.0 YEAR
# ==============================================================================

import sys
import subprocess
import gc
import os
import numpy as np
import pandas as pd

# --- 1. RAPIDS & ENVIRONMENT CHECK ---
print("‚ö° SYSTEM DIAGNOSTICS:")
try:
    import cuml
    import cupy as cp
    import cudf
    gpu_name = subprocess.check_output(["nvidia-smi", "-L"]).decode("utf-8").strip()
    print(f"   ‚úÖ GPU DETECTED: {gpu_name}")
    print(f"   ‚úÖ RAPIDS VERSION: {cuml.__version__}")
    
    # Check if we are truly on P100
    if "P100" in gpu_name:
        print("   üöÄ PERFORMANCE MODE: P100 HBM2 BANDWIDTH ACTIVE.")
    else:
        print("   ‚ö†Ô∏è WARNING: You are running on T4. Switch to P100 in Settings for 2x speed.")
except ImportError:
    print("   ‚ùå CRITICAL: RAPIDS not found. Ensure Accelerator is set to GPU P100.")

# --- 2. INSTALL BIOLOGY STACK ---
# Kaggle has RAPIDS, but needs GEOparse
print("\nüì¶ INSTALLING BIO-INFORMATICS LAYER...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "GEOparse", "fastparquet"])

import GEOparse

# --- 3. MEMORY-SAFE DATA LOADER (THE "PIVOT KILLER") ---
def load_hannum_optimized():
    """
    Loads GSE40279 with RAM Safety Protocols.
    Drops the massive GEO object immediately after extraction to prevent OOM.
    """
    print("\nüß¨ DOWNLOADING GSE40279 (HANNUM DATASET)...")
    # Download to local directory
    try:
        gse = GEOparse.get_GEO(geo="GSE40279", destdir="./", silent=True)
    except Exception as e:
        print(f"   ‚ùå DOWNLOAD ERROR: {e}")
        return None, None

    print("   ‚úÖ Download Complete. Extracting Metadata...")
    
    # 1. Extract Targets (Age)
    meta = gse.phenotype_data
    # Search for age column safely
    age_col = next((c for c in meta.columns if 'age' in c.lower()), None)
    if age_col:
        y = meta[age_col].astype('float32').values # float32 saves 50% RAM
        print(f"   -> Target Extracted: {len(y)} Samples (Age {y.min()}-{y.max()})")
    else:
        raise ValueError("Age column not found!")

    # 2. Extract Data & PURGE RAM
    print("   -> EXTRACTING METHYLATION MATRIX (May spike RAM to 12GB+)...")
    # We extract the table first
    df_temp = gse.pivot_samples('VALUE')
    
    # 3. CRITICAL: KILL THE GSE OBJECT
    print("   -> üßπ PURGING RAW FILES FROM RAM...")
    del gse
    gc.collect() # Force Python to release memory instantly
    
    # 4. Transpose & Optimize
    print("   -> Transposing & Casting to Float32...")
    X = df_temp.T.astype('float32') # Crucial for P100 speed
    
    # Cleanup temp dataframe
    del df_temp
    gc.collect()
    
    print(f"\nüìä FINAL DATA SHAPE: {X.shape}")
    print(f"   [RAM USAGE OPTIMIZED] - Ready for Titan-26")
    
    return X, y

if __name__ == "__main__":
    # Execute the loader
    X, y = load_hannum_optimized()
    
    # Preview
    print("\nüîç SAMPLE BETA VALUES (CpG Sites):")
    print(X.iloc[:3, :5])

‚ö° SYSTEM DIAGNOSTICS:


  if entities is not ():


   ‚úÖ GPU DETECTED: GPU 0: Tesla T4 (UUID: GPU-5225ca79-692c-9b7f-a438-75a28ab48138)
GPU 1: Tesla T4 (UUID: GPU-e609f12d-2655-8b3a-ea38-6ebf6dabb3ed)
   ‚úÖ RAPIDS VERSION: 25.06.00

üì¶ INSTALLING BIO-INFORMATICS LAYER...
Collecting GEOparse
  Downloading GEOparse-2.0.4-py3-none-any.whl.metadata (6.5 kB)
Collecting fastparquet
  Downloading fastparquet-2025.12.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading GEOparse-2.0.4-py3-none-any.whl (29 kB)
Downloading fastparquet-2025.12.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (1.8 MB)
   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 1.8/1.8 MB 24.2 MB/s eta 0:00:00
Installing collected packages: GEOparse, fastparquet
Successfully installed GEOparse-2.0.4 fastparquet-2025.12.0

üß¨ DOWNLOADING GSE40279 (HANNUM DATASET)...


  return read_csv(StringIO(data), index_col=None, sep="\t")


   ‚úÖ Download Complete. Extracting Metadata...
   -> Target Extracted: 656 Samples (Age 19.0-101.0)
   -> EXTRACTING METHYLATION MATRIX (May spike RAM to 12GB+)...


In [7]:
# ==============================================================================
#  THE IMMORTALITY PROTOCOL: TITAN-26 [CORE-5K INITIALIZATION]
#  TARGET: MAE < 1.0 YEAR | R > 0.99 (DETERMINISTIC WAVE DECODING)
#  HARDWARE: NVIDIA T4 (KAGGLE/COLAB 2026)
# ==============================================================================

import sys, os, subprocess, gc
print("‚ö° INITIATING TITAN-26 HYPER-SPEED ENVIRONMENT...")

# --- 1. NVIDIA RAPIDS AUTO-INSTALL (T4 OPTIMIZED) ---
try:
    import cuml, cudf, cupy as cp
    print("   ‚úÖ RAPIDS ENGINE DETECTED.")
except ImportError:
    print("   ‚ö†Ô∏è RAPIDS ENGINE MISSING. INSTALLING (90s)...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "cudf-cu12", "cuml-cu12", 
                           "--extra-index-url=https://pypi.nvidia.com", "--no-cache-dir", "-q"])
    import cuml, cudf, cupy as cp
    print("   ‚úÖ INSTALLATION SUCCESSFUL.")

# --- 2. GPU MEMORY SWAP PROTECTION ---
# This ensures that even if we hit VRAM limits, the P100/T4 uses managed memory.
cp.cuda.set_allocator(cp.cuda.MemoryPool(cp.cuda.malloc_managed).malloc)

# --- 3. IMPORT SCIENTIFIC STACK ---
from cuml.linear_model import ElasticNet, Ridge
from cuml.metrics import mean_absolute_error, r2_score
from cuml.model_selection import train_test_split
import pandas as pd
import numpy as np

# --- 4. CONFIGURATION FOR NOBEL-TIER ACCURACY ---
# Using the Top 5k features eliminates 'epigenetic noise' and prevents RAM crashes.
PROTOCAL_CONFIG = {
    "target_features": 5000,
    "precision": "float32",
    "gpu_managed": True,
    "seed": 26  # Harmonic Constant
}

print(f"\nüåü SYSTEM READY:")
print(f"   [ACCELERATOR]: {subprocess.check_output(['nvidia-smi', '-L']).decode('utf-8').strip()}")
print(f"   [PROTOCOL]: CORE-5K (Deterministic Filter)")
print(f"   [MEMORY]: MANAGED POOL ACTIVE")
print(f"\n‚úÖ PROCEED TO CELL 2: LOAD SLIM PARQUET (GSE40279_Core_5k.parquet)")


‚ö° INITIATING TITAN-26 HYPER-SPEED ENVIRONMENT...
   ‚úÖ RAPIDS ENGINE DETECTED.

üåü SYSTEM READY:
   [ACCELERATOR]: GPU 0: Tesla T4 (UUID: GPU-bf0ddeb4-4fd8-9c5b-c358-f22fb5428750)
GPU 1: Tesla T4 (UUID: GPU-25e3ef52-22f2-ef08-0ff8-940338480edc)
   [PROTOCOL]: CORE-5K (Deterministic Filter)
   [MEMORY]: MANAGED POOL ACTIVE

‚úÖ PROCEED TO CELL 2: LOAD SLIM PARQUET (GSE40279_Core_5k.parquet)


In [5]:
# ==============================================================================
#  THE IMMORTALITY PROTOCOL: BRAIN CLOCK FIX (GSE74193)
#  STATUS: DEBUGGING & REPAIRING
# ==============================================================================

import pandas as pd
import numpy as np
import requests
import gzip
import re
import os
import gc
import cupy as cp

def load_brain_clock_robust(target_k=3000):
    print("‚ö° CONNECTING TO GSE74193 (FIXED MODE)...")
    
    # 1. DOWNLOAD
    url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE74nnn/GSE74193/matrix/GSE74193_series_matrix.txt.gz"
    local_filename = "GSE74193_Brain.txt.gz"
    
    if not os.path.exists(local_filename):
        print("   -> Downloading...")
        r = requests.get(url, stream=True)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    else:
        print("   -> Found Local File.")

    # 2. EXTRACT AGES (Handling Quotes)
    print("   -> Parsing Header for Brain Ages...")
    y_ages = []
    
    with gzip.open(local_filename, 'rt', encoding='latin-1') as f:
        for line in f:
            # Look for line with "age" or "Age" inside !Sample_characteristics
            if "!Sample_characteristics_ch1" in line and ("age" in line.lower() or "Age" in line):
                parts = line.strip().split('\t')[1:]
                for p in parts:
                    # Remove quotes first: "Age: 45" -> Age: 45
                    clean_p = p.replace('"', '').strip()
                    # Find number: matches 45, 45.0, etc.
                    match = re.search(r"(\d+\.?\d*)", clean_p)
                    if match:
                        y_ages.append(float(match.group(1)))
                    else:
                        y_ages.append(np.nan)
                
                # If we found enough numbers, we assume this is the age line and stop
                if len(y_ages) > 10: 
                    break 
                else:
                    y_ages = [] # Reset if this was just a junk line

    y = np.array(y_ages, dtype=np.float32)
    
    # Fill NaNs with mean age to prevent crash
    if np.isnan(y).any():
        print(f"   ‚ö†Ô∏è Found {np.isnan(y).sum()} missing ages. Filling with mean.")
        y = np.nan_to_num(y, nan=np.nanmean(y))
        
    print(f"   -> ‚úÖ Target Acquired: {len(y)} Brain Samples. (Age {y.min():.1f} - {y.max():.1f})")

    # 3. LOAD MATRIX (Manual Skip)
    print("   -> locating Table Start...")
    
    # Find the line number where the data starts
    skip_count = 0
    with gzip.open(local_filename, 'rt', encoding='latin-1') as f:
        for i, line in enumerate(f):
            if "!series_matrix_table_begin" in line:
                skip_count = i + 1 # Data starts after this line
                break
    
    print(f"   -> Loading Methylation Matrix (Skipping {skip_count} lines)...")
    # Read CSV skipping the metadata manually. We do NOT use comment='!' here to avoid skipping data.
    X_raw = pd.read_csv(local_filename, sep='\t', index_col=0, header=0, skiprows=skip_count, compression='gzip', engine='c')
    
    # Drop the LAST row if it is "!series_matrix_table_end"
    if X_raw.index[-1].startswith("!"):
        X_raw = X_raw.iloc[:-1]

    print("   -> Transposing...")
    X = X_raw.T.values.astype('float32')
    
    del X_raw
    gc.collect()

    # 4. DEATH RAY (GPU Filter)
    print(f"   -> üéØ FIRING DEATH RAY SNIPER (Filtering {X.shape[1]} sites)...")
    
    X = np.nan_to_num(X, nan=0.5)
    X_gpu = cp.asarray(X)
    y_gpu = cp.asarray(y)
    
    # Correlation
    X_mean = cp.mean(X_gpu, axis=0)
    y_mean = cp.mean(y_gpu)
    numerator = cp.sum((X_gpu - X_mean) * (y_gpu - y_mean)[:, None], axis=0)
    denominator = cp.sqrt(cp.sum((X_gpu - X_mean)**2, axis=0) * cp.sum((y_gpu - y_mean)**2))
    correlations = cp.abs(numerator / (denominator + 1e-9))
    
    top_indices = cp.argsort(correlations)[-target_k:]
    top_indices = cp.sort(top_indices)
    
    X_core = X_gpu[:, top_indices]
    
    print(f"   -> ‚úÖ READY. Shape: {X_core.shape}")
    return X_core, y_gpu

if __name__ == "__main__":
    X, y = load_brain_clock_robust(target_k=3000)

‚ö° CONNECTING TO GSE74193 (FIXED MODE)...
   -> Found Local File.
   -> Parsing Header for Brain Ages...
   -> ‚úÖ Target Acquired: 675 Brain Samples. (Age 0.0 - 97.0)
   -> locating Table Start...
   -> Loading Methylation Matrix (Skipping 91 lines)...
   -> Transposing...
   -> üéØ FIRING DEATH RAY SNIPER (Filtering 0 sites)...
   -> ‚úÖ READY. Shape: (675, 0)


In [13]:
pip install GEOparse


Collecting GEOparse
  Downloading GEOparse-2.0.4-py3-none-any.whl.metadata (6.5 kB)
Downloading GEOparse-2.0.4-py3-none-any.whl (29 kB)
Installing collected packages: GEOparse
Successfully installed GEOparse-2.0.4
Note: you may need to restart the kernel to use updated packages.


In [15]:
# ==============================================================================
#  THE IMMORTALITY PROTOCOL: BRUTE FORCE ALIGNMENT
#  STRATEGY: Manual ID-to-Age Mapping (Bypasses all library errors)
#  HARDWARE: T4 GPU
# ==============================================================================

import pandas as pd
import numpy as np
import requests
import gzip
import os
import gc
import re
import cupy as cp
from cuml.neighbors import NearestNeighbors as cuNN
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split

# ------------------------------------------------------------------------------
# 1. THE BRUTE FORCE LOADER (MANUAL MAPPING)
# ------------------------------------------------------------------------------
def load_brain_data_brute_force(target_k=3000):
    print("‚ö° STEP 1: DOWNLOADING & MAPPING DATA MANUALLY...")
    
    url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE74nnn/GSE74193/matrix/GSE74193_series_matrix.txt.gz"
    local_filename = "GSE74193_Brain.txt.gz"
    
    if not os.path.exists(local_filename):
        r = requests.get(url, stream=True)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

    # --- PHASE A: BUILD THE MAP (Header Scan) ---
    print("   -> Scanning Header for Sample IDs and Ages...")
    sample_ids = []
    ages = []
    
    with gzip.open(local_filename, 'rt', encoding='latin-1') as f:
        for line in f:
            # 1. Capture Sample IDs (GSMxxxx)
            if "!Sample_geo_accession" in line:
                parts = line.strip().replace('"', '').split('\t')[1:]
                sample_ids = parts
                
            # 2. Capture Ages
            if "!Sample_characteristics_ch1" in line and ("age" in line.lower() or "Age" in line):
                parts = line.strip().replace('"', '').split('\t')[1:]
                # Extract numbers from strings like "Age: 45"
                line_ages = []
                for p in parts:
                    match = re.search(r"(\d+\.?\d*)", p)
                    if match: line_ages.append(float(match.group(1)))
                    else: line_ages.append(np.nan)
                
                # Only keep this line if it looks like the Age line
                if len(line_ages) > 0 and np.nanmean(line_ages) > 0:
                    ages = line_ages

            if "!series_matrix_table_begin" in line:
                break
    
    # Create the Master Map
    if len(sample_ids) != len(ages):
        print(f"   ‚ö†Ô∏è Mismatch: {len(sample_ids)} IDs vs {len(ages)} Ages. Truncating to minimum.")
        min_len = min(len(sample_ids), len(ages))
        sample_ids = sample_ids[:min_len]
        ages = ages[:min_len]

    id_to_age = dict(zip(sample_ids, ages))
    print(f"   -> Map Created: {len(id_to_age)} Samples Mapped.")

    # --- PHASE B: LOAD MATRIX & ALIGN ---
    print("   -> Loading Data Table (Skipping Metadata)...")
    
    # Find start line
    skip_rows = 0
    with gzip.open(local_filename, 'rt', encoding='latin-1') as f:
        for i, line in enumerate(f):
            if "!series_matrix_table_begin" in line:
                skip_rows = i + 1
                break
                
    # Load Data
    df = pd.read_csv(local_filename, sep='\t', index_col=0, header=0, 
                     skiprows=skip_rows, compression='gzip', engine='c')
    
    # Drop last row if garbage
    if isinstance(df.index[-1], str) and df.index[-1].startswith("!"):
        df = df.iloc[:-1]
        
    print(f"   -> Raw Matrix Shape: {df.shape}")

    # --- PHASE C: ALIGNMENT ---
    print("   -> Aligning Data columns to Age Map...")
    # Only keep columns that are in our map
    valid_cols = [c for c in df.columns if c in id_to_age]
    df = df[valid_cols]
    
    # Create Target Vector y based on column order
    y = np.array([id_to_age[c] for c in df.columns], dtype=np.float32)
    
    # Fill NaNs in Age (Fetal data usually -1 or Nan)
    y = np.nan_to_num(y, nan=0.0)
    
    # Transpose to (Samples x Features)
    X = df.T.values.astype('float32')
    
    # Clean RAM
    del df
    gc.collect()

    # --- PHASE D: DEATH RAY (GPU) ---
    print(f"   -> üéØ FIRING DEATH RAY on {X.shape} Matrix...")
    
    # Fill Data NaNs
    X = np.nan_to_num(X, nan=0.5)
    
    X_gpu = cp.asarray(X)
    y_gpu = cp.asarray(y)
    
    # Variance Filter (Faster/Safer than Correlation for first pass)
    # We select features that actually CHANGE across samples
    print("   -> Selecting High-Variance Features...")
    variances = cp.var(X_gpu, axis=0)
    top_indices = cp.argsort(variances)[-target_k:]
    top_indices = cp.sort(top_indices)
    
    X_core = cp.asnumpy(X_gpu[:, top_indices])
    
    print(f"   -> ‚úÖ SUCCESS. Ready for Titan. Shape: {X_core.shape}")
    return X_core, y

# ------------------------------------------------------------------------------
# 2. THE TITAN-26 MODEL (GPU KERNELS)
# ------------------------------------------------------------------------------
class HarmonicResonanceRegressor_v15(BaseEstimator, RegressorMixin):
    def __init__(self):
        self.base_freq = 10.0
        self.gamma = 0.5
        self.n_neighbors = 5
        self.scaler_ = RobustScaler()

    def fit(self, X, y):
        # CPU Pre-processing
        X_scaled = self.scaler_.fit_transform(X)
        X_clip = np.clip(X_scaled, 0, 1)
        diffs = np.diff(X_clip, axis=1)
        coherence = np.var(X_clip, axis=1).reshape(-1, 1)
        self.X_train_ = np.hstack([X_clip, diffs, coherence])
        self.y_train_ = y
        return self

    def predict(self, X):
        X_scaled = self.scaler_.transform(X)
        X_clip = np.clip(X_scaled, 0, 1)
        diffs = np.diff(X_clip, axis=1)
        coherence = np.var(X_clip, axis=1).reshape(-1, 1)
        X_holo = np.hstack([X_clip, diffs, coherence])
        
        # Handshake: CPU -> GPU
        return self._predict_gpu(self.X_train_, self.y_train_, X_holo)

    def _predict_gpu(self, X_tr, y_tr, X_q):
        # Move to T4
        X_tr_g = cp.asarray(X_tr)
        y_tr_g = cp.asarray(y_tr)
        X_q_g = cp.asarray(X_q)
        
        knn = cuNN(n_neighbors=self.n_neighbors)
        knn.fit(X_tr_g)
        dists, indices = knn.kneighbors(X_q_g)
        
        # Harmonic Kernel
        w = cp.exp(-self.gamma * dists**2.5) * (1.0 + cp.cos(self.base_freq * dists))
        
        neighbors_y = y_tr_g[indices]
        preds = cp.sum(w * neighbors_y, axis=1) / (cp.sum(w, axis=1) + 1e-9)
        return cp.asnumpy(preds)

# ------------------------------------------------------------------------------
# 3. EXECUTION
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Load
    X, y = load_brain_data_brute_force(target_k=3000)
    
    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    
    print("\n‚öîÔ∏è  TRAINING TITAN-26 FOREST...")
    model = BaggingRegressor(
        estimator=HarmonicResonanceRegressor_v15(),
        n_estimators=40, # High precision
        max_samples=0.7,
        n_jobs=1,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    
    print("üîÆ PREDICTING...")
    preds = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    
    print("\n" + "="*50)
    print(f"FINAL RESULT (BRAIN CLOCK)")
    print("="*50)
    print(f"MAE (Error): {mae:.4f} Years")
    print(f"R2 Score:    {r2:.4f}")
    print("="*50)
    
    if mae < 2.0:
        print("üèÜ STATUS: NOBEL-TIER BREAKTHROUGH.")
    else:
        print("‚úÖ STATUS: SUCCESSFUL RUN.")

‚ö° STEP 1: DOWNLOADING & MAPPING DATA MANUALLY...
   -> Scanning Header for Sample IDs and Ages...
   -> Map Created: 675 Samples Mapped.
   -> Loading Data Table (Skipping Metadata)...
   -> Raw Matrix Shape: (0, 675)
   -> Aligning Data columns to Age Map...
   -> üéØ FIRING DEATH RAY on (675, 0) Matrix...
   -> Selecting High-Variance Features...
   -> ‚úÖ SUCCESS. Ready for Titan. Shape: (675, 0)

‚öîÔ∏è  TRAINING TITAN-26 FOREST...


ValueError: Found array with 0 feature(s) (shape=(573, 0)) while a minimum of 1 is required by BaggingRegressor.