In [None]:
"""
Preprocessing for Tau Protein Misfolding Prediction

This notebook:
1. Loads processed sequences and labels
2. Creates stratified train/val/test splits
3. Generates ProtBERT embeddings
4. Creates integer-encoded sequences and attention masks
5. Computes hand-crafted sequence features
6. Saves all preprocessed data
"""

import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import torch
from tqdm.notebook import tqdm

# Import our utilities
from utils import (
    load_core_tables,
    make_splits,
    save_core_tables,
    compute_protbert_embeddings,
    encode_sequences_to_int,
    create_attention_masks,
    compute_sequence_features,
    save_embeddings_and_arrays,
    SEQUENCES_CSV,
    LABELS_CSV,
    SPLITS_CSV,
    EMBEDDINGS_DIR,
    AMINO_ACID_VOCAB,
    DEVICE,
    TRAIN_RATIO,
    VAL_RATIO,
    TEST_RATIO,
)

# Plot styling: seaborn grid theme plus default figure size and font size.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Report the runtime environment so later outputs can be interpreted.
for message in (
    "‚úÖ Imports successful!",
    f"Device: {DEVICE}",
    f"Working directory: {Path.cwd()}",
):
    print(message)


In [None]:
"""
Load sequences and labels from previous notebook
"""

print("=" * 80)
print("STEP 1: LOADING PROCESSED DATA")
print("=" * 80)

# Load the two core tables (paths come from utils).
df_sequences = pd.read_csv(SEQUENCES_CSV)
df_labels = pd.read_csv(LABELS_CSV)

print(f"‚úÖ Loaded sequences: {len(df_sequences)} samples")
print(f"‚úÖ Loaded labels: {len(df_labels)} samples")

# Check key uniqueness on each input BEFORE merging: a repeated protein_id on
# either side multiplies rows in the join, and after the merge it is no longer
# possible to tell which table was responsible.
for table_name, table in (("sequences", df_sequences), ("labels", df_labels)):
    n_input_duplicates = int(table['protein_id'].duplicated().sum())
    if n_input_duplicates:
        print(f"WARNING: {n_input_duplicates} duplicate protein IDs in the {table_name} table")

# Merge on protein_id (pandas default: inner join, so unmatched rows are dropped).
df_data = df_sequences.merge(df_labels, on='protein_id')

# Surface rows that the inner join discarded instead of losing them silently.
n_sequences_unlabelled = int((~df_sequences['protein_id'].isin(df_labels['protein_id'])).sum())
n_labels_without_sequence = int((~df_labels['protein_id'].isin(df_sequences['protein_id'])).sum())
if n_sequences_unlabelled or n_labels_without_sequence:
    print(
        f"WARNING: merge dropped {n_sequences_unlabelled} sequences without a label "
        f"and {n_labels_without_sequence} labels without a sequence"
    )

print(f"\nüìä Merged dataset:")
print(f"  Total samples: {len(df_data)}")
print(f"  Features: {df_data.columns.tolist()}")

# Display first few rows
print("\nüìã First 5 rows:")
print(df_data.head())

# Check data types
print("\nüîç Data types:")
print(df_data.dtypes)

# Duplicates remaining in the merged frame (each one would be split/embedded twice).
n_duplicates = df_data['protein_id'].duplicated().sum()
print(f"\nüîç Duplicate protein IDs: {n_duplicates}")


In [None]:
"""
Create stratified train/validation/test splits
"""

print("=" * 80)
print("STEP 2: CREATING DATA SPLITS")
print("=" * 80)

print(f"Split ratios:")
print(f"  Train: {TRAIN_RATIO*100:.0f}%")
print(f"  Val:   {VAL_RATIO*100:.0f}%")
print(f"  Test:  {TEST_RATIO*100:.0f}%")

# Canonical display order. Used for the statistics loop and for BOTH plots so
# that bar positions and colours always refer to the same split.
split_order = ['train', 'val', 'test']

# Create splits with stratification on labels
df_with_splits = make_splits(
    df_data,
    train_ratio=TRAIN_RATIO,
    val_ratio=VAL_RATIO,
    test_ratio=TEST_RATIO,
    stratify_column='label'
)

# Display split statistics
print(f"\nüìä Split statistics:")
for split_name in split_order:
    split_data = df_with_splits[df_with_splits['split'] == split_name]
    n_samples = len(split_data)
    n_positive = int((split_data['label'] == 1).sum())
    n_negative = int((split_data['label'] == 0).sum())

    # An empty split (tiny dataset or a zero ratio) must not divide by zero.
    pct_negative = n_negative / n_samples * 100 if n_samples else 0.0
    pct_positive = n_positive / n_samples * 100 if n_samples else 0.0

    print(f"\n{split_name.upper()}:")
    print(f"  Total: {n_samples}")
    print(f"  Label 0 (Normal): {n_negative} ({pct_negative:.1f}%)")
    print(f"  Label 1 (Misfolding): {n_positive} ({pct_positive:.1f}%)")

# Visualize splits
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Split sizes. value_counts() orders by frequency, so reindex to the canonical
# order; otherwise the positional colour list can land on the wrong split.
split_counts = df_with_splits['split'].value_counts().reindex(split_order, fill_value=0)
axes[0].bar(split_counts.index, split_counts.values, color=['steelblue', 'orange', 'green'])
axes[0].set_xlabel('Split')
axes[0].set_ylabel('Number of Samples')
axes[0].set_title('Dataset Split Sizes')
axes[0].grid(alpha=0.3, axis='y')

# Label distribution per split. groupby sorts alphabetically (test, train, val),
# so reindex here as well to keep both panels in the same order.
split_label_counts = (
    df_with_splits.groupby(['split', 'label']).size()
    .unstack(fill_value=0)
    .reindex(split_order, fill_value=0)
)
split_label_counts.plot(kind='bar', stacked=True, ax=axes[1], color=['lightgreen', 'salmon'])
axes[1].set_xlabel('Split')
axes[1].set_ylabel('Number of Samples')
axes[1].set_title('Label Distribution per Split')
axes[1].legend(['Normal (0)', 'Misfolding (1)'])
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=0)
axes[1].grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Save splits (column subsets are passed to the utils helper as three tables).
save_core_tables(
    df_sequences=df_with_splits[['protein_id', 'description', 'sequence', 'length', 'species']],
    df_labels=df_with_splits[['protein_id', 'label', 'source']],
    df_splits=df_with_splits[['protein_id', 'split']]
)

print(f"\n‚úÖ Saved splits to: {SPLITS_CSV}")


In [None]:
"""
Generate ProtBERT embeddings for TRAINING set
This may take 10-30 minutes depending on dataset size and device
"""

print("=" * 80)
print("STEP 3: GENERATING PROTBERT EMBEDDINGS (TRAIN)")
print("=" * 80)

# Get training sequences
train_data = df_with_splits[df_with_splits['split'] == 'train']
train_sequences = train_data['sequence'].tolist()

print(f"üî¨ Generating embeddings for {len(train_sequences)} training sequences...")
print(f"‚è±Ô∏è  This may take 10-30 minutes...")
print(f"üíª Using device: {DEVICE}")

# Generate embeddings
cache_path = EMBEDDINGS_DIR / "protbert_train.npy"

train_embeddings = compute_protbert_embeddings(
    sequences=train_sequences,
    batch_size=8,  # Adjust based on your GPU memory
    pooling='mean',
    device=DEVICE,
    cache_path=cache_path,
    use_cache=True  # Will load from cache if exists
)

# With use_cache=True, a .npy left over from a run with different splits would
# be loaded as-is. Embeddings are matched to labels by row position, so a count
# mismatch must stop the notebook rather than silently corrupt training data.
if len(train_embeddings) != len(train_sequences):
    raise ValueError(
        f"Embedding/sequence count mismatch for train split: "
        f"{len(train_embeddings)} embeddings vs {len(train_sequences)} sequences. "
        f"The cache at {cache_path} is probably stale; delete it and re-run this cell."
    )

print(f"\n‚úÖ Training embeddings generated!")
print(f"   Shape: {train_embeddings.shape}")
print(f"   Embedding dimension: {train_embeddings.shape[1]}")
print(f"   Memory: {train_embeddings.nbytes / 1024**2:.1f} MB")

# Visualize per-sequence embedding statistics (mean and std across the
# embedding axis) as a quick sanity check on the generated vectors.
fig, (ax_mean, ax_std) = plt.subplots(1, 2, figsize=(12, 4))

ax_mean.hist(train_embeddings.mean(axis=1), bins=50, color='steelblue', edgecolor='black')
ax_mean.set_xlabel('Mean Embedding Value')
ax_mean.set_ylabel('Frequency')
ax_mean.set_title('Distribution of Mean Embedding Values')
ax_mean.grid(alpha=0.3)

ax_std.hist(train_embeddings.std(axis=1), bins=50, color='orange', edgecolor='black')
ax_std.set_xlabel('Std Dev of Embedding')
ax_std.set_ylabel('Frequency')
ax_std.set_title('Distribution of Embedding Std Deviations')
ax_std.grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
"""
Generate ProtBERT embeddings for VALIDATION and TEST sets
"""

print("=" * 80)
print("STEP 4: GENERATING PROTBERT EMBEDDINGS (VAL & TEST)")
print("=" * 80)


def _embed_split(split_name, display_name, cache_path):
    """Select one split from df_with_splits and compute its ProtBERT embeddings.

    Args:
        split_name: Value of the 'split' column to select ('val' or 'test').
        display_name: Human-readable name used in the progress messages.
        cache_path: .npy path handed to compute_protbert_embeddings as its cache.

    Returns:
        Tuple (split_df, sequences, embeddings) for the selected split.

    Raises:
        ValueError: If the number of embeddings differs from the number of
            sequences, which indicates a stale cache file.
    """
    print(f"\nüî¨ Generating {display_name.upper()} embeddings...")
    split_df = df_with_splits[df_with_splits['split'] == split_name]
    sequences = split_df['sequence'].tolist()

    embeddings = compute_protbert_embeddings(
        sequences=sequences,
        batch_size=8,
        pooling='mean',
        device=DEVICE,
        cache_path=cache_path,
        use_cache=True
    )

    # Embeddings are matched to labels by row position; a cached file from a
    # run with different splits must not be accepted silently.
    if len(embeddings) != len(sequences):
        raise ValueError(
            f"Embedding/sequence count mismatch for {split_name} split: "
            f"{len(embeddings)} embeddings vs {len(sequences)} sequences. "
            f"The cache at {cache_path} is probably stale; delete it and re-run this cell."
        )

    print(f"‚úÖ {display_name} embeddings: {embeddings.shape}")
    return split_df, sequences, embeddings


# Validation set
cache_path_val = EMBEDDINGS_DIR / "protbert_val.npy"
val_data, val_sequences, val_embeddings = _embed_split('val', 'Validation', cache_path_val)

# Test set
cache_path_test = EMBEDDINGS_DIR / "protbert_test.npy"
test_data, test_sequences, test_embeddings = _embed_split('test', 'Test', cache_path_test)

# train_embeddings comes from the previous (training-set) cell.
print(f"\nüìä Embedding Summary:")
print(f"  Train: {train_embeddings.shape}")
print(f"  Val:   {val_embeddings.shape}")
print(f"  Test:  {test_embeddings.shape}")
print(f"  Total memory: {(train_embeddings.nbytes + val_embeddings.nbytes + test_embeddings.nbytes) / 1024**2:.1f} MB")


In [None]:
"""
Create integer-encoded sequences for CNN-BiLSTM and Transformer models
"""

print("=" * 80)
print("STEP 5: INTEGER ENCODING")
print("=" * 80)

# Cap the encoded length at the 95th percentile of all sequence lengths so a
# handful of extreme outliers do not dictate the padded width.
all_lengths = df_with_splits['length'].values
max_length = int(np.percentile(all_lengths, 95))
print(f"Max sequence length (95th percentile): {max_length}")

# Encode every split with identical settings (post-padding, post-truncation).
encoded_by_split = {}
for heading, short_name, split_sequences in (
    ("TRAIN", "Train", train_sequences),
    ("VALIDATION", "Val", val_sequences),
    ("TEST", "Test", test_sequences),
):
    print(f"\nüî¢ Encoding {heading} sequences...")
    encoded_by_split[short_name] = encode_sequences_to_int(
        sequences=split_sequences,
        vocab=AMINO_ACID_VOCAB,
        max_length=max_length,
        padding='post',
        truncating='post'
    )
    print(f"‚úÖ {short_name} encoded shape: {encoded_by_split[short_name].shape}")

train_encoded = encoded_by_split["Train"]
val_encoded = encoded_by_split["Val"]
test_encoded = encoded_by_split["Test"]

# Attention masks are derived from the encoded arrays, one per split.
print("\nüé≠ Creating attention masks...")
train_masks, val_masks, test_masks = (
    create_attention_masks(encoded)
    for encoded in (train_encoded, val_encoded, test_encoded)
)

print(f"‚úÖ Masks created:")
for tag, mask in (("Train: ", train_masks), ("Val:   ", val_masks), ("Test:  ", test_masks)):
    print(f"  {tag}{mask.shape}")

# Visualize encoding: one example row plus the overall token histogram.
fig, (ax_example, ax_tokens) = plt.subplots(1, 2, figsize=(14, 4))

# Left panel: first 100 positions of the first training sequence as a 1-row image.
first_row_prefix = train_encoded[0][:100]
image = ax_example.imshow(first_row_prefix[np.newaxis, :], cmap='viridis', aspect='auto')
fig.colorbar(image, ax=ax_example, label='Token ID')
ax_example.set_xlabel('Position')
ax_example.set_ylabel('Sample')
ax_example.set_title('Example: First 100 Positions of Encoded Sequence')

# Right panel: how often each token ID occurs in the training set (log scale).
token_ids, token_counts = np.unique(train_encoded, return_counts=True)
ax_tokens.bar(token_ids, token_counts, color='steelblue', edgecolor='black')
ax_tokens.set_xlabel('Token ID')
ax_tokens.set_ylabel('Frequency')
ax_tokens.set_title('Token Distribution in Training Set')
ax_tokens.set_yscale('log')
ax_tokens.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


In [None]:
"""
Compute hand-crafted sequence features
These can be used alongside deep learning features
"""

print("=" * 80)
print("STEP 6: COMPUTING HAND-CRAFTED FEATURES")
print("=" * 80)

# Compute features for all splits
print("\nüî¨ Computing features for all sequences...")

train_features = compute_sequence_features(train_sequences)
val_features = compute_sequence_features(val_sequences)
test_features = compute_sequence_features(test_sequences)

print(f"‚úÖ Feature shapes:")
print(f"  Train: {train_features.shape}")
print(f"  Val:   {val_features.shape}")
print(f"  Test:  {test_features.shape}")

# Display feature statistics
print(f"\nüìä Training set feature statistics:")
print(train_features.describe())

# Features are paired with labels by row position, so the two tables must have
# the same number of rows before any per-label plot is meaningful.
if len(train_features) != len(train_data):
    raise ValueError(
        f"Feature/label row mismatch for train split: "
        f"{len(train_features)} feature rows vs {len(train_data)} labelled rows"
    )

# Visualize features
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

features_to_plot = ['length', 'charged_pct', 'hydrophobic_pct', 'aromatic_pct']

# Hoisted out of the loop: the label vector does not change per feature, so
# there is no need to copy the whole training frame on every iteration.
train_labels = train_data['label'].to_numpy()

for idx, feature in enumerate(features_to_plot):
    ax = axes[idx // 2, idx % 2]
    feature_values = train_features[feature].to_numpy()

    # Overlay one histogram per class.
    for label in [0, 1]:
        ax.hist(feature_values[train_labels == label], bins=30, alpha=0.6,
                label=f"Label {label} ({'Normal' if label == 0 else 'Misfolding'})",
                edgecolor='black')

    ax.set_xlabel(feature.replace('_', ' ').title())
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of {feature.replace("_", " ").title()}')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Save features (row order follows the corresponding *_sequences list).
train_features.to_csv(EMBEDDINGS_DIR / 'features_train.csv', index=False)
val_features.to_csv(EMBEDDINGS_DIR / 'features_val.csv', index=False)
test_features.to_csv(EMBEDDINGS_DIR / 'features_test.csv', index=False)

print(f"\n‚úÖ Saved features to {EMBEDDINGS_DIR}")


In [None]:
"""
Save all preprocessed arrays and embeddings
"""

print("=" * 80)
print("STEP 7: SAVING PREPROCESSED DATA")
print("=" * 80)

# np.save does not create missing parent directories; make sure the target exists.
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)

# Everything written below is grouped per split: (encoded, masks, source frame).
split_outputs = {
    'train': (train_encoded, train_masks, train_data),
    'val': (val_encoded, val_masks, val_data),
    'test': (test_encoded, test_masks, test_data),
}

# The saved files carry no key column, so they can only be matched by row
# position. Refuse to write them if the row counts disagree (e.g. stale kernel state).
for split_name, (encoded, masks, split_df) in split_outputs.items():
    if not (len(encoded) == len(masks) == len(split_df)):
        raise ValueError(
            f"Row count mismatch in {split_name} split: "
            f"encoded={len(encoded)}, masks={len(masks)}, labels={len(split_df)}"
        )

# Save encoded sequences
print("\nüíæ Saving encoded sequences...")
for split_name, (encoded, _, _) in split_outputs.items():
    np.save(EMBEDDINGS_DIR / f'encoded_{split_name}.npy', encoded)

# Save attention masks
print("üíæ Saving attention masks...")
for split_name, (_, masks, _) in split_outputs.items():
    np.save(EMBEDDINGS_DIR / f'masks_{split_name}.npy', masks)

# Save labels
print("üíæ Saving labels...")
for split_name, (_, _, split_df) in split_outputs.items():
    np.save(EMBEDDINGS_DIR / f'labels_{split_name}.npy', split_df['label'].values)

# Save protein IDs for reference (single 'protein_id' column, no index).
print("üíæ Saving protein IDs...")
for split_name, (_, _, split_df) in split_outputs.items():
    split_df[['protein_id']].to_csv(EMBEDDINGS_DIR / f'protein_ids_{split_name}.csv', index=False)

print("\n‚úÖ All data saved successfully!")

# List all saved files. glob('*') also yields sub-directories, which are not
# data files and would be reported with a meaningless size, so skip them.
print(f"\nüìÅ Saved files in {EMBEDDINGS_DIR}:")
saved_files = sorted(path for path in EMBEDDINGS_DIR.glob('*') if path.is_file())
for f in saved_files:
    size_mb = f.stat().st_size / 1024**2
    print(f"  - {f.name} ({size_mb:.1f} MB)")


In [None]:
"""
Summary and next steps
"""

print("=" * 80)
print("‚úÖ PREPROCESSING COMPLETE!")
print("=" * 80)

print("\nüìä Dataset Summary:")
print(f"  Train samples: {len(train_data)}")
print(f"  Val samples: {len(val_data)}")
print(f"  Test samples: {len(test_data)}")
print(f"  Total samples: {len(df_with_splits)}")

# Report the embedding width actually produced rather than a hard-coded figure,
# so the summary stays correct if the model or pooling changes.
embedding_dim = train_embeddings.shape[1]

print("\nüì¶ Generated Data:")
print(f"  1. ProtBERT embeddings ({embedding_dim}-dim)")
print("  2. Integer-encoded sequences")
print("  3. Attention masks")
print("  4. Hand-crafted features")
print("  5. Labels")

# Sum regular files only; glob('*') also returns sub-directories, whose
# st_size is a filesystem detail and not preprocessed data.
print("\nüíæ Storage Summary:")
total_size = sum(f.stat().st_size for f in EMBEDDINGS_DIR.glob('*') if f.is_file())
print(f"  Total preprocessed data: {total_size / 1024**2:.1f} MB")

print("\nüéØ Next Steps:")
print("  ‚Üí Run notebook 03_training.ipynb to:")
print("     - Train Model A (ProtBERT + SVM)")
print("     - Train Model B (ProtBERT Fine-tune)")
print("     - Train Model C (CNN-BiLSTM)")
print("     - Train Model D (Lite Transformer)")
print("     - Generate predictions for stacking")

print("\n" + "=" * 80)
