In [None]:
import os

import numpy as np
import pandas as pd

# Load the tab-separated variant table into a DataFrame
df = pd.read_csv('~/brca_test_data/output/variants_output.tsv', sep='\t')

print("🔍 SEARCHING FOR VUS VARIANTS WITH HIGH SpliceAI SCORES")
print("=" * 70)

# Collect every column whose name mentions SpliceAI (case-insensitive match)
spliceai_columns = [name for name in df.columns if 'spliceai' in name.lower()]
print(f"Available SpliceAI columns: {spliceai_columns}")

# Collect columns whose name contains one of the classification-related keywords
classification_keywords = ('classification', 'significance', 'lovd')
vus_columns = [
    name
    for name in df.columns
    if any(keyword in name.lower() for keyword in classification_keywords)
]
print(f"\nClassification columns: {vus_columns[:5]}...")  # Show first 5

# Report the size of the loaded table
print(f"\nTotal variants in dataset: {df.shape[0]}")

# Define VUS criteria - variants that are uncertain/unclassified
def is_vus(row):
    """Return True when a variant row looks like a VUS or is unclassified.

    ``row`` must offer a mapping-style ``get(key, default)`` (a pandas row
    Series or a plain dict). Three fields are read, stringified and
    lower-cased before matching; a missing field is treated as ``''``:

    * ``clinical_significance_enigma``
    * ``classification_lovd``
    * ``iarc_class_exlovd``

    The row counts as VUS if the LOVD value contains ``'vus'``, if the ENIGMA
    or IARC value contains ``'uncertain'``, or if all three values are exactly
    ``'-'``.
    """
    def _normalised(field):
        # Stringify first so non-string cell values can be matched as text.
        return str(row.get(field, '')).lower()

    enigma_call = _normalised('clinical_significance_enigma')
    lovd_call = _normalised('classification_lovd')
    iarc_call = _normalised('iarc_class_exlovd')

    if 'vus' in lovd_call:
        return True
    if 'uncertain' in enigma_call or 'uncertain' in iarc_call:
        return True
    if '3 - uncertain' in iarc_call:
        return True

    # No explicit VUS wording: fall back to "all three sources are '-'".
    return enigma_call == '-' and lovd_call == '-' and iarc_call == '-'

# Apply VUS filter: flag every row, then keep the flagged ones.
df['is_vus'] = df.apply(is_vus, axis=1)
# Take an explicit copy of the filtered rows. The SpliceAI columns of this
# subset are converted to numeric further down; writing into a boolean-filtered
# slice of `df` raises pandas' SettingWithCopyWarning.
vus_variants = df[df['is_vus']].copy()

print(f"VUS variants found: {len(vus_variants)}")

# Now check SpliceAI scores
# SpliceAI reports one delta score per splice-site effect
spliceai_score_columns = [
    'ds_ag_spliceai',  # acceptor gain
    'ds_al_spliceai',  # acceptor loss
    'ds_dg_spliceai',  # donor gain
    'ds_dl_spliceai',  # donor loss
]

# Check which SpliceAI columns actually exist
existing_spliceai_cols = [col for col in spliceai_score_columns if col in df.columns]
print(f"\nExisting SpliceAI score columns: {existing_spliceai_cols}")

# Report VUS variants whose SpliceAI scores exceed a threshold: print the top
# hits, save the full table as CSV and print summary statistics. When none of
# the expected score columns exist, list the splice-related columns instead.
if existing_spliceai_cols:
    # Variants qualify when any SpliceAI score is strictly above this value.
    spliceai_threshold = 0.2

    # Convert SpliceAI scores to numeric; non-numeric values become NaN.
    # `assign` builds a new frame, so nothing is written into a slice of `df`.
    vus_variants = vus_variants.assign(**{
        col: pd.to_numeric(vus_variants[col], errors='coerce')
        for col in existing_spliceai_cols
    })

    # Row-wise "any score above threshold". NaN compares as False, so rows
    # without usable scores never qualify.
    high_spliceai_mask = (vus_variants[existing_spliceai_cols] > spliceai_threshold).any(axis=1)
    high_spliceai_vus = vus_variants[high_spliceai_mask]

    print(f"\nVUS variants with SpliceAI score > {spliceai_threshold}: {len(high_spliceai_vus)}")

    if not high_spliceai_vus.empty:
        # Create summary table
        summary_cols = ['cdna', 'genomic_hgvs_38', 'chr', 'pos', 'ref', 'alt', 'protein',
                        'clinical_significance_enigma', 'classification_lovd', 'iarc_class_exlovd'] + existing_spliceai_cols

        # Filter to only include columns that exist
        available_cols = [col for col in summary_cols if col in high_spliceai_vus.columns]

        summary_table = high_spliceai_vus[available_cols].copy()

        # Add max SpliceAI score column (row-wise maximum, NaN values skipped)
        summary_table['max_spliceai_score'] = high_spliceai_vus[existing_spliceai_cols].max(axis=1)

        # Sort by max SpliceAI score (descending)
        summary_table = summary_table.sort_values('max_spliceai_score', ascending=False)

        print("\nTop 20 VUS variants with highest SpliceAI scores:")
        print("-" * 80)

        # Display top 20 results; `row.get` falls back to 'N/A' for any
        # summary column that is absent from the dataset.
        display_table = summary_table.head(20)

        for _, row in display_table.iterrows():
            print(f"Variant: {row.get('cdna', 'N/A')}")
            print(f"  Genomic: {row.get('genomic_hgvs_38', 'N/A')}")
            print(f"  Position: chr{row.get('chr', 'N/A')}:{row.get('pos', 'N/A')}")
            print(f"  Change: {row.get('ref', 'N/A')}→{row.get('alt', 'N/A')}")
            print(f"  Protein: {row.get('protein', 'N/A')}")
            print(f"  ENIGMA: {row.get('clinical_significance_enigma', 'N/A')}")
            print(f"  LOVD: {row.get('classification_lovd', 'N/A')}")
            print(f"  IARC: {row.get('iarc_class_exlovd', 'N/A')}")
            # The column was added above, so it is always present and numeric.
            print(f"  Max SpliceAI Score: {row['max_spliceai_score']:.3f}")

            # Show individual SpliceAI scores, skipping missing ones
            spliceai_scores = [
                f"{col}: {row[col]:.3f}"
                for col in existing_spliceai_cols
                if pd.notna(row.get(col, np.nan))
            ]
            if spliceai_scores:
                print(f"  SpliceAI Scores: {', '.join(spliceai_scores)}")
            print()

        # Save results. `to_csv` does not create missing directories, so make
        # sure the output folder exists first.
        output_path = './outputs/vus_high_spliceai_variants.csv'
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        summary_table.to_csv(output_path, index=False)
        print(f"💾 Results saved to: {output_path}")

        # Summary statistics
        print("\n📊 SUMMARY STATISTICS:")
        print(f"Total VUS variants with SpliceAI > {spliceai_threshold}: {len(high_spliceai_vus)}")
        print(f"Highest SpliceAI score: {summary_table['max_spliceai_score'].max():.3f}")
        print(f"Average SpliceAI score: {summary_table['max_spliceai_score'].mean():.3f}")

        # Chromosome distribution (only when the dataset has a 'chr' column,
        # like every other optional column above)
        if 'chr' in high_spliceai_vus.columns:
            gene_by_chromosome = {'17': 'BRCA1', '13': 'BRCA2'}
            chr_dist = high_spliceai_vus['chr'].value_counts()
            print("\nChromosome distribution:")
            for chr_num, count in chr_dist.items():
                # Normalise the label so 17, '17', 17.0 and 'chr17' all map
                # to the same gene regardless of how pandas parsed the column.
                chr_label = str(chr_num).strip().lower()
                if chr_label.startswith('chr'):
                    chr_label = chr_label[3:]
                if chr_label.endswith('.0'):
                    chr_label = chr_label[:-2]
                gene = gene_by_chromosome.get(chr_label, 'Unknown')
                print(f"  Chr {chr_num} ({gene}): {count}")

    else:
        print(f"No VUS variants found with SpliceAI scores > {spliceai_threshold}")

else:
    print("No SpliceAI score columns found in the dataset")
    print("Available columns containing 'splice':")
    splice_cols = [col for col in df.columns if 'splice' in col.lower()]
    for col in splice_cols[:10]:  # Show first 10
        print(f"  {col}")