In [None]:
import pandas as pd

# Load the variants table and report every variant whose reference -> alternate
# base change matches target_ref -> target_alt, followed by summary counts.
#
# The separator must be a real tab ('\t'). The two-character string '\\t'
# (backslash + 't') is treated by pandas as a regex separator, which only the
# Python parsing engine can handle, and that engine does not support
# low_memory -- so the original call failed before any data was read.
df = pd.read_csv('~/brca_test_data/output/variants_output.tsv', sep='\t', low_memory=False)

# Base change to search for (reference allele -> alternate allele).
target_ref = 'C'
target_alt = 'A'
# Built once from the targets so every message stays in sync if they change.
base_change = f'{target_ref}→{target_alt}'

print(f'Searching for all variants with {base_change} base change')
print('=' * 60)

# Keep only rows whose 'ref' and 'alt' columns both match the target alleles.
c_to_a_variants = df[(df['ref'] == target_ref) & (df['alt'] == target_alt)]

print(f'Found {len(c_to_a_variants)} variants with {base_change} base change')
print()

# Display key information for each variant
print(f'Variants with {base_change} base change:')
print('-' * 40)

# Optional classification columns, paired with the label printed for them.
# A column that is absent from the table, or empty for a given row, is skipped.
optional_fields = (
    ('clinical_significance_enigma', 'ENIGMA classification'),
    ('classification_lovd', 'LOVD classification'),
    ('iarc_class_exlovd', 'IARC class'),
)

for idx, row in c_to_a_variants.iterrows():
    print(f'Variant: {row["cdna"]}')
    print(f'  Genomic position: chr{row["chr"]}:{row["pos"]}')
    print(f'  Genomic HGVS: {row["genomic_hgvs_38"]}')
    print(f'  Protein change: {row["protein"]}')

    # row.get() yields None for a missing column, which pd.notna() rejects.
    for column, label in optional_fields:
        value = row.get(column)
        if pd.notna(value):
            print(f'  {label}: {value}')

    print()

# Summary statistics
print('Summary Statistics:')
print('-' * 20)
print(f'Total {base_change} variants: {len(c_to_a_variants)}')

# value_counts() excludes missing values by default, so the labels iterated
# below never need an extra NaN check.

# Count by ENIGMA classification
if 'clinical_significance_enigma' in c_to_a_variants.columns:
    enigma_counts = c_to_a_variants['clinical_significance_enigma'].value_counts()
    print('\nENIGMA Classifications:')
    for classification, count in enigma_counts.items():
        print(f'  {classification}: {count}')

# Count by IARC class
if 'iarc_class_exlovd' in c_to_a_variants.columns:
    iarc_counts = c_to_a_variants['iarc_class_exlovd'].value_counts()
    print('\nIARC Classes:')
    for iarc_class, count in iarc_counts.items():
        print(f'  {iarc_class}: {count}')

# Show chromosome distribution
chr_counts = c_to_a_variants['chr'].value_counts()
print('\nChromosome distribution:')
for chromosome, count in chr_counts.items():
    print(f'  Chr {chromosome}: {count}')

In [None]:
import pandas as pd

# Load the tab-separated variants table.
df = pd.read_csv('~/brca_test_data/output/variants_output.tsv', sep='\t')

# Alleles of the base change being searched for (reference -> alternate).
target_ref, target_alt = 'C', 'A'

print(f'Searching for all variants with {target_ref}→{target_alt} base change')
print('=' * 60)

# Select the rows where both the 'ref' and the 'alt' column match the targets.
ref_matches = df['ref'].eq(target_ref)
alt_matches = df['alt'].eq(target_alt)
c_to_a_variants = df.loc[ref_matches & alt_matches]

total_matches = len(c_to_a_variants)
print(f'Found {total_matches} variants with C→A base change')
print()

# Per-variant listing, capped at the first 20 rows to keep the output readable.
print('Variants with C→A base change (first 20):')
print('-' * 40)

# Optional classification columns and the label printed in front of each one.
classification_fields = (
    ('clinical_significance_enigma', 'ENIGMA classification'),
    ('classification_lovd', 'LOVD classification'),
    ('iarc_class_exlovd', 'IARC class'),
)

for _, record in c_to_a_variants.head(20).iterrows():
    print(f'Variant: {record["cdna"]}')
    print(f'  Genomic position: chr{record["chr"]}:{record["pos"]}')
    print(f'  Genomic HGVS: {record["genomic_hgvs_38"]}')
    print(f'  Protein change: {record["protein"]}')

    # Only print a classification when the column exists and holds a value.
    for field, label in classification_fields:
        value = record.get(field)
        if pd.notna(value):
            print(f'  {label}: {value}')

    print()


def _print_counts(heading, counts):
    """Print *heading*, then one indented 'label: count' line per non-missing label."""
    print(heading)
    for label, n in counts.items():
        if pd.notna(label):
            print(f'  {label}: {n}')


# Summary section: overall total, then a breakdown per column.
print('Summary Statistics:')
print('-' * 20)
print(f'Total C→A variants: {total_matches}')

available_columns = c_to_a_variants.columns

# ENIGMA classification breakdown (only when the column is present).
if 'clinical_significance_enigma' in available_columns:
    enigma_counts = c_to_a_variants['clinical_significance_enigma'].value_counts()
    _print_counts('\nENIGMA Classifications:', enigma_counts)

# IARC class breakdown (only when the column is present).
if 'iarc_class_exlovd' in available_columns:
    iarc_counts = c_to_a_variants['iarc_class_exlovd'].value_counts()
    _print_counts('\nIARC Classes:', iarc_counts)

# Chromosome breakdown; the 'chr' column is required by the listing above.
chr_counts = c_to_a_variants['chr'].value_counts()
print('\nChromosome distribution:')
for chrom, n_variants in chr_counts.items():
    print(f'  Chr {chrom}: {n_variants}')

# Variant type breakdown (only when the column is present).
if 'variant_type' in available_columns:
    variant_type_counts = c_to_a_variants['variant_type'].value_counts()
    _print_counts('\nVariant types:', variant_type_counts)