# oBDS SNOMED Concept Quality Check

This notebook performs quality validation on the unique SNOMED concepts extracted from the oBDS mapping project.

In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import os

# Display settings, applied in one pass:
#   - display.max_columns = None -> no cap on the number of columns shown
#   - display.max_rows    = 100  -> at most 100 rows are rendered
#   - display.width       = None -> no fixed character width is imposed
for _display_option, _display_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(_display_option, _display_value)

## 1. Load and Compare Concept Lists from Different Sources

In [None]:
# Load unique concepts from the notebook
print("Loading unique concepts from Jupyter notebook...")

# Hand-maintained snapshot of the concept IDs produced by the mapping notebook.
# It has to be pasted in (or extracted from that notebook) and is truncated for
# space, so it is NOT the complete list.
# IDs are kept as strings so that long identifiers are never converted to numbers.
# A list literal cannot fail at runtime, so no try/except is needed here.
#
# NOTE(review): '900000000000465024' and '911753521000003968' are both exact
# multiples of 128, which is the float64 spacing at this magnitude. That suggests
# these two IDs were rounded through a floating-point column somewhere upstream.
# Confirm them against the original source.
notebook_concepts = [
    '439401001', '900000000000465024', '371480007', '432213005',
    '373793006', '58147004', '165197003', '363680008', '423827005',
    '363679005', '16310003', '76145000', '86481000', '29240004',
    '259672007', '702666009', '714797009', '261665006', '7771000',
    '24028007', '51440002', '255561001', '396360001', '275904003',
    '439272007', '373372005', '263933003', '384812005', '263843001',
    '263918006', '911753521000003968', '1155705000', '1155708003',
    '1286893008', '1155707008', '1268929003', '385432009', '444025001',
    '443527007', '444411008', '1264491009', '404684003', '278201002',
    '410672004', '734841007', '263486008', '445200009', '262061000',
    '386053000', '371497001', '385421009', '39607008', '272673000',
]  # truncated for space

print(f"Loaded {len(notebook_concepts)} concepts from notebook")

In [None]:
# Load concepts from R script (hardcoded list)
def parse_r_unique_ids(r_source):
    """Extract the quoted strings of the first ``uniqueIDs`` vector in R source text.

    Recognises ``uniqueIDs <- c(...)`` and ``uniqueIDs = c(...)`` with arbitrary
    whitespace around the operator, and both double- and single-quoted elements.

    Parameters
    ----------
    r_source : str
        Full text of the R script.

    Returns
    -------
    list of str or None
        The vector elements in file order, exactly as written (no stripping),
        or ``None`` when no ``uniqueIDs`` assignment is present.

    Notes
    -----
    The vector is assumed to end at the first ``)`` after ``c(``; a closing
    parenthesis inside an element or a comment would truncate the match.
    """
    vector_match = re.search(
        r'uniqueIDs\s*(?:<-|=)\s*c\s*\((.*?)\)', r_source, re.DOTALL
    )
    if vector_match is None:
        return None
    # Each findall hit is a (double_quoted, single_quoted) pair with exactly one side filled.
    quoted = re.findall(r'"([^"]+)"|\'([^\']+)\'', vector_match.group(1))
    return [double_q or single_q for double_q, single_q in quoted]


print("Loading unique concepts from R script...")
r_script_path = "rawData/oBDS/Krippendorff Alpha/Kripp_Taxonomy.R"
try:
    with open(r_script_path, 'r', encoding='utf-8') as f:
        r_content = f.read()
except (OSError, UnicodeError) as e:
    # Only reading/decoding the file can fail; fall back to an empty list so
    # the notebook can continue with the other sources.
    print(f"Error loading R script concepts: {e}")
    r_concepts = []
else:
    r_concepts = parse_r_unique_ids(r_content)
    if r_concepts is None:
        print("Could not find uniqueIDs in R script")
        r_concepts = []
    else:
        print(f"Loaded {len(r_concepts)} concepts from R script")

In [None]:
# Load concepts from distance matrix
print("Loading concepts from distance matrix...")
distance_matrix_path = "rawData/oBDS/Krippendorff Alpha/oBDS_distance_matrix.csv"
try:
    # Only the header is of interest, so a five-row preview is read.
    # index_col=0 turns the first CSV column into the row index, which keeps it
    # out of the column labels collected below.
    df_distance = pd.read_csv(distance_matrix_path, nrows=5, index_col=0)
    matrix_concepts = [*df_distance.columns]
    print(f"Loaded {len(matrix_concepts)} concepts from distance matrix")
except Exception as e:
    # Any failure falls back to an empty list so later cells still run
    print(f"Error loading distance matrix concepts: {e}")
    matrix_concepts = []

In [None]:
# Load concepts from main Excel file (sample - you may need to adjust sheet names)
def extract_sctids(df_sheet):
    """Collect SNOMED-looking IDs from the SCTID columns of one worksheet.

    A column is considered when its name contains ``'sct'`` (case-insensitive),
    which also covers names containing ``'SCTID'``.

    Parameters
    ----------
    df_sheet : pandas.DataFrame
        One worksheet as loaded by pandas.

    Returns
    -------
    set of str
        Every cell value from those columns that consists of 6-18 ASCII
        digits, after surrounding whitespace is removed.

    Notes
    -----
    A numeric column that contains blanks is held by pandas as float, so its
    values render as e.g. ``'439401001.0'``. The trailing ``.0`` is removed
    before matching so such IDs are not silently dropped.
    """
    found = set()
    for col in df_sheet.columns:
        if 'sct' not in str(col).lower():
            continue
        for raw_value in df_sheet[col].dropna().astype(str):
            candidate = re.sub(r'\.0+$', '', raw_value.strip())
            # fullmatch + [0-9]: no trailing newline, no non-ASCII digits
            if re.fullmatch(r'[0-9]{6,18}', candidate):
                found.add(candidate)
    return found


print("Loading concepts from main Excel file...")
excel_path = "rawData/oBDS/oBDS_Module_alle_neu.xlsx"

# Get all unique SNOMED codes from all sheets and mapper columns
all_excel_concepts = set()
try:
    # Open the workbook once and close it deterministically when done
    with pd.ExcelFile(excel_path) as excel_file:
        for sheet_name in excel_file.sheet_names[:3]:  # Check first 3 sheets as sample
            try:
                # dtype=str reads every cell as text, so IDs are never coerced to float
                df_sheet = pd.read_excel(excel_file, sheet_name=sheet_name, dtype=str)
                all_excel_concepts.update(extract_sctids(df_sheet))
            except Exception as e:
                # A broken sheet must not abort the remaining sheets
                print(f"Error processing sheet {sheet_name}: {e}")

    # Sorted so the list order is reproducible between runs
    excel_concepts = sorted(all_excel_concepts)
    print(f"Loaded {len(excel_concepts)} concepts from Excel file (sample from first 3 sheets)")

except Exception as e:
    print(f"Error loading Excel concepts: {e}")
    excel_concepts = []

## 2. Concept Format Validation

In [None]:
def validate_snomed_format(concept_id):
    """
    Check whether a value has the surface format of a SNOMED CT concept ID.

    Only the textual shape is checked: the ID is not looked up in any SNOMED
    release and its check digit is not verified.

    Parameters
    ----------
    concept_id : str
        Candidate identifier. Leading/trailing whitespace is ignored.

    Returns
    -------
    tuple of (bool, str)
        ``(True, "Valid format")`` when the stripped value consists of 6-18
        ASCII digits and does not start with ``0``; otherwise ``(False, reason)``
        where ``reason`` names the first rule that failed.
    """
    if not isinstance(concept_id, str):
        return False, "Not a string"

    # Remove whitespace
    concept_id = concept_id.strip()

    # ASCII digits only. str.isdigit() would also accept characters such as
    # full-width or superscript digits, which are not valid in an identifier.
    if not re.fullmatch(r'[0-9]+', concept_id):
        return False, "Contains non-digit characters"

    # Check length (SNOMED IDs are typically 6-18 digits)
    if len(concept_id) < 6:
        return False, "Too short (< 6 digits)"
    if len(concept_id) > 18:
        return False, "Too long (> 18 digits)"

    # Check for leading zeros (unusual in SNOMED)
    if concept_id.startswith('0'):
        return False, "Has leading zeros"

    return True, "Valid format"

In [None]:
# Gather the loaded concept lists under one mapping: source label -> list of IDs
all_sources = dict(
    R_Script=r_concepts,
    Distance_Matrix=matrix_concepts,
    Excel_Sample=excel_concepts,
)

print("=== CONCEPT SOURCE COMPARISON ===")
for source_name, concepts in all_sources.items():
    print(f"{source_name}: {len(concepts)} concepts")

# The R script list drives the detailed validation; the distance-matrix
# header is the fallback when the R script list is empty.
if r_concepts:
    primary_concepts = r_concepts
else:
    primary_concepts = matrix_concepts
print(f"\nUsing {len(primary_concepts)} concepts from primary source for detailed validation")

In [None]:
# Validate all concepts in primary list
print("=== CONCEPT FORMAT VALIDATION ===")

# One record per concept: the ID as loaded, the verdict, the failure reason
# (None when valid) and the length of its string form.
validation_results = []
for concept in primary_concepts:
    is_valid, reason = validate_snomed_format(concept)
    validation_results.append({
        'concept_id': concept,
        'is_valid': is_valid,
        'issue': reason if not is_valid else None,
        'length': len(str(concept))
    })

# Explicit columns keep the schema intact when no concept could be loaded;
# otherwise the frame would have no 'is_valid' column and every later lookup
# would raise KeyError. The bool cast keeps `~` and `.sum()` well-defined on
# an empty frame too.
df_validation = pd.DataFrame(
    validation_results,
    columns=['concept_id', 'is_valid', 'issue', 'length'],
).astype({'is_valid': bool})

# Summary statistics
total_concepts = len(df_validation)
valid_concepts = int(df_validation['is_valid'].sum())
invalid_concepts = total_concepts - valid_concepts

print(f"Total concepts: {total_concepts}")
if total_concepts:
    print(f"Valid format: {valid_concepts} ({valid_concepts/total_concepts*100:.1f}%)")
    print(f"Invalid format: {invalid_concepts} ({invalid_concepts/total_concepts*100:.1f}%)")
else:
    # Percentages are undefined for an empty list (division by zero)
    print("No concepts available to validate - check that at least one source loaded")

if invalid_concepts > 0:
    print("\nInvalid concepts:")
    invalid_df = df_validation[~df_validation['is_valid']]
    print(invalid_df[['concept_id', 'issue', 'length']].to_string(index=False))

## 3. Duplicate Detection and Data Quality Issues

In [None]:
print("=== DUPLICATE DETECTION ===")

# Exact duplicates: every ID that occurs more than once, with its occurrence count
concept_counts = Counter(primary_concepts)
duplicates = {concept: count for concept, count in concept_counts.items() if count > 1}

if not duplicates:
    print("No exact duplicates found")
else:
    print(f"Found {len(duplicates)} duplicate concepts:")
    for concept, count in duplicates.items():
        print(f"  {concept}: appears {count} times")

# Whitespace check: pair each ID with its stripped string form and keep the
# pairs where stripping changed something.
cleaned_concepts = [str(c).strip() for c in primary_concepts]
original_vs_cleaned = list(zip(primary_concepts, cleaned_concepts))
whitespace_issues = [pair for pair in original_vs_cleaned if pair[0] != pair[1]]

if not whitespace_issues:
    print("\nNo whitespace issues found")
else:
    print(f"\nFound {len(whitespace_issues)} concepts with whitespace issues:")
    for orig, clean in whitespace_issues[:10]:  # Show first 10
        print(f"  '{orig}' -> '{clean}'")

## 4. Special SNOMED Codes Analysis

In [None]:
print("=== SPECIAL SNOMED CODES ANALYSIS ===")

# Check for special SNOMED codes
# NOTE(review): the labels below are not verified in this notebook - confirm
# each ID/label pair against a SNOMED CT browser or release before relying on them.
special_codes = {
    '900000000000465024': 'SNOMED CT Namespace',
    '900000000000519040': 'SNOMED CT Core',
    '138875005': 'SNOMED CT Concept (root)',
    '900000000000441003': 'SNOMED CT URI',
}

# (concept, label) for every primary concept that is one of the special codes
found_special = [
    (concept, special_codes[concept])
    for concept in primary_concepts
    if concept in special_codes
]

if found_special:
    print("Found special SNOMED codes:")
    for concept, description in found_special:
        print(f"  {concept}: {description}")
else:
    print("No known special codes found")

# Check for extension concepts (non-international)
# SCTID layout, read from the right: 1 check digit, then the 2-digit partition
# identifier, then (long format only) the 7-digit namespace identifier.
# The first partition digit tells the formats apart: '0' = short format
# (International, no namespace), '1' = long format (extension namespace).
# A long-format ID therefore has at least 11 digits and a '1' at position -3.
# A substring search for '1000' does not follow this layout and misses
# IDs such as '...1000274' + '10' + check digit.
extension_concepts = []
for concept in primary_concepts:
    sctid = str(concept).strip()
    if len(sctid) > 10 and sctid[-3] == '1':
        extension_concepts.append(concept)

print(f"\nPotential extension concepts: {len(extension_concepts)}")
if extension_concepts:
    print("Sample extension concepts:")
    for concept in extension_concepts[:10]:
        print(f"  {concept}")

## 5. Length Distribution Analysis

In [None]:
print("=== CONCEPT ID LENGTH DISTRIBUTION ===")

# Analyze length distribution: character count of each ID's string form
lengths = [len(str(concept)) for concept in primary_concepts]
length_counts = Counter(lengths)

print("Length distribution:")
for length in sorted(length_counts.keys()):
    count = length_counts[length]
    percentage = count / len(primary_concepts) * 100
    print(f"  {length} digits: {count} concepts ({percentage:.1f}%)")

# Statistical summary
lengths_array = np.array(lengths)
print(f"\nLength statistics:")
if lengths_array.size == 0:
    # min()/max() raise ValueError on an empty array and mean()/std() yield
    # NaN with a warning, so report the situation instead.
    print("  No concepts available - statistics skipped")
else:
    print(f"  Mean: {lengths_array.mean():.1f}")
    print(f"  Median: {np.median(lengths_array):.1f}")
    print(f"  Min: {lengths_array.min()}")
    print(f"  Max: {lengths_array.max()}")
    print(f"  Std: {lengths_array.std():.2f}")

## 6. Cross-Source Consistency Check

In [None]:
print("=== CROSS-SOURCE CONSISTENCY ===")

# Pairwise comparison of the concept sets of all sources
if len(all_sources) <= 1:
    print("Only one source available - cannot compare")
else:
    source_names = list(all_sources.keys())

    # Every unordered pair once: each source against the ones listed after it
    for i, source1 in enumerate(source_names):
        for source2 in source_names[i+1:]:
            set1 = set(all_sources[source1])
            set2 = set(all_sources[source2])

            intersection = set1.intersection(set2)
            union = set1.union(set2)
            only_in_1 = set1.difference(set2)
            only_in_2 = set2.difference(set1)

            # Jaccard index = |A & B| / |A | B|, defined as 0 for two empty sets
            if union:
                jaccard = len(intersection) / len(union)
            else:
                jaccard = 0

            print(f"\n{source1} vs {source2}:")
            print(f"  Common concepts: {len(intersection)}")
            print(f"  Only in {source1}: {len(only_in_1)}")
            print(f"  Only in {source2}: {len(only_in_2)}")
            print(f"  Jaccard similarity: {jaccard:.3f}")

            # One-sided differences are listed only when there are 1 to 10 of them
            if 0 < len(only_in_1) <= 10:
                print(f"  Concepts only in {source1}: {sorted(list(only_in_1))}")
            if 0 < len(only_in_2) <= 10:
                print(f"  Concepts only in {source2}: {sorted(list(only_in_2))}")

## 7. Recommendations and Action Items

In [None]:
print("=== QUALITY CHECK SUMMARY AND RECOMMENDATIONS ===")

# Parallel lists: entry k of `recommendations` is the follow-up for entry k of `issues_found`
issues_found, recommendations = [], []

# Format validation: count the rows flagged invalid (skipped if that cell was not run)
if 'df_validation' in locals():
    invalid_count = (~df_validation['is_valid']).sum()
    if invalid_count > 0:
        issues_found += [f"{invalid_count} concepts with invalid format"]
        recommendations += ["Review and correct invalid concept IDs"]

# Exact duplicates found by the duplicate-detection cell
if duplicates:
    issues_found += [f"{len(duplicates)} duplicate concepts found"]
    recommendations += ["Remove duplicate concepts from analysis"]

# IDs that changed when stripped of surrounding whitespace
if 'whitespace_issues' in locals() and whitespace_issues:
    issues_found += [f"{len(whitespace_issues)} concepts with whitespace issues"]
    recommendations += ["Clean whitespace from concept IDs"]

# Extension concepts are reported only when they exceed 10% of the primary list
if 'extension_concepts' in locals() and extension_concepts:
    extension_pct = len(extension_concepts) / len(primary_concepts) * 100
    if extension_pct > 10:
        issues_found += [f"{len(extension_concepts)} potential extension concepts ({extension_pct:.1f}%)"]
        recommendations += ["Verify extension concepts are intended and available in target SNOMED release"]

print("ISSUES FOUND:")
if not issues_found:
    print("No major issues detected!")
else:
    for i, issue in enumerate(issues_found, 1):
        print(f"{i}. {issue}")

print("\nRECOMMENDATIONS:")
if not recommendations:
    print("No specific recommendations - concept list appears clean!")
else:
    for i, rec in enumerate(recommendations, 1):
        print(f"{i}. {rec}")

# Fixed follow-up checklist, independent of the findings above
print(
    "\nNEXT STEPS:",
    "1. Validate concepts against actual SNOMED CT release files",
    "2. Check concept status (active/inactive)",
    "3. Verify concepts exist in the target SNOMED edition/version",
    "4. Create cleaned concept list for final analysis",
    sep="\n",
)

## 8. Export Clean Concept List

In [None]:
# Create a cleaned concept list
# Prefer the verdicts already computed in df_validation; fall back to
# re-validating the primary list when that frame is missing or has no
# 'is_valid' column (e.g. it was built from an empty list).
if 'df_validation' in locals() and 'is_valid' in df_validation.columns:
    valid_ids = df_validation.loc[df_validation['is_valid'].astype(bool), 'concept_id'].tolist()
else:
    valid_ids = [c for c in primary_concepts if validate_snomed_format(str(c).strip())[0]]

# The validator strips whitespace before judging an ID, so the exported value
# must be stripped as well. Otherwise ' 123456789' and '123456789' would both
# be exported as distinct "clean" concepts.
# Remove duplicates (set) and sort for a stable file order.
clean_concepts = sorted({str(c).strip() for c in valid_ids})

print(f"Clean concept list: {len(clean_concepts)} unique valid concepts")

# Save to file, one ID per line
output_file = "clean_snomed_concepts.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(f"{concept}\n" for concept in clean_concepts)

print(f"Clean concept list saved to: {output_file}")

# Also save as CSV with additional info
# is_extension follows the SCTID layout: the partition identifier is the two
# digits before the final check digit, and a leading '1' in it marks a
# long-format ID (one that carries an extension namespace). Such IDs have at
# least 11 digits.
clean_df = pd.DataFrame({
    'concept_id': clean_concepts,
    'length': [len(c) for c in clean_concepts],
    'is_extension': [len(c) > 10 and c[-3] == '1' for c in clean_concepts]
})

clean_df.to_csv("clean_snomed_concepts.csv", index=False)
print(f"Clean concept details saved to: clean_snomed_concepts.csv")