In [8]:
# -*- coding: utf-8 -*-
"""
PhenoTagger FastAPI Usage Examples
Examples showing how to use the FastAPI endpoints
"""

import requests
import json
import time

# API base URL (change this to your server URL)
BASE_URL = "http://192.168.5.77:8111"

def test_health_check():
    """Test the health check endpoint"""
    print("=== Health Check ===")
    
    response = requests.get(f"{BASE_URL}/health")
    
    if response.status_code == 200:
        data = response.json()
        print(f"Status: {data['status']}")
        print(f"Model: {data['model_info']}")
        print("✓ API is healthy")
    else:
        print(f"✗ Health check failed: {response.status_code}")
    print()


def batch_annotation(texts, threshold=0.95, only_longest=False, abbr_recognition=True):
    """Test batch text annotation with large dataset"""

    
    data = {
        "texts": texts,
        "threshold": threshold,
        "only_longest": only_longest,
        "abbr_recognition": abbr_recognition
    }
    
    # Make request
    response = requests.post(f"{BASE_URL}/annotate/batch", json=data)
    
    if response.status_code == 200:
        result = response.json()
        print(f"Processed {result['count']} texts in {result['total_processing_time']:.3f}s")
        print(f"Average time per text: {result['total_processing_time']/result['count']:.3f}s")
        return result
    else:
        print(f"✗ Batch annotation failed: {response.status_code}")
        print(response.text)
    print()

def test_configuration():
    """Test configuration endpoint"""
    print("=== Configuration ===")
    
    response = requests.get(f"{BASE_URL}/config")
    
    if response.status_code == 200:
        config = response.json()
        print("Model Info:")
        for key, value in config['model_info'].items():
            print(f"  {key}: {value}")
        
        print("\nEnvironment:")
        env_info = config['environment']
        print(f"  Model Type: {env_info['processing_params']['model_type']}")
        print(f"  Threshold: {env_info['processing_params']['ML_Threshold']}")
        print(f"  Only Longest: {env_info['processing_params']['onlyLongest']}")
        print("✓ Configuration retrieved")
    else:
        print(f"✗ Configuration failed: {response.status_code}")
    print()

def test_parameter_variations():
    """Test different parameter combinations"""
    print("=== Parameter Variations ===")
    
    text = "The patient has developmental delay, seizures, and intellectual disability."
    
    # Test different parameter combinations
    parameter_sets = [
        {"threshold": 0.95, "only_longest": True, "abbr_recognition": False},
        {"threshold": 0.90, "only_longest": False, "abbr_recognition": True},
        {"threshold": 0.85, "only_longest": True, "abbr_recognition": True}
    ]
    
    print(f"Testing text: {text}\n")
    
    for i, params in enumerate(parameter_sets, 1):
        data = {"text": text, **params}
        response = requests.post(f"{BASE_URL}/annotate", json=data)
        
        if response.status_code == 200:
            result = response.json()
            print(f"Test {i} - Parameters: {params}")
            print(f"HPO Terms: {result['hpo_terms']}")
            print(f"HPO IDs: {result['hpo_ids']}")
            print(f"Time: {result['processing_time']:.3f}s")
        else:
            print(f"Test {i} failed: {response.status_code}")
        print()

def test_large_batch():
    """Test very large batch processing"""
    print("=== Large Batch Test ===")
    
    # Generate a large number of texts for stress testing
    base_texts = [
        "Patient has seizures and developmental delay.",
        "Clinical findings include intellectual disability.",
        "Neurological examination shows tremor and ataxia.",
        "The child exhibits growth retardation.",
        "Patient presents with microcephaly and motor delays."
    ]
    
    # Create a batch of 500 texts
    large_batch = base_texts * 100  # 500 texts
    
    print(f"Testing with {len(large_batch)} texts...")
    
    data = {
        "texts": large_batch,
        "threshold": 0.95,
        "only_longest": True
    }
    
    start_time = time.time()
    response = requests.post(f"{BASE_URL}/annotate/batch", json=data)
    request_time = time.time() - start_time
    
    if response.status_code == 200:
        result = response.json()
        print(f"✓ Successfully processed {result['count']} texts")
        print(f"Server processing time: {result['total_processing_time']:.3f}s")
        print(f"Total request time: {request_time:.3f}s")
        print(f"Average time per text: {result['total_processing_time']/result['count']:.4f}s")
        
        # Show sample results
        print("\nSample results:")
        for i, annotation in enumerate(result['results'][:2], 1):
            print(f"{i}. Text: {annotation['text']}")
            print(f"   HPO IDs: {annotation['hpo_ids']}")
        
    else:
        print(f"✗ Large batch test failed: {response.status_code}")
        print(response.text)
    print()

def test_error_handling():
    """Test error handling"""
    print("=== Error Handling ===")

In [2]:
def hpo_reformat(x):
    if x == '-':
        return x
    hpo_list = x.split(';')
    return ';'.join(hpo.replace('_',':') for hpo in hpo_list)

In [3]:
import pandas as pd
biolarkgsc = pd.read_csv('../PhenoBERT/reference/HPO concept recognition/BiolarkGSC/biolarkgsc.csv', sep='\t')
cpod = pd.read_csv('../PhenoBERT/reference/HPO concept recognition/COPD-HPO/copd.csv',sep='\t')
phenochf = pd.read_csv('../PhenoBERT/reference/HPO concept recognition/PhenoCHF/phenochf.csv',sep='\t')

In [5]:
biolarkgsc['dataset'] = 'biolarkgsc'
phenochf['dataset'] = 'phenochf'
cpod['dataset'] = 'copd'

total_dataset = pd.concat([biolarkgsc, cpod, phenochf], ignore_index=True)
total_dataset['labels'] = total_dataset['labels'].fillna('-')
total_dataset['hpo_ids'] = total_dataset['labels'].apply(hpo_reformat)

In [6]:
total_dataset = total_dataset.rename(columns={'text':'clinical_summary'})
total_dataset['clinical_summary'] = total_dataset['clinical_summary'].fillna('-')
total_dataset = total_dataset[['id','clinical_summary','hpo_ids','dataset']]

In [7]:
total_dataset.to_excel('../PhenoBERT/reference/biolarkgsc_copd_phenochf.xlsx', index=False)

In [21]:
dwwwwwaaaaㄊswwwwwwwwwwdf_hpo_mapping = pd.read_csv('../PhenoBERT/reference/2025-08-01_orphanet_WGS_database(HPO_ID_Mapping_v20250506).csv', sep=',')

hpo_mapping_dict = {}
for idx in df_hpo_mapping.index:
    hpo_mapping_dict[df_hpo_mapping.input_hpo_id[idx]] = df_hpo_mapping.mapped_main_id[idx]

def hpo_map(hpo_id_list_str):
    if hpo_id_list_str =='-':
        return '-'
    hpo_id_list = hpo_id_list_str.split(';')
    normalized_hpo_id_list = []
    for hpo_id in hpo_id_list:
        if hpo_id not in normalized_hpo_id_list:
            normalized_hpo_id_list.append(hpo_mapping_dict.get(hpo_id,'-'))
    return ';'.join(normalized_hpo_id_list)

def accuracy_calculator(true_hpo_str, pred_hpo_str):
    true_hpo_list = true_hpo_str.split(';')
    pred_hpo_list = pred_hpo_str.split(';')
    mapped_hpo_list = [hpo for hpo in pred_hpo_list if hpo in true_hpo_list]
    return len(mapped_hpo_list)/len(true_hpo_list)

In [9]:
pheno_tagger_output = batch_annotation(
    total_dataset['clinical_summary'].to_list(),
    threshold=0.95, only_longest=False, abbr_recognition=False)

Processed 2138 texts in 928.413s
Average time per text: 0.434s


In [26]:
pheno_tagger_hpo_ids = [data['hpo_ids'] for data in pheno_tagger_output['results']]

In [28]:
total_dataset['pheno_tagger_hpo_ids'] = pheno_tagger_hpo_ids

In [29]:
total_dataset.head()
total_dataset['normalized_hpo_ids'] = total_dataset['hpo_ids'].apply(hpo_map)

In [30]:
total_dataset['normalized_pheno_tagger_hpo_ids'] = total_dataset['pheno_tagger_hpo_ids'].apply(hpo_map)

In [31]:
total_dataset['normalized_pheno_tagger_hpo_ids']

0       HP:0001156;HP:0008386;HP:0001792;HP:0100266;HP...
1       HP:0000356;HP:0000365;HP:0100258;HP:0001177;HP...
2       HP:0002671;HP:0030731;HP:0010603;HP:0010612;HP...
3                                              HP:0012759
4                                              HP:0000708
                              ...                        
2133    HP:0002754;HP:0001440;HP:0041162;HP:0001058;HP...
2134    HP:0001650;HP:0001653;HP:0005180;HP:0001659;HP...
2135    HP:0006510;HP:0006536;HP:0002088;HP:0000822;HP...
2136    HP:0000819;HP:0100651;HP:0003124;HP:0000822;HP...
2137    HP:0000822;HP:0003124;HP:0000819;HP:0001658;HP...
Name: normalized_pheno_tagger_hpo_ids, Length: 2138, dtype: object

In [20]:
total_dataset['normalized_hpo_ids']

0       HP:0001156,HP:0009881,HP:0001798,HP:0001792,HP...
1       HP:0000006,HP:0003828,HP:0003812,HP:0000356,HP...
2       HP:0002671,HP:0000006,HP:0003828,HP:0003812,HP...
3                                   HP:0000707,HP:0001466
4                                   HP:0000708,HP:0003745
                              ...                        
2133    HP:0010741,HP:0001635,HP:0000819,HP:0001677,HP...
2134    HP:0001653,HP:0005180,HP:0005110,HP:0001635,HP...
2135    HP:0012735,HP:0010741,HP:0001513,HP:0030148,HP...
2136    HP:0003124,HP:0002099,HP:0001681,HP:0000819,HP...
2137    HP:0003124,HP:0000819,HP:0001677,HP:0100749,HP...
Name: normalized_hpo_ids, Length: 2138, dtype: object

In [32]:
total_dataset['pheno_tagger_accuracy'] = total_dataset.apply(lambda x:accuracy_calculator(x.normalized_hpo_ids, x.normalized_pheno_tagger_hpo_ids), axis=1)

In [34]:
total_dataset['pheno_tagger_accuracy'].mean()

0.4146743041145691

In [35]:
total_dataset

Unnamed: 0,id,clinical_summary,hpo_ids,dataset,pheno_tagger_hpo_ids,normalized_pheno_tagger_hpo_ids,normalized_hpo_ids,pheno_tagger_accuracy
0,1003450,A syndrome of brachydactyly (absence of some m...,HP:0001156;HP:0009881;HP:0001798;HP:0001792;HP...,biolarkgsc,HP:0001156;HP:0008386;HP:0001792;HP:0100266;HP...,HP:0001156;HP:0008386;HP:0001792;HP:0100266;HP...,HP:0001156;HP:0009881;HP:0001798;HP:0001792;HP...,0.400000
1,10051003,Townes-Brocks syndrome (TBS) is an autosomal d...,HP:0000006;HP:0000006;HP:0000006;HP:0003828;HP...,biolarkgsc,HP:0000356;HP:0000365;HP:0100258;HP:0001177;HP...,HP:0000356;HP:0000365;HP:0100258;HP:0001177;HP...,HP:0000006;HP:0003828;HP:0003812;HP:0000356;HP...,0.666667
2,10066029,Nevoid basal cell carcinoma syndrome (NBCCS) i...,HP:0002671;HP:0000006;HP:0000006;HP:0000006;HP...,biolarkgsc,HP:0002671;HP:0030731;HP:0002671;HP:0030731;HP...,HP:0002671;HP:0030731;HP:0010603;HP:0010612;HP...,HP:0002671;HP:0000006;HP:0003828;HP:0003812;HP...,0.545455
3,10196695,Angelman syndrome (AS) is a neurodevelopmental...,HP:0000707;HP:0001466,biolarkgsc,HP:0012759,HP:0012759,HP:0000707;HP:0001466,0.000000
4,10417280,Prader-Willi syndrome (PWS) and Angelman syndr...,HP:0000708;HP:0003745,biolarkgsc,HP:0000708,HP:0000708,HP:0000708;HP:0003745,0.500000
...,...,...,...,...,...,...,...,...
2133,989,073187962 | C | 08150399 | | 4062856 | 9/10/20...,HP:0010741;HP:0001635;HP:0000819;HP:0001677;HP...,phenochf,HP:0002754;HP:0002754;HP:0001440;HP:0041162;HP...,HP:0002754;HP:0001440;HP:0041162;HP:0001058;HP...,HP:0010741;HP:0001635;HP:0000819;HP:0001677;HP...,0.714286
2134,991,391076058 | RMH | 18644390 | | 1344198 | 3/23/...,HP:0001653;HP:0005180;HP:0005110;HP:0001635;HP...,phenochf,HP:0001650;HP:0001653;HP:0005180;HP:0001659;HP...,HP:0001650;HP:0001653;HP:0005180;HP:0001659;HP...,HP:0001653;HP:0005180;HP:0005110;HP:0001635;HP...,0.909091
2135,992,117030280 | GM | 64802666 | | 057982 | 1/10/19...,HP:0012735;HP:0010741;HP:0001513;HP:0030148;HP...,phenochf,HP:0006510;HP:0006536;HP:0002088;HP:0000822;HP...,HP:0006510;HP:0006536;HP:0002088;HP:0000822;HP...,HP:0012735;HP:0010741;HP:0001513;HP:0030148;HP...,0.777778
2136,993,303318109 | MMH | 31830318 | | 8371195 | 1/30/...,HP:0003124;HP:0002099;HP:0001681;HP:0000819;HP...,phenochf,HP:0000819;HP:0100651;HP:0003124;HP:0000822;HP...,HP:0000819;HP:0100651;HP:0003124;HP:0000822;HP...,HP:0003124;HP:0002099;HP:0001681;HP:0000819;HP...,0.666667
