# GTV trial site split

#### Inclusion criteria
All head and neck cancer patients referred for primary curative or primary palliative radiation treatment whose primary tumour (T-site) is at one of the following sites:
- Oropharynx
- Hypopharynx
- Supraglottic larynx
- Oral cavity

#### Exclusion criteria
Patients in any of the following categories (tumour site or treatment setting) are excluded:
- Vocal cord
- Sinonasal
- Nasal cavity
- Postoperative
- Nasopharyngeal


In [11]:
%matplotlib inline
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import os
    

In [9]:
# Load the dataset description; its 'training' and 'test' lists are read below.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default text encoding.
with open('dataset.json', encoding='utf-8') as file:
    task901_split = json.load(file)



'./imagesTs/HNCDL_003.nii.gz'

In [21]:
def fetch_patient_id(data_list):
    """Return the patient ID encoded in each entry's image file name.

    Each entry may be either a dict holding the image path under the
    ``'image'`` key (the training-list layout) or a bare path string (the
    test-list layout). The ID is the file's base name with everything from
    the first ``.nii.gz`` onwards removed, e.g.
    ``'./imagesTs/HNCDL_003.nii.gz'`` -> ``'HNCDL_003'``.

    The layout is decided per entry rather than from the first entry only,
    so a list that mixes both layouts is handled as well.

    Args:
        data_list: Iterable of dicts with an ``'image'`` key and/or path
            strings.

    Returns:
        List of patient-ID strings, in input order. Empty input gives an
        empty list.

    Raises:
        KeyError: If a dict entry has no ``'image'`` key.
    """
    # Normalise every entry to its image path first so the ID extraction
    # below is written only once.
    image_paths = (
        entry["image"] if isinstance(entry, dict) else entry
        for entry in data_list
    )
    return [
        os.path.basename(os.path.normpath(path)).split(".nii.gz")[0]
        for path in image_paths
    ]


# Collect the patient IDs from both lists of the dataset description. The
# combined list keeps the training IDs first, followed by the test IDs.
train_patients = fetch_patient_id(task901_split['training'])
test_patients = fetch_patient_id(task901_split['test'])
all_patients = [*train_patients, *test_patients]
print(len(all_patients))

567


In [40]:
# Read the per-patient site table and keep only the rows whose PatientID
# appears in all_patients.
data_sites = pd.read_csv('sites_pre.csv')
all_sites_patient_id = data_sites['PatientID']

in_dataset = all_sites_patient_id.isin(all_patients)
filtered_data = data_sites[in_dataset]
# Bare expression: shows the filtered table as the notebook cell output.
filtered_data


Unnamed: 0,PatientID,AARR,KOEN,ONSTYPE,PHARYNX,LARYNX,CAVORIS,SINONASA,SPYTKIR,T97,N97,M97,ST97,New_ID
0,HNCDL_001,2017,1,2,1,0,0,0,0,2,1,0,5,HNCDL_001
1,HNCDL_002,2017,1,3,0,0,8,0,0,8,4,0,6,HNCDL_002
2,HNCDL_003,2017,1,2,1,0,0,0,0,5,1,0,5,HNCDL_003
3,HNCDL_004,2015,1,2,1,0,0,0,0,2,3,0,6,HNCDL_004
4,HNCDL_005,2015,1,2,1,0,0,0,0,5,0,0,2,HNCDL_005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,HNCDL_894,2014,1,2,1,0,0,0,0,5,1,0,5,HNCDL_894
886,HNCDL_214,2018,1,1,0,7,0,0,0,5,0,0,2,HNCDL_214
887,HNCDL_058,2016,1,3,0,0,11,0,0,12,5,0,6,HNCDL_058
888,HNCDL_070,2015,2,5,0,0,0,0,1,8,1,0,5,HNCDL_070


In [44]:
import pandas as pd

def filter_patients_for_experiment(data):
    """Partition the rows of *data* into included and excluded patients.

    A row is included when any of the following holds:
      * ONSTYPE is 1 and LARYNX is below 7
      * ONSTYPE is 2 and PHARYNX lies outside the closed range 10-12
      * ONSTYPE is 3
    Every other row is excluded.

    NOTE(review): the mapping from these numeric codes to anatomical sites
    is not visible here -- confirm it against the registry code book.

    Args:
        data: DataFrame with ONSTYPE, LARYNX and PHARYNX columns.

    Returns:
        Tuple ``(included_data, excluded_data)``: the rows of *data* that
        satisfy the criteria and the rows that do not.
    """
    site_type = data['ONSTYPE']
    # between() is inclusive on both ends, i.e. 10 <= PHARYNX <= 12.
    pharynx_in_10_to_12 = data['PHARYNX'].between(10, 12)

    keep_mask = (
        ((site_type == 1) & (data['LARYNX'] < 7))
        | ((site_type == 2) & ~pharynx_in_10_to_12)
        | (site_type == 3)
    )

    return data[keep_mask], data[~keep_mask]

# Apply the site criteria to the patients present in the dataset, then report
# the group sizes: included first, excluded second.
included_data, excluded_data = filter_patients_for_experiment(filtered_data)
print(included_data.shape[0], excluded_data.shape[0])

482 85


In [46]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the included patients as the test set and keep 80% for
# training. The split is stratified on ONSTYPE and uses a fixed random_state.
train_data, test_data = train_test_split(
    included_data,
    test_size=0.2,
    stratify=included_data['ONSTYPE'],
    random_state=42,
)
print(len(train_data), len(test_data))


385 97


In [53]:
import json

# Map each output key to the frame whose PatientID column it should list.
split_frames = {
    "train": train_data,
    "test": test_data,
    "other_sites": excluded_data,
}
data_dict = {
    group: frame["PatientID"].tolist()
    for group, frame in split_frames.items()
}

# Serialise with a 4-space indent, which puts every PatientID on its own line,
# then write the text to the split file.
json_data = json.dumps(data_dict, indent=4)
with open("data_split_GTV_trail.json", "w") as file:
    file.write(json_data)