## Code to split training set into two sets : training and validation clinical notes datasets
NB one patient with two or more different clinical notes should not appear in both groups

In [1]:
from os.path import isfile, isdir, join, basename
from os import listdir
import pandas as pd
import numpy as np
import re
# import matplotlib.pyplot as plt
import collections
import math
# from sklearn import metrics
import random
import shutil 

IN_TXT_DIR = '/export/home/cse200093/Expe_Pheno/data/expe_pheno_back_up_26_11'

assert isdir(IN_TXT_DIR)


In [2]:
# Read all text files in the dir and split two random datasets : train and val

# extract all ann_files from IN_BRAT_DIR
txt_files = [f for f in listdir(IN_TXT_DIR) if isfile(join(IN_TXT_DIR, f)) if f.endswith('.txt')]

Ident = {}

for txt_file in txt_files:
    txt_path = join(IN_TXT_DIR, txt_file)
    
    # Read ann file
    with open(txt_path, 'r', encoding='utf-8') as f_in:
        lines = f_in.readlines()
        #print("\n\n text_path", txt_file[:-4])
        #print(lines[0])
        # keep track of the patient's NIP
        Ident[txt_file[:-4]]=lines[0]
        

# convert the dictionnary to a Pandas dataframe        
txt_df = pd.DataFrame(list(Ident.items()), columns = ['File', 'Id_Patient'])

# Count number of total clinical notes of the file
print("number of CRH", len(txt_df))

# count number of unique patients
print("Unique Patient", len(txt_df["Id_Patient"].unique()))


number of CRH 151
Unique Patient 126


In [10]:
txt_df.sample(5)

Unnamed: 0,File,Id_Patient
76,CRH_scl_sample_12,-7630614914824983828\n
0,CRH_lupus_sample_26,-5471642527098896913\n
34,CRH_lupus_sample_45,8372841323942329808\n
139,CRH_taka_sample_28,3259816440287859225\n
61,CRH_taka_sample_16,-6895810451220236601\n


In [3]:
# Get a set of all Id of patients : 
Id_unique = set(txt_df['Id_Patient'])
# Check the size
print("Set of unique Patients :", len(Id_unique))

# and get a random set of train and val : 
print('size of validation dataset =',int(17*len(txt_df)/100))
val_patients = random.sample(Id_unique, int(17*len(txt_df)/100))

Set of unique Patients : 126
size of validation dataset = 25


In [6]:
with open('set_id_patient_expe_pheno.txt','w') as f:
    f.write(str(Id_unique))

In [7]:
import ast
with open('set_id_patient_expe_pheno.txt','r') as f:
    my_set = ast.literal_eval(f.read())

In [9]:
Id_unique == my_set

True

In [4]:
# check if it leds to the right number of CRH : 

# get the corresponding files names
list_Val = []
for key, value in Ident.items():
    if value in val_patients:
        # print('validation patient', value)
        list_Val.append(key)


list_Val_2 = [key for key,value in Ident.items() if value in val_patients]

len(list_Val), len(list_Val_2)

(28, 28)

In [5]:
list_Val

['CRH_taka_sample_10',
 'CRH_scl_sample_9',
 'CRH_lupus_sample_14',
 'CRH_sapl_sample_51',
 'CRH_sapl_sample_23',
 'CRH_sapl_sample_11',
 'CRH_sapl_sample_3',
 'CRH_scl_sample_8',
 'CRH_sapl_sample_34',
 'CRH_lupus_sample_52',
 'CRH_scl_sample_36',
 'CRH_lupus_sample_38',
 'CRH_lupus_sample_44',
 'CRH_sapl_sample_50',
 'CRH_taka_sample_32',
 'CRH_scl_sample_16',
 'CRH_sapl_sample_28',
 'CRH_taka_sample_36',
 'CRH_taka_sample_27',
 'CRH_sapl_sample_45',
 'CRH_lupus_sample_5',
 'CRH_scl_sample_17',
 'CRH_taka_sample_42',
 'CRH_scl_sample_50',
 'CRH_taka_sample_13',
 'CRH_lupus_sample_22',
 'CRH_lupus_sample_4',
 'CRH_lupus_sample_10']

In [6]:
# copy the corresponding validation files in the val directory
import shutil 
for i in list_Val: 
    # copy txt files
    source_file = f'/export/home/cse200093/Expe_Pheno/data/expe_pheno_back_up_26_11/{i}.txt'
    des_file = f'/export/home/cse200093/Expe_Pheno/data/final_val/{i}.txt'
    shutil.copyfile(source_file,des_file)
    # and they respective .ann files 
    source_ann_file = f'/export/home/cse200093/Expe_Pheno/data/expe_pheno_back_up_26_11/{i}.ann'
    des_ann_file = f'/export/home/cse200093/Expe_Pheno/data/final_val/{i}.ann'
    shutil.copyfile(source_ann_file,des_ann_file)
    
source_file

'/export/home/cse200093/Expe_Pheno/data/expe_pheno_back_up_26_11/CRH_lupus_sample_10.txt'

In [7]:
# Training files : 
list_Train = [key for key,value in Ident.items() if value not in val_patients]
len(list_Train)

123

In [8]:
# copy the corresponding training files in the train directory

for i in list_Train : 
    # copy txt files
    source_file = f'/export/home/cse200093/Expe_Pheno/data/expe_pheno_back_up_26_11/{i}.txt'
    des_file = f'/export/home/cse200093/Expe_Pheno/data/final_train/{i}.txt'
    shutil.copyfile(source_file,des_file)
    # and they respective .ann files 
    source_ann_file = f'/export/home/cse200093/Expe_Pheno/data/expe_pheno_back_up_26_11/{i}.ann'
    des_ann_file = f'/export/home/cse200093/Expe_Pheno/data/final_train/{i}.ann'
    shutil.copyfile(source_ann_file,des_ann_file)
    
source_file

'/export/home/cse200093/Expe_Pheno/data/expe_pheno_back_up_26_11/CRH_scl_sample_4.txt'

In [10]:
txt_df.head()

Unnamed: 0,File,Id_Patient
0,CRH_lupus_sample_26,-5471642527098896913\n
1,CRH_taka_sample_1,5830359601606805541\n
2,CRH_scl_sample_30,1772960206674624992\n
3,CRH_lupus_sample_9,-7211275103686507133\n
4,CRH_taka_sample_6,-2210350684834633868\n


In [15]:
# compute thenumber of patients for each disease
taka_df = txt_df[txt_df['File'].str.contains('taka')]
# count number of unique patients
print("Unique Takayasu Patient", len(taka_df["Id_Patient"].unique()))


lupus_df = txt_df[txt_df['File'].str.contains('lupus')]
# count number of unique patients
print("Unique Lupus Patient", len(lupus_df["Id_Patient"].unique()))


scl_df = txt_df[txt_df['File'].str.contains('scl')]
# count number of unique patients
print("Unique scl Patient", len(scl_df["Id_Patient"].unique()))

sapl_df = txt_df[txt_df['File'].str.contains('sapl')]
# count number of unique patients
print("Unique SAPL Patient", len(sapl_df["Id_Patient"].unique()))

Unique Takayasu Patient 26
Unique Lupus Patient 38
Unique scl Patient 32
Unique SAPL Patient 31


In [14]:
taka_df

Unnamed: 0,File,Id_Patient
1,CRH_taka_sample_1,5830359601606805541\n
4,CRH_taka_sample_6,-2210350684834633868\n
5,CRH_taka_sample_10,-7081861870251767961\n
8,CRH_taka_sample_8,6298534508639260711\n
14,CRH_taka_sample_19,6818600308001495885\n
24,CRH_taka_sample_22,-2210350684834633868\n
26,CRH_taka_sample_34,3259816440287859225\n
27,CRH_taka_sample_48,7877605414792008647\n
30,CRH_taka_sample_46,-3932956242127342648\n
42,CRH_taka_sample_23,703906453031200049\n
