In [1]:
# Import dependencies.
import pandas as pd
import numpy as np
import os
import dask.dataframe as dd
from tqdm.auto import tqdm
pd.set_option('display.max_columns', 500)

In [2]:
# Set path to files and read in the CSV file. Use dtyping to save memory and use dask for efficiency.
path = 'C:/Users/outla/Desktop/VA_ML_Research/UCSD_NRD_2017'
os.chdir(path)
cohort_file = dd.read_csv("Nghia_NRD_2017_Cohort.csv", dtype={'age':'float16', 'aweekend':'float16', 'died':'float16', 'discwt':'float64', 'dispuniform':'float16', 'dmonth':'float16', 'dqtr':'float16', 'drg':'float16', 'drgver':'float16', 'drg_nopoa':'float16', 'i10_dx1':str, 'i10_dx2':str, 'i10_dx3':str, 'i10_dx4':str, 'i10_dx5':str, 'i10_dx6':str, 'i10_dx7':str, 'i10_dx8':str, 'i10_dx9':str, 'i10_dx10':str, 'i10_dx11':str, 'i10_dx12':str, 'i10_dx13':str, 'i10_dx14':str, 'i10_dx15':str, 'i10_dx16':str, 'i10_dx17':str, 'i10_dx18':str, 'i10_dx19':str, 'i10_dx20':str, 'i10_dx21':str, 'i10_dx22':str,'i10_dx23':str,'i10_dx24':str, 'i10_dx25':str,'i10_dx26':str,'i10_dx27':str,'i10_dx28':str,'i10_dx29':str, 'i10_dx30':str, 'i10_dx31':str, 'i10_dx32':str, 'i10_dx33':str, 'i10_dx34':str, 'i10_dx35':str, 'i10_dx36':str, 'i10_dx37':str, 'i10_dx38':str, 'i10_dx39':str, 'i10_dx40':str, 'elective':'float16', 'female':'float16', 'hcup_ed':'float16', 'hosp_nrd':'float64', 'los': 'float16', 'mdc':'float16', 'mdc_nopoa':'float16', 'i10_ndx':'float16', 'i10_npr':'float16', 'nrd_daystoevent':'float16', 'nrd_stratum':'float16', 'nrd_visitlink':str, 'pay1':'float16', 'pl_nchs':'float16', 'i10_pr1':str, 'i10_pr2':str, 'i10_pr3':str, 'i10_pr4':str, 'i10_pr5':str, 'i10_pr6':str, 'i10_pr7':str, 'i10_pr8':str, 'i10_pr9':str, 'i10_pr10':str, 'i10_pr11':str, 'i10_pr12':str, 'i10_pr13':str, 'i10_pr14':str, 'i10_pr15':str, 'i10_pr16':str, 'i10_pr17':str, 'i10_pr18':str, 'i10_pr19':str, 'i10_pr20':str, 'i10_pr21':str, 'i10_pr22':str, 'i10_pr23':str, 'i10_pr24':str, 'i10_pr25':str, 'prday1':'float16', 'prday2':'float16', 'prday3':'float16', 'prday4':'float16', 'prday5':'float16', 'prday6':'float16', 'prday7':'float16', 'prday8':'float16', 'prday9':'float16', 'prday10':'float16', 'prday11':'float16', 'prday12':'float16', 'prday13':'float16', 'prday14':'float16', 'prday15':'float16', 'prday16':'float16', 'prday17':'float16', 'prday18':'float16', 'prday19':'float16', 'prday20':'float16', 'prday21':'float16', 'prday22':'float16', 'prday23':'float16', 'prday24':'float16', 'prday25':'float16', 'rehabtransfer':'float16', 'resident':'float16', 'samedayevent':'float16', 'totchg':'float64', 'year':'float16', 'zipinc_qrtl':'float16', 'dxver':'float16', 'prver':'float16', 'preventable_readmission':'float16', 'prev_readm_reason':'float16', 'medical_admission':'float16', 'hfrs_score':'float16', 'hfrs_severity':'float16', 'cc_score':'float16', 'cc_severity':'float16'})

In [3]:
# Number of unique patients in the entire file.
len(np.unique(cohort_file['nrd_visitlink']))

62472

In [4]:
# Now we need to carve out the cohort by removing any patients who are younger than 18 and do not have an IBD diagnosis between January - June 2017 (the first 6 months).
# First we'll create a new dataframe with patients who are adults (ages 18+).
# ibd_file = pd.concat([pd.DataFrame([row],columns = row._fields) for row in tqdm(cohort_file.itertuples(index=False)) if row.age > 17]) # slower than using .loc
ibd_file = cohort_file.loc[cohort_file['age'] > 17]

In [5]:
# Number of unique patients who are aged 18+.
len(np.unique(ibd_file['nrd_visitlink']))

59961

In [6]:
# Create a list of patient ID's who have an IBD related index hospitalization between January - June 2017 (the first 6 months).
patient_list = []
patient_list = [row.nrd_visitlink for row in tqdm(ibd_file.itertuples(index=False)) if row.dmonth < 7 and 'K50' in row.i10_dx1 or row.dmonth < 7 and 'K51' in row.i10_dx1 or row.dmonth < 7 and 'K50' in row.i10_dx2 or row.dmonth < 7 and 'K51' in row.i10_dx2]

114134it [00:02, 40061.17it/s]


In [7]:
# A new list of unique patient ID's from patient_list.
unique_patients = np.unique(patient_list)
len(unique_patients)

31918

In [8]:
# Go through the dataset and use concat to append any rows that have nrd_visitlinks in the unique_patients list to ibd_index_file (a new dataframe) because these are the patients we are looking at.
ibd_index_file = pd.concat([pd.DataFrame([row],columns = row._fields) for row in tqdm(ibd_file.itertuples(index=False)) if row.nrd_visitlink in unique_patients])

114134it [06:39, 285.56it/s]


In [9]:
# Create a new column called 'ibd_diagnosis' and fill every row with a 0 to start.
ibd_index_file['ibd_diagnosis'] = 0

In [10]:
# Reset the index and drop the old index.
ibd_index_file = ibd_index_file.reset_index(drop=True)

In [11]:
# Start with a placeholder for nrd_visitlink that is made up ('aaaaaa'). This will hold the previous rows nrd_visitlink in each iteration.
# Keep a counter for each successive ibd_diagnosis that a patient has, initialized with 1.
# row_num keeps track of which row we are on.
# This loop goes through each row and compares the nrd_visitlink with that of the previous row. 
# If the current and previous nrd_visitlink are not the same and the patient has a ibd related discharge, then a 1 is populated in ibd_diagnosis. Counter reset to 1.
# If the current and previous nrd_visitlink are the same and the patient has a idb related discharge, then the counter increments by 1 and the appropriate number is populated in
# ibd_diagnosis (each subsequent ibd discharge adds 1 to the patients counter).
# If the current and previous nrd_visitlink are the same and the patient does not have an ibd related discharge, then a 0 is populated in ibd_diagnosis.
# If the current and previous nrd_visitlink are not the same and the patient does not have an idb related discharge, then a 0 is populated in ibd_diagnosis. Counter reset to 0.
# id_placeholder is updated in all cases to the current nrd_visitlink.
id_placeholder = 'aaaaaa'
counter = 1
row_num = 0
for row in tqdm(ibd_index_file.itertuples(index=False)):
    id_current = row.nrd_visitlink
    if id_current != id_placeholder and 'K50' in row.i10_dx1 or id_current != id_placeholder and 'K51' in row.i10_dx1 or id_current != id_placeholder and 'K50' in row.i10_dx2 or id_current != id_placeholder and 'K51' in row.i10_dx2:
        ibd_index_file.at[row_num, 'ibd_diagnosis'] = 1
        id_placeholder = id_current
        counter = 1
    elif id_current == id_placeholder and 'K50' in row.i10_dx1 or id_current == id_placeholder and 'K51' in row.i10_dx1 or id_current == id_placeholder and 'K50' in row.i10_dx2 or id_current == id_placeholder and 'K51' in row.i10_dx2:
        counter += 1
        ibd_index_file.at[row_num, 'ibd_diagnosis'] = counter
        id_placeholder = id_current
    elif id_current == id_placeholder and 'K50' not in row.i10_dx1 or id_current == id_placeholder and 'K51' not in row.i10_dx1 or id_current == id_placeholder and 'K50' not in row.i10_dx2 or id_current == id_placeholder and 'K51' not in row.i10_dx2:
        ibd_index_file.at[row_num, 'ibd_diagnosis'] = 0
        id_placeholder = id_current
    else:
        ibd_index_file.at[row_num, 'ibd_diagnosis'] = 0
        id_placeholder = id_current
        counter = 0
    row_num += 1

66073it [00:01, 41502.47it/s]


In [12]:
# Create a new column called 'ibd_index_hospitalization' and fill each row with a 0.
# Use vectorization with mask to find each index hospitalization and fill in a 1 in ibd_index_hospitalization.
ibd_index_file['ibd_index_hospitalization'] = 0
mask = (ibd_index_file['ibd_diagnosis'] == 1)
ibd_index_file.loc[mask, 'ibd_index_hospitalization'] = 1

In [13]:
# Check the file structure.
ibd_index_file.head(10)

Unnamed: 0,age,aweekend,died,discwt,dispuniform,dmonth,dqtr,drg,drgver,drg_nopoa,i10_dx1,i10_dx2,i10_dx3,i10_dx4,i10_dx5,i10_dx6,i10_dx7,i10_dx8,i10_dx9,i10_dx10,i10_dx11,i10_dx12,i10_dx13,i10_dx14,i10_dx15,i10_dx16,i10_dx17,i10_dx18,i10_dx19,i10_dx20,i10_dx21,i10_dx22,i10_dx23,i10_dx24,i10_dx25,i10_dx26,i10_dx27,i10_dx28,i10_dx29,i10_dx30,i10_dx31,i10_dx32,i10_dx33,i10_dx34,i10_dx35,i10_dx36,i10_dx37,i10_dx38,i10_dx39,i10_dx40,elective,female,hcup_ed,hosp_nrd,los,mdc,mdc_nopoa,i10_ndx,i10_npr,nrd_daystoevent,nrd_stratum,nrd_visitlink,pay1,pl_nchs,i10_pr1,i10_pr2,i10_pr3,i10_pr4,i10_pr5,i10_pr6,i10_pr7,i10_pr8,i10_pr9,i10_pr10,i10_pr11,i10_pr12,i10_pr13,i10_pr14,i10_pr15,i10_pr16,i10_pr17,i10_pr18,i10_pr19,i10_pr20,i10_pr21,i10_pr22,i10_pr23,i10_pr24,i10_pr25,prday1,prday2,prday3,prday4,prday5,prday6,prday7,prday8,prday9,prday10,prday11,prday12,prday13,prday14,prday15,prday16,prday17,prday18,prday19,prday20,prday21,prday22,prday23,prday24,prday25,rehabtransfer,resident,samedayevent,totchg,year,zipinc_qrtl,dxver,prver,preventable_readmission,prev_readm_reason,medical_admission,hfrs_score,hfrs_severity,cc_score,cc_severity,ibd_diagnosis,ibd_index_hospitalization
0,49.0,1.0,0.0,1.492757,1.0,6.0,2.0,386.0,34.0,386.0,K50812,G40909,J449,F17210,F329,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,41827.0,3.0,6.0,6.0,5.0,0.0,19296.0,452.0,e0009vi,2.0,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,11700.0,2017.0,2.0,10.0,10.0,,,1.0,2.0,0.0,1.0,0.0,1,1
1,63.0,0.0,0.0,1.984195,1.0,3.0,1.0,386.0,34.0,386.0,K50912,E039,M1990,K219,I10,R630,F17210,G43909,Z885,E785,Z79891,Z7982,M549,J449,G8929,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,2.0,42416.0,2.0,6.0,6.0,15.0,0.0,13000.0,461.0,e0021xf,1.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,38599.0,2017.0,3.0,10.0,10.0,,,1.0,2.400391,0.0,1.0,0.0,1,1
2,54.0,0.0,0.0,1.389702,1.0,2.0,1.0,389.0,34.0,389.0,K5669,K5010,E663,Z885,Z6826,I10,K5900,F909,F3342,Z883,K5732,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,2.0,43509.0,3.0,6.0,6.0,11.0,0.0,18352.0,441.0,e002hi0,3.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,28192.0,2017.0,4.0,10.0,10.0,,,1.0,1.799805,0.0,0.0,0.0,1,1
3,55.0,0.0,0.0,1.442027,1.0,11.0,4.0,330.0,35.0,330.0,K5720,K660,F3342,K5010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,42947.0,6.0,6.0,6.0,4.0,2.0,18640.0,496.0,e002hi0,3.0,1.0,0T788DZ,0DBN4ZZ,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,103023.0,2017.0,4.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,29.0,0.0,0.0,2.085694,1.0,1.0,1.0,386.0,34.0,386.0,K50012,Z87891,Z9049,F329,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,43848.0,2.0,6.0,6.0,4.0,0.0,15488.0,406.0,e002vve,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,3735.0,2017.0,1.0,10.0,10.0,,,1.0,2.0,0.0,0.0,0.0,1,1
5,65.0,0.0,0.0,1.376273,1.0,1.0,1.0,387.0,34.0,387.0,K5100,E876,F419,J45909,Z95810,I10,D649,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,1.0,42871.0,3.0,6.0,6.0,7.0,0.0,17856.0,420.0,e003021,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,27488.0,2017.0,4.0,10.0,10.0,,,1.0,2.699219,0.0,2.0,0.0,1,1
6,18.0,0.0,0.0,2.032179,1.0,2.0,1.0,386.0,34.0,386.0,K50111,I2510,E876,E559,K219,K625,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,1.0,44141.0,6.0,6.0,6.0,6.0,2.0,20960.0,421.0,e004bp7,3.0,4.0,0DBP8ZX,0DBN8ZX,,,,,,,,,,,,,,,,,,,,,,,,4.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,27523.0,2017.0,3.0,10.0,10.0,,,1.0,3.300781,0.0,0.0,0.0,1,1
7,19.0,0.0,0.0,2.032179,1.0,11.0,4.0,387.0,35.0,387.0,K5090,Z98890,F39,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,1.0,44141.0,2.0,6.0,6.0,3.0,0.0,21232.0,421.0,e004bp7,3.0,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,15025.0,2017.0,2.0,10.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2,0
8,90.0,0.0,0.0,1.358539,5.0,3.0,1.0,92.0,34.0,92.0,G92,K5190,H409,I10,N2889,R4701,F0390,Z954,E559,R269,F05,I482,B958,N390,D751,Z66,F329,I350,Z23,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,2.0,43398.0,4.0,1.0,1.0,19.0,0.0,24912.0,496.0,e004rkq,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,41286.0,2017.0,4.0,10.0,10.0,,,1.0,16.59375,2.0,1.0,0.0,1,1
9,90.0,0.0,0.0,1.358539,5.0,6.0,2.0,480.0,34.0,480.0,S72141A,K5190,I080,I482,E860,N2889,E559,Z66,D751,R2681,F0390,R4701,H409,G9340,E876,Z9181,Y92129,W19XXXA,R482,Z952,Z7982,,,,,,,,,,,,,,,,,,,,0.0,0.0,2.0,43398.0,7.0,8.0,8.0,21.0,1.0,24992.0,496.0,e004rkq,1.0,1.0,0QS604Z,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,135462.0,2017.0,4.0,10.0,10.0,0.0,0.0,0.0,17.703125,2.0,1.0,0.0,2,0


In [14]:
# If there is an IBD related procedure (flexible sigmoidoscopy or colonoscopy, ICD10 codes: 0DJD8ZZ, 0D9.E8ZX, 0D9.H8ZX, 0D9.N8ZX, 0DB.E8ZX, 0DB.H8ZX, 0DB.N8ZX, 0DB.E8ZZ) in columns
# i10_pr1 - i10_pr25 then fill in a 1 in the new 'ibd_related_procedure' column, otherwise fill in a 0.
ibd_index_file['ibd_related_procedures'] = ibd_index_file.filter(like='i10_pr').isin(['0DJD8ZZ', '0D9E8ZX', '0D9H8ZX', '0D9N8ZX', '0DBE8ZX', '0DBH8ZX', '0DBN8ZX', '0DBE8ZZ']).any(axis=1).astype(int)

In [15]:
# Check to see how many rows have an IBD related procedure.
values, counts = np.unique(ibd_index_file['ibd_related_procedures'], return_counts=True)
print('Values:  Counts:')
for number in values:
    print(f'{values[number]}        {counts[number]}')

Values:  Counts:
0        58914
1        7159


In [16]:
# Use pivot_table with aggregate function 'sum' to essentially group by the nrd_visitlinks and sum the ibd_related_procedures for each patient.
pivot_df = ibd_index_file.pivot_table(index='nrd_visitlink', values='ibd_related_procedures', aggfunc='sum').reset_index()

In [17]:
# Print out the number of times each value shows up.
values, counts = np.unique(pivot_df['ibd_related_procedures'], return_counts=True)
print('Values:  Counts:')
for number in values:
    print(f'{values[number]}        {counts[number]}')

Values:  Counts:
0        25415
1        5944
2        477
3        68
4        13
5        1


In [18]:
# Since the pivot table only has unique patients and totals per patient, create a dictionary to store each patient and their total ibd_related_procedures.
patient_totals = {}
for row in pivot_df.itertuples(index=False):
    patient_totals.update({row.nrd_visitlink: row.ibd_related_procedures})

In [19]:
# Double check to make sure we still have the same number of unique patients as before.
len(patient_totals)

31918

In [20]:
# Create a new column called 'total_ibd_related_procedures' by mapping the nrd_visitlinks values from the dicionary patient_totals.
ibd_index_file['total_ibd_related_procedures'] = ibd_index_file['nrd_visitlink'].map(patient_totals)

In [21]:
# Take a quick look at the file.
ibd_index_file

Unnamed: 0,age,aweekend,died,discwt,dispuniform,dmonth,dqtr,drg,drgver,drg_nopoa,i10_dx1,i10_dx2,i10_dx3,i10_dx4,i10_dx5,i10_dx6,i10_dx7,i10_dx8,i10_dx9,i10_dx10,i10_dx11,i10_dx12,i10_dx13,i10_dx14,i10_dx15,i10_dx16,i10_dx17,i10_dx18,i10_dx19,i10_dx20,i10_dx21,i10_dx22,i10_dx23,i10_dx24,i10_dx25,i10_dx26,i10_dx27,i10_dx28,i10_dx29,i10_dx30,i10_dx31,i10_dx32,i10_dx33,i10_dx34,i10_dx35,i10_dx36,i10_dx37,i10_dx38,i10_dx39,i10_dx40,elective,female,hcup_ed,hosp_nrd,los,mdc,mdc_nopoa,i10_ndx,i10_npr,nrd_daystoevent,nrd_stratum,nrd_visitlink,pay1,pl_nchs,i10_pr1,i10_pr2,i10_pr3,i10_pr4,i10_pr5,i10_pr6,i10_pr7,i10_pr8,i10_pr9,i10_pr10,i10_pr11,i10_pr12,i10_pr13,i10_pr14,i10_pr15,i10_pr16,i10_pr17,i10_pr18,i10_pr19,i10_pr20,i10_pr21,i10_pr22,i10_pr23,i10_pr24,i10_pr25,prday1,prday2,prday3,prday4,prday5,prday6,prday7,prday8,prday9,prday10,prday11,prday12,prday13,prday14,prday15,prday16,prday17,prday18,prday19,prday20,prday21,prday22,prday23,prday24,prday25,rehabtransfer,resident,samedayevent,totchg,year,zipinc_qrtl,dxver,prver,preventable_readmission,prev_readm_reason,medical_admission,hfrs_score,hfrs_severity,cc_score,cc_severity,ibd_diagnosis,ibd_index_hospitalization,ibd_related_procedures,total_ibd_related_procedures
0,49.0,1.0,0.0,1.492757,1.0,6.0,2.0,386.0,34.0,386.0,K50812,G40909,J449,F17210,F329,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,41827.0,3.0,6.0,6.0,5.0,0.0,19296.0,452.0,e0009vi,2.0,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,11700.0,2017.0,2.0,10.0,10.0,,,1.0,2.000000,0.0,1.0,0.0,1,1,0,0
1,63.0,0.0,0.0,1.984195,1.0,3.0,1.0,386.0,34.0,386.0,K50912,E039,M1990,K219,I10,R630,F17210,G43909,Z885,E785,Z79891,Z7982,M549,J449,G8929,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,2.0,42416.0,2.0,6.0,6.0,15.0,0.0,13000.0,461.0,e0021xf,1.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,38599.0,2017.0,3.0,10.0,10.0,,,1.0,2.400391,0.0,1.0,0.0,1,1,0,0
2,54.0,0.0,0.0,1.389702,1.0,2.0,1.0,389.0,34.0,389.0,K5669,K5010,E663,Z885,Z6826,I10,K5900,F909,F3342,Z883,K5732,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,2.0,43509.0,3.0,6.0,6.0,11.0,0.0,18352.0,441.0,e002hi0,3.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,28192.0,2017.0,4.0,10.0,10.0,,,1.0,1.799805,0.0,0.0,0.0,1,1,0,0
3,55.0,0.0,0.0,1.442027,1.0,11.0,4.0,330.0,35.0,330.0,K5720,K660,F3342,K5010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,42947.0,6.0,6.0,6.0,4.0,2.0,18640.0,496.0,e002hi0,3.0,1.0,0T788DZ,0DBN4ZZ,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,103023.0,2017.0,4.0,10.0,10.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0,0,0,0
4,29.0,0.0,0.0,2.085694,1.0,1.0,1.0,386.0,34.0,386.0,K50012,Z87891,Z9049,F329,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,43848.0,2.0,6.0,6.0,4.0,0.0,15488.0,406.0,e002vve,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,3735.0,2017.0,1.0,10.0,10.0,,,1.0,2.000000,0.0,0.0,0.0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66068,63.0,0.0,0.0,1.380296,1.0,8.0,3.0,194.0,34.0,194.0,J189,T865,R51,Y95,L298,D89810,E222,D469,I959,Z006,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,41084.0,6.0,4.0,4.0,10.0,1.0,15056.0,441.0,ezzyha7,1.0,3.0,0HBJXZX,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,121265.0,2017.0,3.0,10.0,10.0,0.0,0.0,1.0,3.900391,0.0,2.0,0.0,0,0,0,1
66069,63.0,0.0,0.0,1.380296,1.0,11.0,4.0,919.0,35.0,919.0,T865,N170,J069,E872,D469,E8339,D89810,R197,D61818,R7989,K5090,D709,K219,Z006,E039,Z888,Y830,T367X5A,Y92230,G4700,Z9089,Z881,Z9049,Y92039,E806,B348,E785,,,,,,,,,,,,,,0.0,1.0,0.0,41084.0,21.0,21.0,21.0,27.0,5.0,15120.0,441.0,ezzyha7,1.0,3.0,0DB58ZX,0DB98ZX,0DBN8ZX,0DB68ZX,XW03341,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,10.0,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,594335.0,2017.0,3.0,10.0,10.0,0.0,0.0,1.0,5.101562,1.0,2.0,0.0,0,0,1,1
66070,63.0,0.0,0.0,1.380296,1.0,12.0,4.0,202.0,35.0,202.0,J205,N170,E785,T8609,E871,K5090,D61818,E039,E440,G4700,D89810,B259,D801,Z006,I10,Z6821,Y848,Y92039,T380X5A,H04123,D469,R739,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,41084.0,2.0,4.0,4.0,22.0,0.0,15184.0,441.0,ezzyha7,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,90757.0,2017.0,3.0,10.0,10.0,0.0,0.0,1.0,4.800781,0.0,2.0,0.0,0,0,0,1
66071,54.0,1.0,0.0,1.858673,1.0,2.0,1.0,387.0,34.0,387.0,K51911,I10,K649,K30,K219,D509,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,2.0,42659.0,3.0,6.0,6.0,6.0,3.0,22912.0,435.0,ezzz5ob,1.0,2.0,0DBP8ZX,30233N1,0DBN8ZX,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,35391.0,2017.0,1.0,10.0,10.0,0.0,0.0,1.0,0.000000,0.0,0.0,0.0,1,1,1,1


In [22]:
# Print out the number of times each value shows up.
values, counts = np.unique(ibd_index_file['total_ibd_related_procedures'], return_counts=True)
print('Values:  Counts:')
for number in values:
    print(f'{values[number]}        {counts[number]}')

Values:  Counts:
0        49338
1        13918
2        2210
3        490
4        109
5        8


In [23]:
# Save the file.
ibd_index_file.to_csv('NRD_2017_Defined_Cohort.csv', index=False)