# Only modify the next cell

In [1]:
# Full path to the .csv file containing the narratives.
# The annotated dataframe is also saved back to this same path.
fn = "/Users/schabdachj/Data/clip/tables/rawdata/2021-06-17_clip_request_01.csv"

# Phrase(s) to search for in the lowercased report text. The list is handed to
# whichever annotation routine `workingOn` selects in the interactive cell
# (healthy, unhealthy or neurofibromatosis); the first entry is also used for
# the summary counts.
phrase = ["unremarkable"]

In [2]:
# Echo the configured search phrase(s) as a quick sanity check
print(phrase)

['unremarkable']


# Don't modify this bit

In [3]:
from IPython.display import clear_output
import pandas as pd
import numpy as np

# Remove every pandas display cap (rows, columns, cell width) so frames and
# long text cells are shown in full
for displayOption in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(displayOption, None)

In [4]:
def safelySaveDf(df, fn):
    """
    Write a stringified copy of `df` to `fn` as CSV.

    Every column is cast to str on a copy before writing, so the caller's
    dataframe keeps its dtypes. The index is written too (pandas default).

    Parameters:
        df: dataframe to save
        fn: destination path (a string)

    Returns:
        True when the file was written, False when the write was refused with
        a PermissionError (a hint is printed in that case). Any other
        exception propagates to the caller.
    """
    try:
        df.astype(str).to_csv(fn)
    except PermissionError:
        print("Error: write access to "+fn+" denied. Please check that the file is not locked by Datalad.")
        return False

    return True

In [5]:
def _markText(line, toMark, start, upperInList=False):
    """
    Wrap every occurrence of the phrase(s) in `line` with an ANSI colour code.

    Parameters:
        line: the text to decorate
        toMark: a single string, or a list whose items are converted with str()
        start: the escape sequence that switches the colour on
        upperInList: when True, phrases that arrive in a list are shown in
            upper case (a lone string is always shown unchanged)

    Returns:
        the decorated text. If `toMark` is neither a string nor a list, an
        error is printed and `line` is returned unchanged.
    """
    end = '\x1b[0m'  # reset all attributes

    if isinstance(toMark, str):
        phrases = [toMark]
        useUpper = False
    elif isinstance(toMark, list):
        phrases = [str(item) for item in toMark]
        useUpper = upperInList
    else:
        print("Error: the second argument must be either a string or a list of strings")
        return line

    for phrase in phrases:
        # str.replace with an empty pattern would insert the escape codes
        # between every character of the line, so empty phrases are skipped
        if not phrase:
            continue
        shown = phrase.upper() if useUpper else phrase
        line = line.replace(phrase, start+shown+end)

    return line


def markRedText(line, toMark):
    """
    Highlight the phrase(s) in `toMark` with a red background.

    `toMark` is a string or a list of strings; the decorated line is returned.
    """
    # SGR 5;30;41 = blink, black text, red background
    # (how attribute 5 is rendered depends on the front end)
    return _markText(line, toMark, '\x1b[5;30;41m')


def markGreenText(line, toMark):
    """
    Highlight the phrase(s) in `toMark` with a green background.

    `toMark` is a string or a list of strings; the decorated line is returned.
    NOTE: phrases passed in a list are additionally shown in UPPER CASE, while
    a single string keeps its case. This asymmetry is kept from the original
    implementation.
    """
    # SGR 5;30;42 = blink, black text, green background
    return _markText(line, toMark, '\x1b[5;30;42m', upperInList=True)


def markYellowText(line, toMark):
    """
    Highlight the phrase(s) in `toMark` with a yellow background.

    `toMark` is a string or a list of strings; the decorated line is returned.
    """
    # SGR 5;30;43 = blink, black text, yellow background
    return _markText(line, toMark, '\x1b[5;30;43m')


In [6]:
# def markBatch(df, absolutePhrase, ):
#     for idx, row in df.iterrows():
#         if absolutePhrase in row['narrative_text'].lower():
#             df['confirm_healthy'].iloc[idx] = False
#             df['reason'].iloc[idx] = absolutePhrase
            
#     print("Number of unannotated narratives:", df[df['confirm_healthy'].isnull()].shape[0])
#     print("Number of narratives containing \""+absolutePhrase+"\":", df[df['narrative_text'].str.lower().str.contains(absolutePhrase)].shape[0])
#     print("Number of unannotated narratives containing \""+absolutePhrase+"\":", df[(df['narrative_text'].str.lower().str.contains(absolutePhrase)) & df['confirm_healthy'].isnull()].shape[0])
#     print()

#     return df

In [7]:
def markNarrativeUnhealthy(df, fn, phrase):
    """
    Interactively confirm that narratives containing `phrase` are not healthy.

    Visits every row whose 'confirm_healthy' value is still missing and whose
    lowercased 'narrative_text' contains the phrase(s). The narrative (and the
    impression, when it is text) is printed with the phrase(s) highlighted and
    the annotator is asked what to do:

        y    - set 'confirm_healthy' to False and 'reason' to the phrase
        n    - leave the row untouched and move on
        save - write the dataframe to `fn`, then ask again
        exit - stop and return the dataframe as it currently stands

    Parameters:
        df: dataframe with 'narrative_text', 'impression_text',
            'confirm_healthy' and 'reason' columns; it is modified in place
        fn: path handed to safelySaveDf when the annotator types "save"
        phrase: a string, or a list of strings that must all be present

    Returns:
        the same dataframe with any new annotations applied

    Raises:
        TypeError: if `phrase` is neither a string nor a list
    """
    # The interactive cell passes a list, so accept both forms. Matching is
    # done against lowercased text, hence the lowercased needles.
    if isinstance(phrase, str):
        needles = [phrase.lower()]
        reason = phrase
    elif isinstance(phrase, list):
        needles = [str(item).lower() for item in phrase]
        reason = " ".join(str(item) for item in phrase)
    else:
        raise TypeError("the phrase parameter must be a string or a list of strings")

    def highlight(text):
        # Show each search phrase in red upper case
        for needle in needles:
            if needle:
                text = text.replace(needle, '\033[91m'+needle.upper()+'\033[0m')
        return text

    # Booleans and strings cannot be stored in an all-NaN float column without
    # an upcast, so make the annotation columns object dtype before writing
    for col in ('confirm_healthy', 'reason'):
        if df[col].dtype != object:
            df[col] = df[col].astype(object)

    count = 0

    for idx, row in df.iterrows():
        # Only rows that have not been annotated yet are of interest
        if not pd.isna(row['confirm_healthy']):
            continue

        rawNarrative = row['narrative_text']
        if not isinstance(rawNarrative, str):
            continue  # missing narrative, nothing to search

        narrative = rawNarrative.lower()
        if not all(needle in narrative for needle in needles):
            continue

        shownNarrative = highlight(narrative)
        shownImpression = None
        if isinstance(row['impression_text'], str):
            shownImpression = highlight(row['impression_text'].lower())

        while True:
            print(shownNarrative)
            print()
            if shownImpression is not None:
                print(shownImpression)
                print()

            confirm = input("Can you confirm the patient is not healthy (y/n/save/exit)?").strip().lower()

            if confirm == "y":
                # .loc with the iterrows label writes straight into df; the
                # former df[col].iloc[idx] was a chained assignment that also
                # treated the label as a position
                df.loc[idx, 'confirm_healthy'] = False
                df.loc[idx, 'reason'] = reason
                clear_output()
                break

            elif confirm == "save":
                clear_output()
                safelySaveDf(df, fn)
                # stay on this row and ask again

            elif confirm == "n":
                clear_output()
                break

            elif confirm == "exit":
                clear_output()
                print("Returning current dataframe state")
                return df

            else:
                print("Not a valid response")
                print()

        count += 1

        if count % 100 == 0:
            print("Holy cow, that's "+str(count)+" narratives! Drink some water and look out the window for a moment.")
            print()

        elif count % 10 == 0:
            print("Woo! You've annotated "+str(count)+" narratives!")
            print()

    print("Finished looking for", phrase)
    return df

In [8]:
def markNarrativeHealthy(df, fn, searchPhrase):
    """
    Interactively confirm whether narratives containing `searchPhrase`
    describe a healthy patient.

    Visits every row whose 'confirm_healthy' value is still missing and whose
    lowercased narrative plus impression text contains the phrase(s). The text
    is printed with the search phrase(s) in green and the "iffy" phrases in
    yellow, and the annotator is asked what to do:

        y    - ask for a reason, then set 'confirm_healthy' to True
        n    - ask for a reason, then set 'confirm_healthy' to False
        save - write the dataframe to `fn`, then ask again
        exit - stop and return the dataframe as it currently stands

    Parameters:
        df: dataframe with 'narrative_text', 'impression_text',
            'confirm_healthy' and 'reason' columns; it is modified in place
        fn: path handed to safelySaveDf when the annotator types "save"
        searchPhrase: a string, or a list of strings that must all be present

    Returns:
        the same dataframe with any new annotations applied, or -1 (after
        printing an error) when `searchPhrase` is neither a string nor a list
    """
    iffyPhrases = ["cyst", "cerebellar", "ectopia", "hypointens", "hyperintens",
                   "venous anomaly", "oma", "prolongation", "chiari", "hypoplasia",
                   "hemorrhage", "optic glioma", "mass"]
    badPhrases = []

    # Validate once up front instead of on every row. Matching is done against
    # lowercased text, hence the lowercased needles.
    if isinstance(searchPhrase, str):
        needles = [searchPhrase.lower()]
        greenTarget = needles[0]   # a lone string is highlighted without upper-casing
    elif isinstance(searchPhrase, list):
        needles = [str(item).lower() for item in searchPhrase]
        greenTarget = needles
    else:
        print("Error: the searchPhrase parameter must be a string or a list.")
        return -1

    # Booleans and strings cannot be stored in an all-NaN float column without
    # an upcast, so make the annotation columns object dtype before writing
    for col in ('confirm_healthy', 'reason'):
        if df[col].dtype != object:
            df[col] = df[col].astype(object)

    count = 0

    for idx, row in df.iterrows():
        # Only rows that have not been annotated yet are of interest
        if not pd.isna(row['confirm_healthy']):
            continue

        rawNarrative = row['narrative_text']
        if not isinstance(rawNarrative, str):
            continue  # missing narrative, nothing to search

        # Search the narrative and, when present, the impression text
        narr = rawNarrative.lower()
        if isinstance(row['impression_text'], str):
            narr += "\n\n"+ row['impression_text'].lower()

        if not all(needle in narr for needle in needles):
            continue

        # Colour the text once. Doing this inside the prompt loop would wrap
        # the already-inserted escape codes again after "save" or a typo.
        shown = markGreenText(narr, greenTarget)
        shown = markYellowText(shown, iffyPhrases)
        shown = markRedText(shown, badPhrases)

        while True:
            print(shown)
            print()

            confirm = input("Can you confirm the patient is healthy (y/n/save/exit)? ").strip().lower()

            if confirm == "y":
                reasonYes = input("Why is the patient healthy? ")
                # .loc with the iterrows label writes straight into df; the
                # former df[col].iloc[idx] was a chained assignment that also
                # treated the label as a position
                df.loc[idx, 'confirm_healthy'] = True
                df.loc[idx, 'reason'] = reasonYes
                clear_output()
                break

            elif confirm == "save":
                clear_output()
                print("Saving current dataframe state")
                safelySaveDf(df, fn)
                print()
                # stay on this row and ask again

            elif confirm == "n":
                reasonNo = input("Why is the patient not healthy? ")
                df.loc[idx, 'confirm_healthy'] = False
                df.loc[idx, 'reason'] = reasonNo
                clear_output()
                break

            elif confirm == "exit":
                clear_output()
                print("Returning current dataframe state")
                return df

            else:
                print("Not a valid response")
                print()

        count += 1

        if count % 100 == 0:
            print("Holy cow, that's "+str(count)+" narratives! Drink some water and look out the window for a moment.")
            print()

        elif count % 10 == 0:
            print("Woo! You've annotated "+str(count)+" narratives!")
            print()

    print("Finished looking for", str(searchPhrase))
    return df

In [9]:
def markNeurofibromatosis(df, fn, searchPhrase):
    """
    Interactively confirm neurofibromatosis and grade its severity.

    Adds the 'confirm_neurofibromatosis' and 'neurofibromatosis_severity'
    columns when they are missing, then visits every row whose
    'confirm_healthy' value is still missing and whose lowercased narrative
    plus impression text contains the phrase(s). Rows already graded
    'healthy' or 'mild' are passed over. The text is printed with the search
    phrase(s) in green, "iffy" phrases in yellow and "bad" phrases in red, and
    the annotator is asked what to do:

        y    - ask for a severity (healthy/mild/moderate/severe), then set
               'confirm_neurofibromatosis' to True, 'confirm_healthy' to True
               only for "healthy", and fill 'reason' and the severity column
        n    - set 'confirm_neurofibromatosis' to False
        skip - leave the row untouched and move on
        save - write the dataframe to `fn`, then ask again
        exit - stop and return the dataframe as it currently stands

    Parameters:
        df: dataframe with 'narrative_text', 'impression_text',
            'confirm_healthy' and 'reason' columns; it is modified in place
        fn: path handed to safelySaveDf when the annotator types "save"
        searchPhrase: a string, or a list of strings that must all be present

    Returns:
        the same dataframe with any new annotations applied, or -1 (after
        printing an error) when `searchPhrase` is neither a string nor a list
    """
    # First check if the dataframe is prepared to handle neurofibromatosis columns
    for col in ('confirm_neurofibromatosis', 'neurofibromatosis_severity'):
        if col not in df.columns:
            df[col] = np.nan

    # Booleans and strings cannot be stored in an all-NaN float column without
    # an upcast, so make the annotation columns object dtype before writing
    for col in ('confirm_healthy', 'reason',
                'confirm_neurofibromatosis', 'neurofibromatosis_severity'):
        if df[col].dtype != object:
            df[col] = df[col].astype(object)

    badPhrases = ["cyst", "cerebellar", "ectopia", "hypointens", "hyperintens",
                   "venous anomaly", "oma", "prolongation", "chiari", "hypoplasia",
                   "hemorrhage", "neoplasm", "lesion", "mass"]
    iffyPhrases = ["hyperintens", "foci", "focus", "spongiform", "hypointens", "prolongation", "signal", "optic glioma"]
    severityLevels = ['healthy', 'mild', 'moderate', 'severe']

    # Validate once up front instead of on every row. Matching is done against
    # lowercased text, hence the lowercased needles.
    if isinstance(searchPhrase, str):
        needles = [searchPhrase.lower()]
        greenTarget = needles[0]   # a lone string is highlighted without upper-casing
    elif isinstance(searchPhrase, list):
        needles = [str(item).lower() for item in searchPhrase]
        greenTarget = needles
    else:
        print("Error: the searchPhrase parameter must be a string or a list.")
        return -1

    count = 0

    for idx, row in df.iterrows():
        # Only rows that have not been annotated yet are of interest
        if not pd.isna(row['confirm_healthy']):
            continue

        rawNarrative = row['narrative_text']
        if not isinstance(rawNarrative, str):
            continue  # missing narrative, nothing to search

        # Search the narrative and, when present, the impression text
        narr = rawNarrative.lower()
        if isinstance(row['impression_text'], str):
            narr += "\n\n"+ row['impression_text'].lower()

        if not all(needle in narr for needle in needles):
            continue

        # Already graded as healthy/mild: move on without touching the counter,
        # so no milestone message is repeated for a row nobody annotated
        if row['neurofibromatosis_severity'] in ['healthy', 'mild']:
            continue

        # Colour the text once. Doing this inside the prompt loop would wrap
        # the already-inserted escape codes again after "save" or a typo.
        shown = markGreenText(narr, greenTarget)
        shown = markYellowText(shown, iffyPhrases)
        shown = markRedText(shown, badPhrases)

        while True:
            print(shown)
            print()

            confirm = input("Can you confirm the patient has neurofibromatosis (y/n/save/skip/exit)? ").strip().lower()

            if confirm == "y":
                # Get info about the severity of the pathology before writing
                # anything, so the row is never left half annotated
                pathology = ""
                while pathology not in severityLevels:
                    pathology = input("How significant is the pathology (healthy, mild, moderate, or severe)? ").strip().lower()
                    if pathology not in severityLevels:
                        print("Not a valid response.")

                # .loc with the iterrows label writes straight into df; the
                # former df[col].iloc[idx] was a chained assignment that also
                # treated the label as a position
                df.loc[idx, 'confirm_neurofibromatosis'] = True
                if pathology == "healthy":
                    df.loc[idx, 'confirm_healthy'] = True
                    df.loc[idx, 'reason'] = "neurofibromatosis, no pathology"
                else:
                    df.loc[idx, 'confirm_healthy'] = False
                    df.loc[idx, 'reason'] = "neurofibromatosis with pathology"
                df.loc[idx, 'neurofibromatosis_severity'] = pathology

                clear_output()
                break

            elif confirm == "save":
                clear_output()
                print("Saving current dataframe state")
                safelySaveDf(df, fn)
                # stay on this row and ask again

            elif confirm == "n":
                df.loc[idx, 'confirm_neurofibromatosis'] = False
                clear_output()
                break

            elif confirm == "exit":
                clear_output()
                print("Returning current dataframe state")
                return df

            elif confirm == "skip":
                clear_output()   # was a bare `clear_output` (never called)
                break

            else:
                print("Not a valid response")
                print()

        count += 1

        if count % 100 == 0:
            print("Holy cow, that's "+str(count)+" narratives! Drink some water and look out the window for a moment.")
            print()

        elif count % 10 == 0:
            print("Woo! You've annotated "+str(count)+" narratives!")
            print()

    print("Finished looking for NF1")
    return df

In [10]:
# Load the narratives table
dataDf = pd.read_csv(fn)

# Drop the unnamed index column that a previous to_csv call wrote out
if 'Unnamed: 0' in dataDf.columns:
    dataDf = dataDf.drop(columns=['Unnamed: 0'])

# The annotation columns hold booleans/strings mixed with NaN, so create them
# as object dtype rather than as float
for annotationCol in ("confirm_healthy", "reason"):
    if annotationCol not in dataDf.columns:
        dataDf[annotationCol] = pd.Series(np.nan, index=dataDf.index, dtype=object)

dataDf['proc_ord_id'] = dataDf['proc_ord_id'].astype(str)
print(type(dataDf['proc_ord_id'].values[0]))
print(dataDf['proc_ord_id'].values[0])

print("Loaded the dataframe")
print()
print(list(dataDf))
print()

# The summary uses the first phrase only (or the phrase itself when a plain
# string was configured). regex=False matches the plain substring test used by
# the annotation functions; na=False keeps rows without a narrative out.
countPhrase = phrase if isinstance(phrase, str) else phrase[0]
hasPhrase = dataDf['narrative_text'].str.lower().str.contains(countPhrase, regex=False, na=False)
isUnannotated = dataDf['confirm_healthy'].isnull()

print("Number of narratives:", dataDf.shape[0])
print("Number of unannotated narratives:", int(isUnannotated.sum()))
print("Number of narratives containing \""+countPhrase+"\":", int(hasPhrase.sum()))
print("Number of unannotated narratives containing \""+countPhrase+"\":", int((hasPhrase & isUnannotated).sum()))

<class 'str'>
66168715881
Loaded the dataframe

['pat_id', 'proc_ord_id', 'proc_name', 'cpt_code', 'modality', 'body_region', 'narrative_text', 'impression_text', 'proc_year', 'proc_ord_age', 'combo_id', 'confirm_healthy', 'reason', 'confirm_neurofibromatosis', 'neurofibromatosis_severity']

Number of narratives: (120040, 15)
Number of unannotated narratives: (85647, 15)
Number of narratives containing "unremarkable": 43820
Number of unannotated narratives containing "unremarkable": 31572


# Interactive Part

In [None]:
# Which annotation workflow to run: "healthy", "unhealthy" or "neurofibromatosis"
workingOn = "healthy"

# The summary uses the first phrase only (or the phrase itself when a plain
# string was configured)
countPhrase = phrase if isinstance(phrase, str) else phrase[0]


def printAnnotationCounts(df):
    """Print how many narratives are unannotated and how many contain countPhrase."""
    # regex=False matches the plain substring test used by the annotation
    # functions; na=False keeps rows without a narrative out of the mask
    hasPhrase = df['narrative_text'].str.lower().str.contains(countPhrase, regex=False, na=False)
    isUnannotated = df['confirm_healthy'].isnull()

    print("Number of unannotated narratives:", int(isUnannotated.sum()))
    print("Number of narratives containing \""+countPhrase+"\":", int(hasPhrase.sum()))
    print("Number of unannotated narratives containing \""+countPhrase+"\":", int((hasPhrase & isUnannotated).sum()))
    print()


printAnnotationCounts(dataDf)

annotators = {
    "unhealthy": markNarrativeUnhealthy,
    "healthy": markNarrativeHealthy,
    "neurofibromatosis": markNeurofibromatosis,
}

if workingOn in annotators:
    result = annotators[workingOn](dataDf, fn, phrase)
    # The annotators return -1 on a bad phrase; never let that replace the data
    if isinstance(result, pd.DataFrame):
        dataDf = result
else:
    print("Error: workingOn must be one of", list(annotators))

printAnnotationCounts(dataDf)

safelySaveDf(dataDf, fn)

Number of unannotated narratives: 85647
Number of narratives containing "unremarkable": 43820
Number of unannotated narratives containing "unremarkable": 31572

technique:  sagittal t1 axial t2 axial flair coronal flair axial diffusion axial mt axial coronal and sagittal post gad mt. the posterior fossa is [5;30;42mUNREMARKABLE[0m. the ventricles are normal in size and shape without midline shift. there is no intra-axial mas or extra-axial collection. there is no abnormal enhancement. normal cerebrovascular flow-voids are visualized.  the orbits and mastoid air cells are grossly [5;30;42mUNREMARKABLE[0m. impression: [5;30;42mUNREMARKABLE[0m mri of the head. 70553 183 end of impression:



In [None]:
# Count the narratives that contain a given string
# (the batch-marking call at the bottom of this cell is currently disabled)

absolutePhrase = "percentile"

# The working frame is called dataDf; this cell referred to an undefined `df`.
# regex=False treats the phrase as plain text; na=False skips missing narratives.
hasAbsolutePhrase = dataDf['narrative_text'].str.lower().str.contains(absolutePhrase, regex=False, na=False)
isUnannotated = dataDf['confirm_healthy'].isnull()

print("Number of unannotated narratives:", int(isUnannotated.sum()))
print("Number of narratives containing \""+absolutePhrase+"\":", int(hasAbsolutePhrase.sum()))
print("Number of unannotated narratives containing \""+absolutePhrase+"\":", int((hasAbsolutePhrase & isUnannotated).sum()))
print()

# NOTE(review): batchMarkNotHealthy is not defined in the visible notebook
# dataDf = batchMarkNotHealthy(dataDf, absolutePhrase)

In [32]:
# Session/subject counts for rows where `condition` equals `status`, followed
# by how many of those narratives mention NF type 1, NF type 2, schwannoma,
# or no type at all
condition = "confirm_neurofibromatosis"
status = True

nf1Pattern = "type i |type 1|nf1|nfi |nf 1|nf i | neurofibromatosis 1| neurofibromatosis i "
nf2Pattern = "type ii |type 2|nf2|nfii |nf 2|nf ii| neurofibromatosis 2| neurofibromatosis ii "
anyTypePattern = "type ii |type 2|nf2|nfii |nf 2|nf ii| neurofibromatosis 1| neurofibromatosis i | neurofibromatosis 2| neurofibromatosis ii |type i |type 1|nf1|nfi |nf 1|nf i "

tmpDf = dataDf[dataDf[condition] == status]
narrativesLower = tmpDf['narrative_text'].str.lower()

print(condition, status, "session count:", tmpDf.shape[0])
print(condition, status, "subject count:", len(set(tmpDf['pat_id'])))

print(condition, status, "healthy:", tmpDf[tmpDf['confirm_healthy'] == True].shape[0])

# Narratives mentioning type 1: sessions, then distinct patients
tmp2 = tmpDf[narrativesLower.str.contains(nf1Pattern)]
print(tmp2.shape[0])
print(len(set(tmp2['pat_id'])))

# Narratives mentioning type 2, schwannoma, and neither type
print(tmpDf[narrativesLower.str.contains(nf2Pattern)].shape[0])
print(tmpDf[narrativesLower.str.contains("schwannoma")].shape[0])
print(tmpDf[~narrativesLower.str.contains(anyTypePattern)].shape[0])


confirm_neurofibromatosis True session count: 1480
confirm_neurofibromatosis True subject count: 289
confirm_neurofibromatosis True healthy: 60
1240
243
26
37
214


In [71]:
# Convert this/the printing bits into a function that gets called in different functions
# Preview the three highlight colours in the order green, yellow, red
for backgroundCode in ('42', '43', '41'):
    print('\x1b[5;30;' + backgroundCode + 'm' + 'Success!' + '\x1b[0m')


[5;30;42mSuccess![0m
[5;30;43mSuccess![0m
[5;30;41mSuccess![0m


In [None]:
# # Fixing an oops

# smolDf = pd.read_csv("/Users/schabdachj/Data/healthy_mr_cohort_jmy_annotations.csv")
# if "Unnamed: 0" in list(smolDf):
#     smolDf = smolDf.drop(columns={"Unnamed: 0"})
# print(smolDf.shape)
# print(smolDf[(smolDf['narrative_text'].str.lower().str.contains("unremarkable brain")) & (smolDf['confirm_healthy'] == True)].shape)

# test = smolDf.combine_first(df)
# print(test.shape)
# print(test[(test['narrative_text'].str.lower().str.contains("unremarkable brain")) & (test['narrative_text'].str.lower().str.contains("3 tesla|3.0 tesla")) & (test['confirm_healthy'] == True)].shape)

In [None]:
# Looking at potential neurofibromatosis patients...
import matplotlib.pyplot as plt

# Narratives that mention neurofibromatosis, restricted to patients younger
# than ten years at the time of the procedure
nfPattern = "neurofibromatosis|nf i|nf 1|nf1"
maxAgeDays = 3652.5  # 10 years * 365.25 days; proc_ord_age is plotted as days below

# The working frame is called dataDf; this cell referred to an undefined `df`.
# na=False keeps rows without a narrative out of the mask.
mentionsNf = dataDf['narrative_text'].str.lower().str.contains(nfPattern, na=False)
youngNf = dataDf[mentionsNf & (dataDf['proc_ord_age'] < maxAgeDays)]

ids = youngNf['pat_id'].drop_duplicates()
print(ids.shape)

ages = youngNf['proc_ord_age']
# ages = ages/365.25
# ages = dataDf[(dataDf['proc_ord_age'] < 3652.5)]['proc_ord_age']

print(min(ages))
print(max(ages))
print(len(ages))

fig = plt.figure()
fig.patch.set_facecolor('w')
plt.hist(ages)
plt.ylabel("Count")
plt.xlabel("Age in days")
plt.title("Histogram of Patient Ages under 10 Years")

In [23]:
# Write the current annotations back to the CSV at `fn`
safelySaveDf(dataDf, fn)

True