# Only modify the next cell

In [1]:
# Full path to the .csv file containing the narratives.
# NOTE: the cells below also write the annotated dataframe back to this same path.
fn = "/Users/schabdachj/Data/clinical_healthy/tables/rawdata/healthy_mr_cohort_jmy_annotations.csv"

# Phrase indicating non-healthy patients.
# It is matched against the lower-cased narrative text, so write it in lower case.
phrase = "neurofibromatosis"

# Don't modify this bit

In [2]:
from IPython.display import clear_output
import pandas as pd
import numpy as np

# Lift every pandas display limit so full tables and untruncated narrative
# text are shown in the cell output.
for displayOption in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(displayOption, None)

In [3]:
def markNarrativeUnhealthy(df, fn, phrase):
    """Interactively mark narratives containing `phrase` as not healthy.

    Visits every row of `df` whose narrative text contains `phrase` and whose
    'confirm_healthy' value is not already a bool. The narrative (and the
    impression, when there is one) is printed with the phrase highlighted in
    red, and the annotator is asked what to do:

    * ``y``    - set 'confirm_healthy' to False and 'reason' to `phrase`
    * ``n``    - leave the row unannotated and move on
    * ``save`` - write `df` to `fn`, then ask again about the same row
    * ``exit`` - stop immediately and return `df`

    Rows whose narrative is missing (not a string) are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Expected to have 'narrative_text', 'impression_text',
        'confirm_healthy' and 'reason' columns. Modified in place.
    fn : str
        Path of the .csv file written when the annotator types ``save``.
    phrase : str
        Phrase to look for. Narratives are lower-cased before matching, so
        the phrase should be lower case.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with any new annotations applied.
    """
    # Annotations are Python bools / strings mixed with NaN, so the columns
    # have to be object dtype; a freshly created all-NaN column is float64.
    for col in ('confirm_healthy', 'reason'):
        if col in df.columns and df[col].dtype != object:
            df[col] = df[col].astype(object)

    count = 0
    redPhrase = '\033[91m' + phrase.upper() + '\033[0m'

    for idx, row in df.iterrows():
        narrative = row['narrative_text']

        # A missing narrative is NaN (a float): there is nothing to search
        if not isinstance(narrative, str):
            continue

        narrative = narrative.lower()

        # Only rows that mention the phrase and have not been annotated yet
        if phrase not in narrative or isinstance(row['confirm_healthy'], bool):
            continue

        impression = row['impression_text']

        while True:

            print(narrative.replace(phrase, redPhrase))
            print()
            if isinstance(impression, str):
                print(impression.lower().replace(phrase, redPhrase))
                print()

            confirm = input("Can you confirm the patient is not healthy (y/n/save/exit)?").strip().lower()

            if confirm == "y":
                # Write through .loc with the index label from iterrows().
                # Chained `df[col].iloc[idx] = ...` treats the label as a
                # position and may only update a temporary copy.
                df.loc[idx, 'confirm_healthy'] = False
                df.loc[idx, 'reason'] = phrase
                clear_output()
                break

            elif confirm == "n":
                clear_output()
                break

            elif confirm == "save":
                # Save, then show the same narrative again
                clear_output()
                print("Saving current dataframe state")
                df.to_csv(fn)
                print()

            elif confirm == "exit":
                clear_output()
                print("Returning current dataframe state")
                return df

            else:
                print("Not a valid response")
                print()

        count += 1

        # Progress messages (printed after the screen has been cleared)
        if count % 100 == 0:
            print("Holy cow, that's "+str(count)+" narratives! Drink some water and look out the window for a moment.")
            print()

        elif count % 10 == 0:
            print("Woo! You've annotated "+str(count)+" narratives!")
            print()

    print("Finished looking for", phrase)
    return df

In [4]:
def markNarrativeHealthy(df, fn, phrase):
    """Interactively confirm that narratives containing `phrase` are healthy.

    Visits every row of `df` whose narrative text contains `phrase`, mentions
    a 3 tesla scanner ("3 tesla" or "3.0 tesla"), and whose 'confirm_healthy'
    value is not already a bool. The narrative and impression are printed
    with `phrase` highlighted and a list of potentially worrying phrases
    highlighted in a second colour. The annotator is then asked what to do:

    * ``y``    - set 'confirm_healthy' to True and 'reason' to `phrase`
    * ``n``    - ask for a reason, set 'confirm_healthy' to False and
                 'reason' to the text that was typed
    * ``save`` - write `df` to `fn`, then ask again about the same row
    * ``exit`` - stop immediately and return `df`

    Rows whose narrative is missing (not a string) are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Expected to have 'narrative_text', 'impression_text',
        'confirm_healthy' and 'reason' columns. Modified in place.
    fn : str
        Path of the .csv file written when the annotator types ``save``.
    phrase : str
        Phrase to look for. Narratives are lower-cased before matching, so
        the phrase should be lower case.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with any new annotations applied.
    """
    # Annotations are Python bools / strings mixed with NaN, so the columns
    # have to be object dtype; a freshly created all-NaN column is float64.
    for col in ('confirm_healthy', 'reason'):
        if col in df.columns and df[col].dtype != object:
            df[col] = df[col].astype(object)

    count = 0

    iffyPhrases = ["cyst", "cerebellar", "ectopia", "hypointens", "hyperintens",
                   "venous anomaly", "oma", "prolongation", "chiari", "hypoplasia",
                   "hemorrhage"]

    greenPhrase = '\x1b[5;30;42m' + phrase.upper() + '\x1b[0m'

    for idx, row in df.iterrows():
        narrative = row['narrative_text']

        # A missing narrative is NaN (a float): there is nothing to search
        if not isinstance(narrative, str):
            continue

        narrative = narrative.lower()
        is3T = "3 tesla" in narrative or "3.0 tesla" in narrative

        # Only unannotated 3T narratives that mention the phrase
        if phrase not in narrative or isinstance(row['confirm_healthy'], bool) or not is3T:
            continue

        # Build the highlighted text once; it does not change between prompts
        narr = narrative.replace(phrase, greenPhrase)

        impression = row['impression_text']
        if isinstance(impression, str):
            narr += "\n" + impression.lower().replace(phrase, greenPhrase)

        # The phrase itself was upper-cased above, so these lower-case
        # replacements cannot match inside it
        for p in iffyPhrases:
            narr = narr.replace(p, '\x1b[5;35;44m' + p + '\x1b[0m')

        while True:

            print(narr)
            print()

            confirm = input("Can you confirm the patient is healthy (y/n/save/exit)? ").strip().lower()

            if confirm == "y":
                # Write through .loc with the index label from iterrows().
                # Chained `df[col].iloc[idx] = ...` treats the label as a
                # position and may only update a temporary copy.
                df.loc[idx, 'confirm_healthy'] = True
                df.loc[idx, 'reason'] = phrase
                clear_output()
                break

            elif confirm == "n":
                reason = input("Why is the patient not healthy? ")
                df.loc[idx, 'confirm_healthy'] = False
                df.loc[idx, 'reason'] = reason
                clear_output()
                break

            elif confirm == "save":
                # Save, then show the same narrative again
                clear_output()
                print("Saving current dataframe state")
                df.to_csv(fn)
                print()

            elif confirm == "exit":
                clear_output()
                print("Returning current dataframe state")
                return df

            else:
                print("Not a valid response")
                print()

        count += 1

        # Progress messages (printed after the screen has been cleared)
        if count % 100 == 0:
            print("Holy cow, that's "+str(count)+" narratives! Drink some water and look out the window for a moment.")
            print()

        elif count % 10 == 0:
            print("Woo! You've annotated "+str(count)+" narratives!")
            print()

    print("Finished looking for", phrase)
    return df

In [67]:
def markNeurofibromatosis(df, fn, phrase):
    """Interactively annotate narratives for neurofibromatosis.

    Adds the 'confirm_neurofibromatosis' and 'neurofibromatosis_severity'
    columns to `df` if they are missing. It then visits every row whose
    narrative text contains `phrase`, mentions a 3 tesla scanner ("3 tesla"
    or "3.0 tesla"), and whose 'confirm_healthy' value is not already a bool.
    Note that 'confirm_healthy', not 'confirm_neurofibromatosis', decides
    whether a row still needs to be looked at.

    The narrative and impression are printed with `phrase` and two lists of
    potentially relevant phrases highlighted. The annotator is asked:

    * ``y``    - 'confirm_neurofibromatosis' is set to True, followed by a
                 y/n question on whether the patient also counts as healthy
    * ``n``    - 'confirm_neurofibromatosis' is set to False, followed by
                 the same y/n question; if not healthy, a free-text reason
                 is requested
    * ``save`` - write `df` to `fn`, then ask again about the same row
    * ``exit`` - stop immediately and return `df`

    The follow-up answer sets 'confirm_healthy', 'reason' and
    'neurofibromatosis_severity' ("healthy" or "severe").

    Rows whose narrative is missing (not a string) are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Expected to have 'narrative_text', 'impression_text',
        'confirm_healthy' and 'reason' columns. Modified in place.
    fn : str
        Path of the .csv file written when the annotator types ``save``.
    phrase : str
        Phrase to look for. Narratives are lower-cased before matching, so
        the phrase should be lower case.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with any new annotations applied.
    """

    def askHealthy(question):
        # Ask until the answer is y or n. Every answer typed is checked, so
        # nothing the annotator enters is silently thrown away.
        while True:
            answer = input(question).strip().lower()
            if answer in ("y", "n"):
                return answer
            print("Not a valid response.")

    # First check if the dataframe is prepared to handle neurofibromatosis columns
    for col in ('confirm_neurofibromatosis', 'neurofibromatosis_severity'):
        if col not in df.columns:
            df[col] = np.nan

    # Annotations are Python bools / strings mixed with NaN, so the columns
    # have to be object dtype; a freshly created all-NaN column is float64.
    for col in ('confirm_healthy', 'reason',
                'confirm_neurofibromatosis', 'neurofibromatosis_severity'):
        if col in df.columns and df[col].dtype != object:
            df[col] = df[col].astype(object)

    # Initialize variables
    count = 0
    iffyPhrases = ["cyst", "cerebellar", "ectopia", "hypointens", "hyperintens",
                   "venous anomaly", "oma", "prolongation", "chiari", "hypoplasia",
                   "hemorrhage"]
    iffyPhrasesNf = ["hyperintens", "foci", "focus", "spongiform", "hypointens", "prolongation"]

    greenPhrase = '\x1b[5;30;42m' + phrase.upper() + '\x1b[0m'

    for idx, row in df.iterrows():
        narrative = row['narrative_text']

        # A missing narrative is NaN (a float): there is nothing to search
        if not isinstance(narrative, str):
            continue

        narrative = narrative.lower()
        is3T = "3 tesla" in narrative or "3.0 tesla" in narrative

        # Only unannotated 3T narratives that mention the phrase
        if phrase not in narrative or isinstance(row['confirm_healthy'], bool) or not is3T:
            continue

        # Build the highlighted text once; it does not change between prompts
        narr = narrative.replace(phrase, greenPhrase)

        impression = row['impression_text']
        if isinstance(impression, str):
            narr += "\n" + impression.lower().replace(phrase, greenPhrase)

        for p in iffyPhrases:
            narr = narr.replace(p, '\x1b[6;35;46m' + p + '\x1b[0m')

        # Phrases that appear in both lists end up wrapped in both colour codes
        for p in iffyPhrasesNf:
            narr = narr.replace(p, '\x1b[6;37;41m' + p + '\x1b[0m')

        while True:

            print(narr)
            print()

            confirm = input("Can you confirm the patient has neurofibromatosis (y/n/save/exit)? ").strip().lower()

            if confirm == "y":
                # Write through .loc with the index label from iterrows().
                # Chained `df[col].iloc[idx] = ...` treats the label as a
                # position and may only update a temporary copy.
                df.loc[idx, 'confirm_neurofibromatosis'] = True

                # NOTE: severity is only recorded as "healthy" or "severe"
                healthy = askHealthy("Can the patient also be categorized as healthy (y/n)? ")

                if healthy == "y":
                    df.loc[idx, 'confirm_healthy'] = True
                    df.loc[idx, 'reason'] = phrase+", no pathology"
                    df.loc[idx, 'neurofibromatosis_severity'] = "healthy"
                else:
                    df.loc[idx, 'confirm_healthy'] = False
                    df.loc[idx, 'reason'] = phrase+" with pathology"
                    df.loc[idx, 'neurofibromatosis_severity'] = "severe"

                clear_output()
                break

            elif confirm == "n":
                df.loc[idx, 'confirm_neurofibromatosis'] = False

                healthy = askHealthy("Can the patient be categorized as healthy (y/n)? ")

                if healthy == "y":
                    df.loc[idx, 'confirm_healthy'] = True
                    df.loc[idx, 'reason'] = phrase+", no pathology"
                    df.loc[idx, 'neurofibromatosis_severity'] = "healthy"
                else:
                    reason = input("Why is the patient not healthy? ")
                    df.loc[idx, 'confirm_healthy'] = False
                    df.loc[idx, 'reason'] = reason
                    df.loc[idx, 'neurofibromatosis_severity'] = "severe"

                clear_output()
                break

            elif confirm == "save":
                # Save, then show the same narrative again
                clear_output()
                print("Saving current dataframe state")
                df.to_csv(fn)
                print()

            elif confirm == "exit":
                clear_output()
                print("Returning current dataframe state")
                return df

            else:
                print("Not a valid response")
                print()

        count += 1

        # Progress messages (printed after the screen has been cleared)
        if count % 100 == 0:
            print("Holy cow, that's "+str(count)+" narratives! Drink some water and look out the window for a moment.")
            print()

        elif count % 10 == 0:
            print("Woo! You've annotated "+str(count)+" narratives!")
            print()

    print("Finished looking for", phrase)
    return df

In [81]:
# Load the narratives. A previous save writes the dataframe index as an
# extra, unnamed first column; drop it if it is there.
df = pd.read_csv(fn)
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# Make sure the annotation columns exist
for annotationColumn in ("confirm_healthy", "reason"):
    if annotationColumn not in df.columns:
        df[annotationColumn] = np.nan


# Adding a filter for only the unremarkables
# df = df[(df['narrative_text'].str.lower().str.contains("unremarkable", na=False) | df['impression_text'].str.lower().str.contains("unremarkable", na=False))]

# Summary of how much annotation work is left
print("Loaded the dataframe")
print()
print("Number of narratives:", df.shape)
isUnannotated = df['confirm_healthy'].isnull()
print("Number of unannotated narratives:", df[isUnannotated].shape)
containsPhrase = df['narrative_text'].str.lower().str.contains(phrase)
print("Number of narratives containing \""+phrase+"\":", df[containsPhrase].shape[0])
print("Number of unannotated narratives containing \""+phrase+"\":", df[containsPhrase & isUnannotated].shape[0])

# print()
# print("Only want to look at 3T scans for now:")
# df = df[df['narrative_text'].str.lower().str.contains("3 tesla|3.0 tesla")]
# print(df.shape)

Loaded the dataframe

Number of narratives: (120009, 14)
Number of unannotated narratives: (85980, 14)
Number of narratives containing "neurofibromatosis": 7040
Number of unannotated narratives containing "neurofibromatosis": 3599


# Interactive Part

In [83]:
# Which annotation pass to run: "unhealthy", "healthy" or "neurofibromatosis"
workingOn = "neurofibromatosis"

# Counts before annotating
isUnannotated = df['confirm_healthy'].isnull()
print("Number of unannotated narratives:", df[isUnannotated].shape[0])
containsPhrase = df['narrative_text'].str.lower().str.contains(phrase)
print("Number of narratives containing \""+phrase+"\":", df[containsPhrase].shape[0])
print("Number of unannotated narratives containing \""+phrase+"\":", df[containsPhrase & isUnannotated].shape[0])
print()

# Pick the annotation function for the chosen pass; an unknown value runs nothing
annotators = {
    "unhealthy": markNarrativeUnhealthy,
    "healthy": markNarrativeHealthy,
    "neurofibromatosis": markNeurofibromatosis,
}
annotate = annotators.get(workingOn)
if annotate is not None:
    df = annotate(df, fn, phrase)

# Counts after annotating (the masks are recomputed because df has changed)
isUnannotated = df['confirm_healthy'].isnull()
print("Number of unannotated narratives:", df[isUnannotated].shape[0])
containsPhrase = df['narrative_text'].str.lower().str.contains(phrase)
print("Number of narratives containing \""+phrase+"\":", df[containsPhrase].shape[0])
print("Number of unannotated narratives containing \""+phrase+"\":", df[containsPhrase & isUnannotated].shape[0])
print()

# Always write the result back to the source file
df.to_csv(fn)

Returning current dataframe state
Number of unannotated narratives: 85881
Number of narratives containing "neurofibromatosis": 7040
Number of unannotated narratives containing "neurofibromatosis": 3500



In [36]:
# Drop all narratives with occurrence of string
# (currently this cell only reports counts; the batch marking below is disabled)

absolutePhrase = "percentile"

isUnannotated = df['confirm_healthy'].isnull()
print("Number of unannotated narratives:", df[isUnannotated].shape[0])
containsAbsolutePhrase = df['narrative_text'].str.lower().str.contains(absolutePhrase)
print("Number of narratives containing \""+absolutePhrase+"\":", df[containsAbsolutePhrase].shape[0])
print("Number of unannotated narratives containing \""+absolutePhrase+"\":", df[containsAbsolutePhrase & isUnannotated].shape[0])
print()



# def markBatch(df, absolutePhrase, ):
#     for idx, row in df.iterrows():
#         if absolutePhrase in row['narrative_text'].lower():
#             df['confirm_healthy'].iloc[idx] = False
#             df['reason'].iloc[idx] = absolutePhrase
            
#     print("Number of unannotated narratives:", df[df['confirm_healthy'].isnull()].shape[0])
#     print("Number of narratives containing \""+absolutePhrase+"\":", df[df['narrative_text'].str.lower().str.contains(absolutePhrase)].shape[0])
#     print("Number of unannotated narratives containing \""+absolutePhrase+"\":", df[(df['narrative_text'].str.lower().str.contains(absolutePhrase)) & df['confirm_healthy'].isnull()].shape[0])
#     print()

#     return df

        
# df = batchMarkNotHealthy(df, absolutePhrase)

Number of unannotated narratives: 87174
Number of narratives containing "percentile": 3036
Number of unannotated narratives containing "percentile": 2721



In [None]:
# tmpDf = df[df['narrative_text'].str.lower().str.contains("unremarkable brain")]

# print("unremarkable:", tmpDf.shape[0])

# How the narratives that say "unremarkable brain" have been annotated so far
saysUnremarkableBrain = df['narrative_text'].str.lower().str.contains("unremarkable brain")
print("healthy:", df[(df['confirm_healthy'] == True) & saysUnremarkableBrain].shape[0])
print("not healthy:", df[(df['confirm_healthy'] == False) & saysUnremarkableBrain].shape[0])

In [66]:
# Preview the three ANSI highlight styles used by the annotation functions
for ansiStyle in ('\x1b[5;30;42m', '\x1b[6;35;46m', '\x1b[6;37;41m'):
    print(ansiStyle + 'Success!' + '\x1b[0m')


[5;30;42mSuccess![0m
[6;35;46mSuccess![0m
[6;37;41mSuccess![0m


In [None]:
# # Fixing an oops

# smolDf = pd.read_csv("/Users/schabdachj/Data/healthy_mr_cohort_jmy_annotations.csv")
# if "Unnamed: 0" in list(smolDf):
#     smolDf = smolDf.drop(columns={"Unnamed: 0"})
# print(smolDf.shape)
# print(smolDf[(smolDf['narrative_text'].str.lower().str.contains("unremarkable brain")) & (smolDf['confirm_healthy'] == True)].shape)

# test = smolDf.combine_first(df)
# print(test.shape)
# print(test[(test['narrative_text'].str.lower().str.contains("unremarkable brain")) & (test['narrative_text'].str.lower().str.contains("3 tesla|3.0 tesla")) & (test['confirm_healthy'] == True)].shape)

In [13]:
# WARNING: this overwrites the whole 'confirm_neurofibromatosis' column with
# NaN, discarding any neurofibromatosis annotations already stored in df.
# markNeurofibromatosis() creates the column itself when it is missing.
df['confirm_neurofibromatosis'] = np.nan

In [None]:
# Looking at potential neurofibromatosis patients...
import matplotlib.pyplot as plt

# Narratives that mention neurofibromatosis (or an NF1 abbreviation) for
# patients under 10 years old. 3652.5 is 10 * 365.25, i.e. the age column is
# treated as days (see the x-axis label below).
nfPattern = "neurofibromatosis|nf i|nf 1|nf1"
mentionsNf = df['narrative_text'].str.lower().str.contains(nfPattern)
underTenYears = df['proc_ord_age'] < 3652.5

# Evaluate the filter once and reuse it for both the ids and the ages
youngNf = df[mentionsNf & underTenYears]

# Number of distinct patients among those narratives
ids = youngNf['pat_id'].drop_duplicates()
print(ids.shape)

ages = youngNf['proc_ord_age']
# ages = ages/365.25
# ages = df[(df['proc_ord_age'] < 3652.5)]['proc_ord_age']


print(min(ages))
print(max(ages))
print(len(ages))

fig = plt.figure()
fig.patch.set_facecolor('w')
plt.hist(ages)
plt.ylabel("Count")
plt.xlabel("Age in days")
plt.title("Histogram of Patient Ages under 10 Years")