In [1]:
import csv
import pandas as pd
import re

In [2]:
df = pd.read_csv('output.csv')

In [3]:
df

Unnamed: 0,subject_id,study_id,path,content
0,10000032,50414267,files/p10/p10000032/s50414267.txt,FINAL REPORT\...
1,10000032,53189527,files/p10/p10000032/s53189527.txt,FINAL REPORT\...
2,10000032,53911762,files/p10/p10000032/s53911762.txt,FINAL REPORT\...
3,10000032,56699142,files/p10/p10000032/s56699142.txt,FINAL REPORT\...
4,10000764,57375967,files/p10/p10000764/s57375967.txt,FINAL REPORT\...
...,...,...,...,...
227830,19999442,58708861,files/p19/p19999442/s58708861.txt,FINAL REPORT\...
227831,19999733,57132437,files/p19/p19999733/s57132437.txt,FINAL REPORT\...
227832,19999987,55368167,files/p19/p19999987/s55368167.txt,FINAL REPORT\...
227833,19999987,58621812,files/p19/p19999987/s58621812.txt,FINAL REPORT\...


# START HERE

## cleaning/rearranging

In [4]:
df["content"][0]

'                                 FINAL REPORT\n EXAMINATION:  CHEST (PA AND LAT)\n \n INDICATION:  ___F with new onset ascites  // eval for infection\n \n TECHNIQUE:  Chest PA and lateral\n \n COMPARISON:  None.\n \n FINDINGS: \n \n There is no focal consolidation, pleural effusion or pneumothorax.  Bilateral\n nodular opacities that most likely represent nipple shadows. The\n cardiomediastinal silhouette is normal.  Clips project over the left lung,\n potentially within the breast. The imaged upper abdomen is unremarkable.\n Chronic deformity of the posterior left sixth and seventh ribs are noted.\n \n IMPRESSION: \n \n No acute cardiopulmonary process.\n'

In [5]:
def clean_text(text):
    #replace unwanted chars and whitespace
    cleaned_text = re.sub(r"[\n\r\t]+|___|//", " ", text)
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)
    
    return cleaned_text.strip()



In [6]:
df['cleaned_content'] = df['content'].apply(clean_text)

In [7]:
df["cleaned_content"][0]

'FINAL REPORT EXAMINATION: CHEST (PA AND LAT) INDICATION: F with new onset ascites eval for infection TECHNIQUE: Chest PA and lateral COMPARISON: None. FINDINGS: There is no focal consolidation, pleural effusion or pneumothorax. Bilateral nodular opacities that most likely represent nipple shadows. The cardiomediastinal silhouette is normal. Clips project over the left lung, potentially within the breast. The imaged upper abdomen is unremarkable. Chronic deformity of the posterior left sixth and seventh ribs are noted. IMPRESSION: No acute cardiopulmonary process.'

In [8]:
def remove_final_report(text):
    # regex patterns to drop cause no value
    pattern = r"FINAL REPORT|CHEST RADIOGRAPH PERFORMED ON|PORTABLE CHEST OF|PORTABLE AP CHEST X-RAY"
    cleaned_text = re.sub(pattern, "", text, flags=re.IGNORECASE)
    
    return cleaned_text.strip()



In [9]:
df['cleaned_content'] = df['cleaned_content'].apply(remove_final_report)

In [10]:
def clean_comparison(df):
    #edit messed up important labels
    df['cleaned_content'] = df['cleaned_content'].str.replace('OMPARISON', 'COMPARISON')
    df['cleaned_content'] = df['cleaned_content'].str.replace('COMPARISONS', 'COMPARISON')
    df['cleaned_content'] = df['cleaned_content'].str.replace('INDCATION', 'INDICATION')
    df['cleaned_content'] = df['cleaned_content'].str.replace('IDICATION', 'INDICATION')
    return df


In [11]:
clean_comparison(df)

Unnamed: 0,subject_id,study_id,path,content,cleaned_content
0,10000032,50414267,files/p10/p10000032/s50414267.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: F ...
1,10000032,53189527,files/p10/p10000032/s53189527.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: Hi...
2,10000032,53911762,files/p10/p10000032/s53911762.txt,FINAL REPORT\...,EXAMINATION: CHEST (PORTABLE AP) INDICATION: F...
3,10000032,56699142,files/p10/p10000032/s56699142.txt,FINAL REPORT\...,INDICATION: year old woman with cirrhosis. TEC...
4,10000764,57375967,files/p10/p10000764/s57375967.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: M ...
...,...,...,...,...,...
227830,19999442,58708861,files/p19/p19999442/s58708861.txt,FINAL REPORT\...,"INDICATION: Patient with intubation, evaluatio..."
227831,19999733,57132437,files/p19/p19999733/s57132437.txt,FINAL REPORT\...,INDICATION: -year-old with chest pain. TECHNIQ...
227832,19999987,55368167,files/p19/p19999987/s55368167.txt,FINAL REPORT\...,CCOMPARISON: radiograph. FINDINGS: There has b...
227833,19999987,58621812,files/p19/p19999987/s58621812.txt,FINAL REPORT\...,CCOMPARISON: Prior chest radiograph from earli...


In [12]:
def clean_up_comparison(text):
    # fix the double c
    pattern = r"CCOMPARISON"
    cleaned_text = re.sub(pattern, "COMPARISON", text, flags=re.IGNORECASE)
    return cleaned_text.strip()

In [13]:
df['cleaned_content'] = df['cleaned_content'].apply(clean_up_comparison)

In [14]:
df["cleaned_content"][0]

'EXAMINATION: CHEST (PA AND LAT) INDICATION: F with new onset ascites eval for infection TECHNIQUE: Chest PA and lateral COMPARISON: None. FINDINGS: There is no focal consolidation, pleural effusion or pneumothorax. Bilateral nodular opacities that most likely represent nipple shadows. The cardiomediastinal silhouette is normal. Clips project over the left lung, potentially within the breast. The imaged upper abdomen is unremarkable. Chronic deformity of the posterior left sixth and seventh ribs are noted. IMPRESSION: No acute cardiopulmonary process.'

In [15]:
df['cleaned_content'][227833]

'COMPARISON: Prior chest radiograph from earlier same day. CLINICAL HISTORY: Transfer from outside hospital with intubation, assess position of tube. FINDINGS: Portable supine AP view of the chest provided demonstrates an endotracheal tube with tip positioned approximately 3.5 cm above the carina. The NG tube courses into the left upper abdomen. There is bibasilar atelectasis. Heart and mediastinal contour appears grossly unremarkable. The bony structures appear intact. IMPRESSION: Appropriately positioned ET and NG tubes. Bibasilar atelectasis.'

In [16]:
df['cleaned_content'][227832]

'COMPARISON: radiograph. FINDINGS: There has been interval extubation and improved lung volumes compared to the recent radiograph. Bibasilar atelectasis has nearly resolved with residual patchy atelectasis remaining in the right lower lobe and only minimal residual linear atelectasis in the left lower lobe. Apparent rightward deviation of the trachea is likely due to mild patient rotation and curvature of the spine, as there is no evidence of a discrete paratracheal mass on recent neck CTA of . Cardiac silhouette is stable in size. No pleural effusion or pneumothorax.'

In [17]:
df['cleaned_content'][227830]

'INDICATION: Patient with intubation, evaluation for ETT placement. COMPARISON: None. FINDINGS: ET tube ends 4.7 cm above the carina. NG tube is in the stomach. The lungs are otherwise clear. Elevation of right hemidiaphragm is mild. There is no pneumothorax or pleural effusion. Mediastinal and cardiac contours are normal. CONCLUSION: 1. Lines and tubes are in adequate position. 2. The rest of the exam is unremarkable.'

In [18]:
df['content'][227830]

'                                 FINAL REPORT\n PORTABLE AP CHEST X-RAY\n \n INDICATION:  Patient with intubation, evaluation for ETT placement.\n \n COMPARISON:  None.\n \n FINDINGS:\n \n ET tube ends 4.7 cm above the carina.  NG tube is in the stomach.  The lungs\n are otherwise clear.  Elevation of right hemidiaphragm is mild.  There is no\n pneumothorax or pleural effusion.  Mediastinal and cardiac contours are\n normal.\n \n CONCLUSION:\n \n 1.  Lines and tubes are in adequate position.\n \n 2.  The rest of the exam is unremarkable.\n'

In [19]:
headers = []
for content in df['cleaned_content']:
    matches = re.findall(r'^([A-Z][A-Z\s]+:)', content, re.MULTILINE)
    headers.extend(matches)

unique_headers = list(set(headers))

In [20]:
unique_headers

['PA AND LATERAL HISTORY:',
 'PORTABLE CHEST RADIOGRAPH DATED COMPARISON:',
 'PA AND LATERAL CHEST FILM AT INDICATION:',
 'FINAL ADDENDUM HISTORY:',
 'PORTABLE CHEST RADIOGRAPH WITH COMPARISON RADIOGRAPH FINDINGS:',
 'PA AND LATERAL CHEST FROM ON HISTORY:',
 'CHEST RADIOGRAPH DATED COMPARISON:',
 'REASON FOR EXAMINATION:',
 'PA AND LATERAL CHEST INDICATION:',
 'CHEST ON AT HISTORY:',
 'PA AND LATERAL CHEST RADIOGRAPH DATED WITH NO PRIOR RADIOGRAPHS FOR COMPARISON FINDINGS:',
 'PA AND LATERAL CHEST RADIOGRAPHS INDICATION:',
 'PORTABLE SUPINE CHEST DATED COMPARISON:',
 'FINDINGS:',
 'AP CHEST :',
 'FINAL ADDENDUM ADDENDUM COMPARISON:',
 'CHEST RADIOGRAPH:',
 'SINGLE PORTABLE VIEW OF THE CHEST:',
 'INDICATIONS:',
 'PORTABLE CHEST :',
 'CHEST PERFORMED ON COMPARISON:',
 'PORTABLE UPRIGHT CHEST RADIOGRAPH CLINICAL HISTORY:',
 'PA AND LATERAL CHEST FROM AT CLINICAL INDICATION:',
 'FINAL ADDENDUM PLEASE NOTE:',
 'SINGLE FRONTAL VIEW OF THE CHEST:',
 'CHEST ON HISTORY:',
 'PORTABLE CHEST WITH 

In [21]:
len(unique_headers)

306

# extract headers for each row

In [22]:
df

Unnamed: 0,subject_id,study_id,path,content,cleaned_content
0,10000032,50414267,files/p10/p10000032/s50414267.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: F ...
1,10000032,53189527,files/p10/p10000032/s53189527.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: Hi...
2,10000032,53911762,files/p10/p10000032/s53911762.txt,FINAL REPORT\...,EXAMINATION: CHEST (PORTABLE AP) INDICATION: F...
3,10000032,56699142,files/p10/p10000032/s56699142.txt,FINAL REPORT\...,INDICATION: year old woman with cirrhosis. TEC...
4,10000764,57375967,files/p10/p10000764/s57375967.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: M ...
...,...,...,...,...,...
227830,19999442,58708861,files/p19/p19999442/s58708861.txt,FINAL REPORT\...,"INDICATION: Patient with intubation, evaluatio..."
227831,19999733,57132437,files/p19/p19999733/s57132437.txt,FINAL REPORT\...,INDICATION: -year-old with chest pain. TECHNIQ...
227832,19999987,55368167,files/p19/p19999987/s55368167.txt,FINAL REPORT\...,COMPARISON: radiograph. FINDINGS: There has be...
227833,19999987,58621812,files/p19/p19999987/s58621812.txt,FINAL REPORT\...,COMPARISON: Prior chest radiograph from earlie...


In [24]:
df2=df

In [25]:
# expression pattern to match headers
header_pattern = re.compile(r'([A-Z][A-Z ]+:)')
headers = []

for content in df['cleaned_content']:
    # find all matches of header_pattern
    matches = header_pattern.findall(content)
    if matches:
        # if found, append 
        headers.append(matches)
    else:
        # if no found, append an empty list
        headers.append([])
df['headers'] = headers


In [26]:
df

Unnamed: 0,subject_id,study_id,path,content,cleaned_content,headers
0,10000032,50414267,files/p10/p10000032/s50414267.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: F ...,"[EXAMINATION:, INDICATION:, TECHNIQUE:, COMPAR..."
1,10000032,53189527,files/p10/p10000032/s53189527.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: Hi...,"[EXAMINATION:, INDICATION:, TECHNIQUE:, COMPAR..."
2,10000032,53911762,files/p10/p10000032/s53911762.txt,FINAL REPORT\...,EXAMINATION: CHEST (PORTABLE AP) INDICATION: F...,"[EXAMINATION:, INDICATION:, COMPARISON:, FINDI..."
3,10000032,56699142,files/p10/p10000032/s56699142.txt,FINAL REPORT\...,INDICATION: year old woman with cirrhosis. TEC...,"[INDICATION:, TECHNIQUE:, COMPARISON:, FINDING..."
4,10000764,57375967,files/p10/p10000764/s57375967.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: M ...,"[EXAMINATION:, INDICATION:, COMPARISON:, FINDI..."
...,...,...,...,...,...,...
227830,19999442,58708861,files/p19/p19999442/s58708861.txt,FINAL REPORT\...,"INDICATION: Patient with intubation, evaluatio...","[INDICATION:, COMPARISON:, FINDINGS:, CONCLUSI..."
227831,19999733,57132437,files/p19/p19999733/s57132437.txt,FINAL REPORT\...,INDICATION: -year-old with chest pain. TECHNIQ...,"[INDICATION:, TECHNIQUE:, COMPARISON:, FINDING..."
227832,19999987,55368167,files/p19/p19999987/s55368167.txt,FINAL REPORT\...,COMPARISON: radiograph. FINDINGS: There has be...,"[COMPARISON:, FINDINGS:]"
227833,19999987,58621812,files/p19/p19999987/s58621812.txt,FINAL REPORT\...,COMPARISON: Prior chest radiograph from earlie...,"[COMPARISON:, CLINICAL HISTORY:, FINDINGS:, IM..."


# Add headers columns

In [27]:
header_columns = {}
for header in unique_headers:
    header_columns[header] = []

# Extract and populate data for each header column
for index, row in df2.iterrows():
    for header in unique_headers:
        data = ""
        header_columns[header].append(data)

new_df = pd.DataFrame(header_columns)
df = pd.concat([df, new_df], axis=1)


In [28]:
df

Unnamed: 0,subject_id,study_id,path,content,cleaned_content,headers,PA AND LATERAL HISTORY:,PORTABLE CHEST RADIOGRAPH DATED COMPARISON:,PA AND LATERAL CHEST FILM AT INDICATION:,FINAL ADDENDUM HISTORY:,...,FINAL ADDENDUM CHEST PA AND LATERAL AMENDMENT:,CHEST RADIOGRAPHS PERFORMED ON COMPARISON:,CHEST OF HISTORY:,CHEST RADIOGRAPH INDICATIONS:,CHEST TWO VIEWS INDICATION:,AP CHEST ON HISTORY:,PA AND LATERAL CHEST CLINICAL INDICATION:,PA AND LATERAL CHEST ON HISTORY:,AP UPRIGHT AND LATERAL VIEW OF THE CHEST PERFORMED ON COMPARISON:,PA AND LATERAL CHEST OF WITH COMPARISON RADIOGRAPH FINDINGS:
0,10000032,50414267,files/p10/p10000032/s50414267.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: F ...,"[EXAMINATION:, INDICATION:, TECHNIQUE:, COMPAR...",,,,,...,,,,,,,,,,
1,10000032,53189527,files/p10/p10000032/s53189527.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: Hi...,"[EXAMINATION:, INDICATION:, TECHNIQUE:, COMPAR...",,,,,...,,,,,,,,,,
2,10000032,53911762,files/p10/p10000032/s53911762.txt,FINAL REPORT\...,EXAMINATION: CHEST (PORTABLE AP) INDICATION: F...,"[EXAMINATION:, INDICATION:, COMPARISON:, FINDI...",,,,,...,,,,,,,,,,
3,10000032,56699142,files/p10/p10000032/s56699142.txt,FINAL REPORT\...,INDICATION: year old woman with cirrhosis. TEC...,"[INDICATION:, TECHNIQUE:, COMPARISON:, FINDING...",,,,,...,,,,,,,,,,
4,10000764,57375967,files/p10/p10000764/s57375967.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: M ...,"[EXAMINATION:, INDICATION:, COMPARISON:, FINDI...",,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227830,19999442,58708861,files/p19/p19999442/s58708861.txt,FINAL REPORT\...,"INDICATION: Patient with intubation, evaluatio...","[INDICATION:, COMPARISON:, FINDINGS:, CONCLUSI...",,,,,...,,,,,,,,,,
227831,19999733,57132437,files/p19/p19999733/s57132437.txt,FINAL REPORT\...,INDICATION: -year-old with chest pain. TECHNIQ...,"[INDICATION:, TECHNIQUE:, COMPARISON:, FINDING...",,,,,...,,,,,,,,,,
227832,19999987,55368167,files/p19/p19999987/s55368167.txt,FINAL REPORT\...,COMPARISON: radiograph. FINDINGS: There has be...,"[COMPARISON:, FINDINGS:]",,,,,...,,,,,,,,,,
227833,19999987,58621812,files/p19/p19999987/s58621812.txt,FINAL REPORT\...,COMPARISON: Prior chest radiograph from earlie...,"[COMPARISON:, CLINICAL HISTORY:, FINDINGS:, IM...",,,,,...,,,,,,,,,,


# add true if header is in the file (this bit runs forever! only to double check the following code works well)

In [None]:
for headers in df['headers']:
    for header in headers:
        if header in df.columns:
            df[header] = df[header].fillna("False")
            df.loc[df[header].isna(), header] = "True"


In [None]:
df

# count how often each header appears

In [30]:
from collections import Counter

In [33]:
all_headers = ",".join(df["headers"].astype(str).tolist())
all_headers = all_headers.replace("[", "").replace("]", "")
all_names = [name.strip() for name in all_headers.split(",")]

# Count the occurrences of each name
name_counts = Counter(all_names)

for name, count in sorted(name_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}: {count}")


'IMPRESSION:': 184726
'COMPARISON:': 158784
'FINDINGS:': 148917
'INDICATION:': 140777
'EXAMINATION:': 93950
'TECHNIQUE:': 76669
'HISTORY:': 39474
'WET READ:': 17557
'CHEST RADIOGRAPH INDICATION:': 10082
'CLINICAL HISTORY:': 7061
'REASON FOR EXAMINATION:': 5836
'NOTIFICATION:': 5731
'CLINICAL INFORMATION:': 4016
'EXAM:': 3906
'STUDY:': 2780
'SINGLE FRONTAL VIEW OF THE CHEST REASON FOR EXAM:': 2657
'CHEST RADIOGRAPHS HISTORY:': 2299
'PNA TECHNIQUE:': 2129
'CONCLUSION:': 1800
'TWO VIEWS:': 1737
'TYPE OF EXAMINATION:': 1675
'CLINICAL INDICATION:': 1465
'ON HISTORY:': 1338
'PA AND LATERAL VIEWS OF THE CHEST REASON FOR EXAM:': 1294
'CHEST RADIOGRAPHS INDICATION:': 1079
'PNA COMPARISON:': 1047
'PA AND LATERAL VIEWS OF THE CHEST:': 914
'PNA IMPRESSION:': 881
'CHEST RADIOGRAPH HISTORY:': 835
'CHEST ON HISTORY:': 610
'FRONTAL AND LATERAL CHEST RADIOGRAPHS:': 603
'RAY INDICATION:': 541
'PA AND LATERAL CHEST RADIOGRAPHS:': 529
'CT FINDINGS:': 509
'PA AND LATERAL CHEST ON HISTORY:': 501
'PORTABLE C

# transform cleaned content to list of dict

In [35]:
def item_to_dict(item):
    key_value_pairs = []
    segments = re.split(r'([A-Z\s]+:)', item)
    for i in range(1, len(segments) - 1, 2):
        key_value_pairs.append({segments[i].strip(): segments[i + 1].strip()})
    return key_value_pairs


In [36]:
df['list_to_dict'] = df['cleaned_content'].apply(item_to_dict)

In [38]:
df['list_to_dict'][222000]

[{'INDICATION:': 'Hypoxic respiratory failure.'},
 {'COMPARISON:': 'Chest radiograph from .'},
 {'TECHNIQUE:': 'Frontal chest radiograph.'},
 {'IMPRESSION:': 'A right PICC terminates at the upper SVC. The heart size remains normal. The hilar and mediastinal contours are unchanged. Postsurgical changes are again seen at the left apex. An ill-defined left basilar opacity is unchanged. There is no pneumothorax or pleural effusion.'}]

# updating each row based on values (should replace all the "True")

In [39]:
# Function to update the DataFrame based on the list of dictionaries
def update_columns(row, col_name, df):
    dicts = row[col_name]
    for d in dicts:
        key, value = list(d.items())[0]
        if key in df.columns:
            df.loc[row.name, key] = f'{key} {value}'


In [40]:
for index, row in df.iterrows():
    update_columns(row, 'list_to_dict', df)

In [42]:
df["INDICATION:"][0]

'INDICATION: F with new onset ascites eval for infection'

In [43]:
df["TECHNIQUE:"][0]

'TECHNIQUE: Chest PA and lateral'

In [44]:
df["COMPARISON:"][0]

'COMPARISON: None.'

In [45]:
df["IMPRESSION:"][0]

'IMPRESSION: No acute cardiopulmonary process.'

In [47]:
df['headers'][227832]

['COMPARISON:', 'FINDINGS:']

In [48]:
df['cleaned_content'][227832]

'COMPARISON: radiograph. FINDINGS: There has been interval extubation and improved lung volumes compared to the recent radiograph. Bibasilar atelectasis has nearly resolved with residual patchy atelectasis remaining in the right lower lobe and only minimal residual linear atelectasis in the left lower lobe. Apparent rightward deviation of the trachea is likely due to mild patient rotation and curvature of the spine, as there is no evidence of a discrete paratracheal mass on recent neck CTA of . Cardiac silhouette is stable in size. No pleural effusion or pneumothorax.'

In [49]:
df['content'][227832]

'                                 FINAL REPORT\n PORTABLE CHEST OF ___\n \n COMPARISON:  ___ radiograph.\n \n FINDINGS:  There has been interval extubation and improved lung volumes\n compared to the recent radiograph.  Bibasilar atelectasis has nearly resolved\n with residual patchy atelectasis remaining in the right lower lobe and only\n minimal residual linear atelectasis in the left lower lobe.  Apparent\n rightward deviation of the trachea is likely due to mild patient rotation and\n curvature of the spine, as there is no evidence of a discrete paratracheal\n mass on recent neck CTA of ___.  Cardiac silhouette is stable in\n size.  No pleural effusion or pneumothorax.\n'

In [51]:
num_nans = df.isna().sum().sum()

print(num_nans) 

0


In [52]:
df

Unnamed: 0,subject_id,study_id,path,content,cleaned_content,headers,PA AND LATERAL HISTORY:,PORTABLE CHEST RADIOGRAPH DATED COMPARISON:,PA AND LATERAL CHEST FILM AT INDICATION:,FINAL ADDENDUM HISTORY:,...,CHEST RADIOGRAPHS PERFORMED ON COMPARISON:,CHEST OF HISTORY:,CHEST RADIOGRAPH INDICATIONS:,CHEST TWO VIEWS INDICATION:,AP CHEST ON HISTORY:,PA AND LATERAL CHEST CLINICAL INDICATION:,PA AND LATERAL CHEST ON HISTORY:,AP UPRIGHT AND LATERAL VIEW OF THE CHEST PERFORMED ON COMPARISON:,PA AND LATERAL CHEST OF WITH COMPARISON RADIOGRAPH FINDINGS:,list_to_dict
0,10000032,50414267,files/p10/p10000032/s50414267.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: F ...,"[EXAMINATION:, INDICATION:, TECHNIQUE:, COMPAR...",,,,,...,,,,,,,,,,"[{'EXAMINATION:': 'CHEST (PA AND LAT)'}, {'IND..."
1,10000032,53189527,files/p10/p10000032/s53189527.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: Hi...,"[EXAMINATION:, INDICATION:, TECHNIQUE:, COMPAR...",,,,,...,,,,,,,,,,"[{'EXAMINATION:': 'CHEST (PA AND LAT)'}, {'IND..."
2,10000032,53911762,files/p10/p10000032/s53911762.txt,FINAL REPORT\...,EXAMINATION: CHEST (PORTABLE AP) INDICATION: F...,"[EXAMINATION:, INDICATION:, COMPARISON:, FINDI...",,,,,...,,,,,,,,,,"[{'EXAMINATION:': 'CHEST (PORTABLE AP)'}, {'IN..."
3,10000032,56699142,files/p10/p10000032/s56699142.txt,FINAL REPORT\...,INDICATION: year old woman with cirrhosis. TEC...,"[INDICATION:, TECHNIQUE:, COMPARISON:, FINDING...",,,,,...,,,,,,,,,,[{'INDICATION:': 'year old woman with cirrhosi...
4,10000764,57375967,files/p10/p10000764/s57375967.txt,FINAL REPORT\...,EXAMINATION: CHEST (PA AND LAT) INDICATION: M ...,"[EXAMINATION:, INDICATION:, COMPARISON:, FINDI...",,,,,...,,,,,,,,,,"[{'EXAMINATION:': 'CHEST (PA AND LAT)'}, {'IND..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227830,19999442,58708861,files/p19/p19999442/s58708861.txt,FINAL REPORT\...,"INDICATION: Patient with intubation, evaluatio...","[INDICATION:, COMPARISON:, FINDINGS:, CONCLUSI...",,,,,...,,,,,,,,,,"[{'INDICATION:': 'Patient with intubation, eva..."
227831,19999733,57132437,files/p19/p19999733/s57132437.txt,FINAL REPORT\...,INDICATION: -year-old with chest pain. TECHNIQ...,"[INDICATION:, TECHNIQUE:, COMPARISON:, FINDING...",,,,,...,,,,,,,,,,[{'INDICATION:': '-year-old with chest pain.'}...
227832,19999987,55368167,files/p19/p19999987/s55368167.txt,FINAL REPORT\...,COMPARISON: radiograph. FINDINGS: There has be...,"[COMPARISON:, FINDINGS:]",,,,,...,,,,,,,,,,"[{'COMPARISON:': 'radiograph.'}, {'FINDINGS:':..."
227833,19999987,58621812,files/p19/p19999987/s58621812.txt,FINAL REPORT\...,COMPARISON: Prior chest radiograph from earlie...,"[COMPARISON:, CLINICAL HISTORY:, FINDINGS:, IM...",,,,,...,,,,,,,,,,[{'COMPARISON:': 'Prior chest radiograph from ...


In [86]:
df = df.replace(r'^\s*$', np.nan, regex=True)

In [93]:
non_nan_counts = df.count()

# columns with less than 10,000 non-nan values
columns_to_drop = non_nan_counts[non_nan_counts < 10000].index.tolist()

df = df.drop(columns_to_drop, axis=1)

In [None]:
df

In [55]:
import numpy as np

In [100]:
mask = df.notna().any()

def sort_cols(x):
    if x.dtype == 'object' and any(isinstance(val, list) for val in x):
        x = x.apply(lambda val: np.nan if isinstance(val, list) else val)
    return x.sort_values(na_position='last')

sorted_cols = df.loc[:, mask].apply(sort_cols)

print(sorted_cols)


        subject_id  study_id                               path  \
0         10000032  50414267  files/p10/p10000032/s50414267.txt   
1         10000032  53189527  files/p10/p10000032/s53189527.txt   
2         10000032  53911762  files/p10/p10000032/s53911762.txt   
3         10000032  56699142  files/p10/p10000032/s56699142.txt   
4         10000764  57375967  files/p10/p10000764/s57375967.txt   
...            ...       ...                                ...   
227830    19999442  58708861  files/p19/p19999442/s58708861.txt   
227831    19999733  57132437  files/p19/p19999733/s57132437.txt   
227832    19999987  55368167  files/p19/p19999987/s55368167.txt   
227833    19999987  58621812  files/p19/p19999987/s58621812.txt   
227834    19999987  58971208  files/p19/p19999987/s58971208.txt   

                                                  content  \
0                                        FINAL REPORT\...   
1                                        FINAL REPORT\...   
2           

In [113]:
sorted_cols.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 227835 entries, 0 to 227834
Data columns (total 14 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   subject_id                    227835 non-null  int64 
 1   study_id                      227835 non-null  int64 
 2   path                          227835 non-null  object
 3   content                       227835 non-null  object
 4   cleaned_content               227834 non-null  object
 5   FINDINGS:                     148838 non-null  object
 6   TECHNIQUE:                    76649 non-null   object
 7   IMPRESSION:                   184636 non-null  object
 8   COMPARISON:                   158736 non-null  object
 9   INDICATION:                   140768 non-null  object
 10  WET READ:                     17556 non-null   object
 11  CHEST RADIOGRAPH INDICATION:  10082 non-null   object
 12  HISTORY:                      39471 non-null   object
 13 

In [111]:
num_nans = sorted_cols.isna().sum().sum()

In [112]:
num_nans

1179835

In [116]:
sorted_cols["cleaned_content"].isna()

0         False
1         False
2         False
3         False
4         False
          ...  
227830    False
227831    False
227832    False
227833    False
227834    False
Name: cleaned_content, Length: 227835, dtype: bool

In [115]:
sorted_cols.isna().sum()

subject_id                           0
study_id                             0
path                                 0
content                              0
cleaned_content                      1
FINDINGS:                        78997
TECHNIQUE:                      151186
IMPRESSION:                      43199
COMPARISON:                      69099
INDICATION:                      87067
WET READ:                       210279
CHEST RADIOGRAPH INDICATION:    217753
HISTORY:                        188364
EXAMINATION:                    133890
dtype: int64

In [118]:
empty_cleaned_content_rows = sorted_cols[sorted_cols['cleaned_content'].isnull()]
print(empty_cleaned_content_rows)

        subject_id  study_id                               path  \
101213    14463099  54168089  files/p14/p14463099/s54168089.txt   

                                                content cleaned_content  \
101213                                   FINAL REPORT\n             NaN   

       FINDINGS: TECHNIQUE: IMPRESSION: COMPARISON: INDICATION: WET READ:  \
101213       NaN        NaN         NaN         NaN         NaN       NaN   

       CHEST RADIOGRAPH INDICATION: HISTORY: EXAMINATION:  
101213                          NaN      NaN          NaN  


In [121]:
sorted_cols.dropna(subset=['cleaned_content'], inplace=True)

In [110]:
sorted_cols = sorted_cols.drop(['list_to_dict', 'headers'], axis=1)

In [114]:
sorted_cols["WET READ:"]

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
227830    NaN
227831    NaN
227832    NaN
227833    NaN
227834    NaN
Name: WET READ:, Length: 227835, dtype: object

In [127]:
non_null_cleaned_content_rows = sorted_cols[sorted_cols['WET READ:'].notnull()]

In [128]:
non_null_cleaned_content_rows

Unnamed: 0,subject_id,study_id,path,content,cleaned_content,FINDINGS:,TECHNIQUE:,IMPRESSION:,COMPARISON:,INDICATION:,WET READ:,CHEST RADIOGRAPH INDICATION:,HISTORY:,EXAMINATION:
19,10000980,58206436,files/p10/p10000980/s58206436.txt,WET READ: ___ ___ ___ 6:47 AM\n 1. New mild ...,WET READ: 6:47 AM 1. New mild pulmonary edema ...,FINDINGS: In comparison to study performed on ...,TECHNIQUE: Single portable upright frontal che...,IMPRESSION: 1. New mild pulmonary edema with p...,COMPARISON: chest radiograph. chest radiograph.,INDICATION: F with wheezing and dyspnea. Asses...,WET READ: 6:47 AM 1. New mild pulmonary edema ...,,,EXAMINATION: Chest radiograph.
28,10001217,58913004,files/p10/p10001217/s58913004.txt,WET READ: ___ ___ ___ 11:18 PM\n It is diffi...,WET READ: 11:18 PM It is difficult to determin...,FINDINGS: As compared to the previous radiogra...,,,COMPARISON: .,,WET READ: 11:18 PM It is difficult to determin...,CHEST RADIOGRAPH INDICATION: PICC line placement.,,
32,10001401,56534136,files/p10/p10001401/s56534136.txt,WET READ: ___ ___ ___ 6:21 AM\n \n An enter...,WET READ: 6:21 AM An enteric tube courses belo...,FINDINGS: The right costophrenic angle is not ...,TECHNIQUE: Single AP view,IMPRESSION: An enteric tube courses below the ...,COMPARISON: Chest radiograph from the same date.,INDICATION: History: F with SBO s/p NG*** WARN...,WET READ: 6:21 AM An enteric tube courses belo...,,,EXAMINATION: Chest radiograph.
51,10001884,55333410,files/p10/p10001884/s55333410.txt,WET READ: ___ ___ ___ 4:36 PM\n No acute car...,WET READ: 4:36 PM No acute cardiopulmonary abn...,FINDINGS: PA and lateral views the chest provi...,TECHNIQUE: Chest PA and lateral,IMPRESSION: No acute findings. Top-normal hear...,COMPARISON: Chest radiograph and chest CT from .,"INDICATION: y.o. woman, multiple medical probl...",WET READ: 4:36 PM No acute cardiopulmonary abn...,,,EXAMINATION: Chest radiograph
90,10002428,56597576,files/p10/p10002428/s56597576.txt,WET READ: ___ ___ ___ 9:42 AM\n LEFT PICC TI...,WET READ: 9:42 AM LEFT PICC TIP PROJECTS OVER ...,FINDINGS: Comparison is made to previous study...,,,,,WET READ: 9:42 AM LEFT PICC TIP PROJECTS OVER ...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227714,19997367,52358840,files/p19/p19997367/s52358840.txt,WET READ: ___ ___ ___ 9:04 PM\n Fluid seems ...,WET READ: 9:04 PM Fluid seems to be reaccumula...,"FINDINGS: As compared to the previous image, t...",,,COMPARISON: .,INDICATION: Evaluation for pneumothorax. Statu...,WET READ: 9:04 PM Fluid seems to be reaccumula...,,,
227720,19997367,52790106,files/p19/p19997367/s52790106.txt,WET READ: ___ ___ ___ 7:42 AM\n \n \n Incr...,WET READ: 7:42 AM Increasing interstitial mark...,,TECHNIQUE: Chest two views,IMPRESSION: Increasing interstitial markings f...,COMPARISON:,INDICATION: year old woman with complicated pa...,WET READ: 7:42 AM Increasing interstitial mark...,,,EXAMINATION: CHEST (PA AND LAT)
227749,19997367,56627054,files/p19/p19997367/s56627054.txt,WET READ: ___ ___ ___ 9:20 PM\n Compared to ...,WET READ: 9:20 PM Compared to the prior radiog...,,,IMPRESSION: IN COMPARISON WITH THE EARLIER STU...,,INDICATION: year old woman with chest tube eff...,WET READ: 9:20 PM Compared to the prior radiog...,,,EXAMINATION: CHEST (PORTABLE AP)
227781,19997911,58942262,files/p19/p19997911/s58942262.txt,WET READ: ___ ___ ___ 8:13 PM\n NG tube tip ...,WET READ: 8:13 PM NG tube tip terminates in th...,,,"IMPRESSION: Comparison to , 10:12. The tip of ...",,INDICATION: year old woman s/p NGT placement P...,WET READ: 8:13 PM NG tube tip terminates in th...,,,EXAMINATION: CHEST (PORTABLE AP)


# pull single image cases 

In [130]:
df3 = pd.read_csv('/Users/katya/Downloads/cxr-record-list.csv')

In [141]:
value_counts2 = df3['study_id'].value_counts()

In [142]:
value_counts2

54019440    11
50022785     9
54914372     8
50384171     8
54267739     8
            ..
50158746     1
59249530     1
55895881     1
53492489     1
58971208     1
Name: study_id, Length: 227835, dtype: int64

In [143]:
result2 = value_counts2[value_counts2 == 1]
result2

56237305    1
54925626    1
55351970    1
57005648    1
50273602    1
           ..
50158746    1
59249530    1
55895881    1
53492489    1
58971208    1
Name: study_id, Length: 102675, dtype: int64

In [148]:
result3=df3['study_id'].nunique

In [150]:
result3

<bound method IndexOpsMixin.nunique of 0         50414267
1         50414267
2         53189527
3         53189527
4         53911762
            ...   
377105    57132437
377106    57132437
377107    55368167
377108    58621812
377109    58971208
Name: study_id, Length: 377110, dtype: int64>

In [151]:
new_df = sorted_cols[sorted_cols['study_id'].isin(result2.index)]

In [155]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102675 entries, 3 to 227834
Data columns (total 14 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   subject_id                    102675 non-null  int64 
 1   study_id                      102675 non-null  int64 
 2   path                          102675 non-null  object
 3   content                       102675 non-null  object
 4   cleaned_content               102675 non-null  object
 5   FINDINGS:                     52918 non-null   object
 6   TECHNIQUE:                    24673 non-null   object
 7   IMPRESSION:                   75268 non-null   object
 8   COMPARISON:                   60257 non-null   object
 9   INDICATION:                   63133 non-null   object
 10  WET READ:                     9205 non-null    object
 11  CHEST RADIOGRAPH INDICATION:  6351 non-null    object
 12  HISTORY:                      15722 non-null   object
 13 

In [158]:
df_new = new_df.loc[:, new_df.count().sort_values(ascending=False).index]

In [159]:
df_new

Unnamed: 0,subject_id,study_id,path,content,cleaned_content,IMPRESSION:,INDICATION:,COMPARISON:,FINDINGS:,EXAMINATION:,TECHNIQUE:,HISTORY:,WET READ:,CHEST RADIOGRAPH INDICATION:
3,10000032,56699142,files/p10/p10000032/s56699142.txt,FINAL REPORT\...,INDICATION: year old woman with cirrhosis. TEC...,IMPRESSION: No acute cardiopulmonary process.,INDICATION: year old woman with cirrhosis.,"COMPARISON: Radiographs from , and .",FINDINGS: The lungs are clear of focal consoli...,,TECHNIQUE: Frontal chest radiographs were obta...,,,
7,10000935,50578979,files/p10/p10000935/s50578979.txt,FINAL REPORT\...,"HISTORY: Leukocytosis, low-grade temperature, ...",IMPRESSION: 1. Low lung volumes and mild pulmo...,,COMPARISON: Multiple prior radiographs of the ...,FINDINGS: Lung volumes remain low. There are i...,,TECHNIQUE: Portable semi-upright AP radiograph...,"HISTORY: Leukocytosis, low-grade temperature, ...",,
12,10000935,58219844,files/p10/p10000935/s58219844.txt,FINAL REPORT\...,HISTORY: Dyspnea and history of lung cancer. T...,IMPRESSION: Innumerable pulmonary metastases. ...,,COMPARISON: CT torso and chest radiograph .,FINDINGS: Lung volumes are low. This results i...,,TECHNIQUE: Semi-upright AP view of the chest.,HISTORY: Dyspnea and history of lung cancer.,,
14,10000980,51967283,files/p10/p10000980/s51967283.txt,FINAL REPORT\...,INDICATION: -year-old female with shortness of...,IMPRESSION: Right upper lobe pneumonia or mass...,INDICATION: -year-old female with shortness of...,COMPARISON: Chest radiograph from and .,,,,,,
19,10000980,58206436,files/p10/p10000980/s58206436.txt,WET READ: ___ ___ ___ 6:47 AM\n 1. New mild ...,WET READ: 6:47 AM 1. New mild pulmonary edema ...,IMPRESSION: 1. New mild pulmonary edema with p...,INDICATION: F with wheezing and dyspnea. Asses...,COMPARISON: chest radiograph. chest radiograph.,FINDINGS: In comparison to study performed on ...,EXAMINATION: Chest radiograph.,TECHNIQUE: Single portable upright frontal che...,,WET READ: 6:47 AM 1. New mild pulmonary edema ...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227829,19999442,58497551,files/p19/p19999442/s58497551.txt,FINAL REPORT\...,"REASON FOR EXAMINATION: Aspiration suspected, ...",,,,,,,,,
227830,19999442,58708861,files/p19/p19999442/s58708861.txt,FINAL REPORT\...,"INDICATION: Patient with intubation, evaluatio...",,"INDICATION: Patient with intubation, evaluatio...",COMPARISON: None.,FINDINGS: ET tube ends 4.7 cm above the carina...,,,,,
227832,19999987,55368167,files/p19/p19999987/s55368167.txt,FINAL REPORT\...,COMPARISON: radiograph. FINDINGS: There has be...,,,COMPARISON: radiograph.,FINDINGS: There has been interval extubation a...,,,,,
227833,19999987,58621812,files/p19/p19999987/s58621812.txt,FINAL REPORT\...,COMPARISON: Prior chest radiograph from earlie...,IMPRESSION: Appropriately positioned ET and NG...,,COMPARISON: Prior chest radiograph from earlie...,FINDINGS: Portable supine AP view of the chest...,,,,,


In [160]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102675 entries, 3 to 227834
Data columns (total 14 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   subject_id                    102675 non-null  int64 
 1   study_id                      102675 non-null  int64 
 2   path                          102675 non-null  object
 3   content                       102675 non-null  object
 4   cleaned_content               102675 non-null  object
 5   IMPRESSION:                   75268 non-null   object
 6   INDICATION:                   63133 non-null   object
 7   COMPARISON:                   60257 non-null   object
 8   FINDINGS:                     52918 non-null   object
 9   EXAMINATION:                  46559 non-null   object
 10  TECHNIQUE:                    24673 non-null   object
 11  HISTORY:                      15722 non-null   object
 12  WET READ:                     9205 non-null    object
 13 

In [161]:
# write out the updated dataframe to a new csv file
df_new.to_csv('cleaned_single_image_study_id.csv', index=False)