In [None]:
#!/usr/bin/env python

'''
    RadCLIP Dataset Creators
    
'''
__author__ = "Andrew D'Amico"
__copyright__ = "Copyright 2023"
__credits__ = ["Andrew D'Amico", "Christoper Alexander", "Katya Nosulko", "Vivek Chamala", "Matthew Conger"]
__license__ = ""
__version__ = "0.0.1"
__maintainer__ = "Andrew Damico"
__email__ = "andrew.damico@u.northwestern.edu"

In [2]:
import re

import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/andrew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Dataset

In [3]:
folder = "/media/andrew/HD-GDU3/498/"
imageset_folder = "complete_imageset/"
dataset_folder = "preprocessed_single_image_text/"
reports = "raw.csv"
text_loc = folder + dataset_folder + reports
image_loc = folder + imageset_folder
#md_location = folder + meta_data
#meta_reports = pd.read_csv(md_location)
#sample = meta_reports.head()
#sample

## Raw Mimic Reports

In [4]:
#raw = pd.read_csv('raw.csv')
raw = pd.read_csv(text_loc)
print(f'patient records: {text_loc}')
print(f"records: {len(raw)}")
raw.head()

patient records: /media/andrew/HD-GDU3/498/preprocessed_single_image_text/raw.csv
records: 76112


Unnamed: 0,subject_id,study_id,path,content,cleaned_content,IMPRESSION:,FINDINGS:,WET READ:,CONCLUSION:
0,10000935,50578979,files/p10/p10000935/s50578979.txt,FINAL REPORT\...,"HISTORY: Leukocytosis, low-grade temperature, ...",IMPRESSION: 1. Low lung volumes and mild pulmo...,FINDINGS: Lung volumes remain low. There are i...,,
1,10000935,58219844,files/p10/p10000935/s58219844.txt,FINAL REPORT\...,HISTORY: Dyspnea and history of lung cancer. T...,IMPRESSION: Innumerable pulmonary metastases. ...,FINDINGS: Lung volumes are low. This results i...,,
2,10000980,51967283,files/p10/p10000980/s51967283.txt,FINAL REPORT\...,INDICATION: -year-old female with shortness of...,IMPRESSION: Right upper lobe pneumonia or mass...,,,
3,10000980,58206436,files/p10/p10000980/s58206436.txt,WET READ: ___ ___ ___ 6:47 AM\n 1. New mild ...,WET READ: 6:47 AM 1. New mild pulmonary edema ...,IMPRESSION: 1. New mild pulmonary edema with p...,FINDINGS: In comparison to study performed on ...,WET READ: 6:47 AM 1. New mild pulmonary edema ...,
4,10001217,58913004,files/p10/p10001217/s58913004.txt,WET READ: ___ ___ ___ 11:18 PM\n It is diffi...,WET READ: 11:18 PM It is difficult to determin...,,FINDINGS: As compared to the previous radiogra...,WET READ: 11:18 PM It is difficult to determin...,


In [59]:
filenames = pd.read_csv('Datasets/image_locations.csv')
#filenames = pd.read_csv('image_locations.csv')
print(f'Total images found {len(filenames)}')

Total images found 82263


### Feature Selection
Identify only the features we wish to include in the new dataset and rename

In [7]:
# Create a dictionary of column names and feature names
feature_dict = {
    'subject_id': 'subject',
    'study_id': 'study',
    'IMPRESSION:': 'impression',
    'FINDINGS:': 'findings',
    'WET READ:': 'wet_read',
    'CONCLUSION:': 'conclusion'
}

In [8]:
data = raw[feature_dict.keys()].copy().rename(columns=feature_dict)

In [9]:
data.head()

Unnamed: 0,subject,study,impression,findings,wet_read,conclusion
0,10000935,50578979,IMPRESSION: 1. Low lung volumes and mild pulmo...,FINDINGS: Lung volumes remain low. There are i...,,
1,10000935,58219844,IMPRESSION: Innumerable pulmonary metastases. ...,FINDINGS: Lung volumes are low. This results i...,,
2,10000980,51967283,IMPRESSION: Right upper lobe pneumonia or mass...,,,
3,10000980,58206436,IMPRESSION: 1. New mild pulmonary edema with p...,FINDINGS: In comparison to study performed on ...,WET READ: 6:47 AM 1. New mild pulmonary edema ...,
4,10001217,58913004,,FINDINGS: As compared to the previous radiogra...,WET READ: 11:18 PM It is difficult to determin...,


# Preprocess Text

In [10]:
def process_text(text, exclude=None):
    '''
    processes the sentences to remove special characters, convert to lowercase, etc.
    
    text: text to be processed
    exclude: words to be removed
    punctuation: boolean to add period at end of each sentence
    '''

    # Remove the Headings from the text
    for k in exclude:
        text = text.replace(k, '')

    # Remove all timestamps
    timestamp = re.compile(r'\d+\:\d{2}\s?(?:AM|PM|am|pm)\s')
    text = timestamp.sub('', text)

    # Remove all numbered lists.
    numbered_lists = re.compile(r'\d+.\s')
    text = numbered_lists.sub('', text)

    # Remove all extra whitespaces
    text = re.sub(' +', ' ', text).strip()

    #m = re.sub(r'\b[A-Z]+: ', '', m)

    # Convert all to lowercase
    text = re.sub(r'\d+.', '', text).lower()

    return (text)

# Create Initial Dataset

In [11]:
def preprocess_dataset(data, images, features, name='dataset.csv', save=False):
    '''
    Creates a dataframe of sentences from the clinical reports which 
    have been preprocessed using the rules above
    
    data: dataset of reports
    images: csv containing subject/study and image location
    features: feature dictionary including headings to be removed
    name: name to save the dataset to disk
    save: boolean to state if save should be performed
    '''

    counter = 0

    corpus = []

    printout = []

    for observation in tqdm(range(len(data))):
        subject_id = data['subject'][observation]
        study_id = data['study'][observation]

        try:
            #lookup the image... if an image is not found for the study, we will skip that report.
            file_id = images[images['study_id'] == study_id]['resized_file_name'].values[0]
            # For each of the features in our list:
            for category in list(features.values())[2:]:
                # If it is a string rather than a bool i.e., NaN,
                if isinstance(data[category][observation], str):
                    # Preprocess the report
                    reported_observation = process_text(
                        text=data[category][observation],
                        exclude=list(features.keys())
                    )

                    # Break into sentences
                    sentences = sent_tokenize(reported_observation)

                    # For each sentence, create a new record
                    for sentence in sentences:
                        container = {
                            'subject': subject_id,
                            'study': study_id,
                            'type': category,
                            'value': re.sub(r"[^a-zA-Z0-9 ]", "", sentence),
                            'file_name': file_id,
                            'length': len(sentence.split()),
                            'note': None,
                        }

                        corpus.append(container)

        except Exception as e:
            #print(e)
            printout.append(f'Error processing subject/study {subject_id}/{study_id}')
            counter += 1

    print("")
    print(f'Total not found: {counter}')
    print(printout)

    corpus = pd.DataFrame(corpus)
    corpus = corpus.drop(corpus[corpus['value'] == ''].index, inplace=False)
    corpus['candidate'] = True

    if save:
        corpus.to_csv(name, index=False)
        print(f'Corpus saved to {name}')

    return (corpus)

# Create or Load Dataset

In [None]:
def parse_dataset(name, run=True, save=False):
    if run:
        dataset = preprocess_dataset(
            data=data,
            images=filenames,
            features=feature_dict,
            name=name,
            save=save
        )
    else:
        dataset = pd.read_csv(name)

    print(f'Total records: {len(dataset)}')

    return (dataset)#%%
import re

import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

nltk.download('punkt')


# Dataset

In [None]:
folder = "/media/andrew/HD-GDU3/498/"
imageset_folder = "complete_imageset/"
dataset_folder = "preprocessed_single_image_text/"
reports = "raw.csv"
text_loc = folder + dataset_folder + reports
image_loc = folder + imageset_folder
#md_location = folder + meta_data
#meta_reports = pd.read_csv(md_location)
#sample = meta_reports.head()
#sample

## Raw Mimic Reports

In [None]:
#raw = pd.read_csv('raw.csv')
raw = pd.read_csv(text_loc)
print(f'patient records: {text_loc}')
print(f"records: {len(raw)}")
raw.head()

In [None]:
filenames = pd.read_csv('Datasets/image_locations.csv')
#filenames = pd.read_csv('image_locations.csv')
print(f'Total images found {len(filenames)}')

### Feature Selection
Identify only the features we wish to include in the new dataset and rename

In [None]:
# Create a dictionary of column names and feature names
feature_dict = {
    'subject_id': 'subject',
    'study_id': 'study',
    'IMPRESSION:': 'impression',
    'FINDINGS:': 'findings',
    'WET READ:': 'wet_read',
    'CONCLUSION:': 'conclusion'
}

In [None]:
data = raw[feature_dict.keys()].copy().rename(columns=feature_dict)

In [None]:
data.head()

# Preprocess Text

In [None]:
def process_text(text, exclude=None):
    '''
    processes the sentences to remove special characters, convert to lowercase, etc.

    text: text to be processed
    exclude: words to be removed
    punctuation: boolean to add period at end of each sentence
    '''

    # Remove the Headings from the text
    for k in exclude:
        text = text.replace(k, '')

    # Remove all timestamps
    timestamp = re.compile(r'\d+\:\d{2}\s?(?:AM|PM|am|pm)\s')
    text = timestamp.sub('', text)

    # Remove all numbered lists.
    numbered_lists = re.compile(r'\d+.\s')
    text = numbered_lists.sub('', text)

    # Remove all extra whitespaces
    text = re.sub(' +', ' ', text).strip()

    #m = re.sub(r'\b[A-Z]+: ', '', m)

    # Convert all to lowercase
    text = re.sub(r'\d+.', '', text).lower()

    return (text)

# Create Initial Dataset

In [None]:
def preprocess_dataset(data, images, features, name='dataset.csv', save=False):
    '''
    Creates a dataframe of sentences from the clinical reports which
    have been preprocessed using the rules above

    data: dataset of reports
    images: csv containing subject/study and image location
    features: feature dictionary including headings to be removed
    name: name to save the dataset to disk
    save: boolean to state if save should be performed
    '''

    counter = 0

    corpus = []

    printout = []

    for observation in tqdm(range(len(data))):
        subject_id = data['subject'][observation]
        study_id = data['study'][observation]

        try:
            #lookup the image... if an image is not found for the study, we will skip that report.
            file_id = images[images['study_id'] == study_id]['resized_file_name'].values[0]
            # For each of the features in our list:
            for category in list(features.values())[2:]:
                # If it is a string rather than a bool i.e., NaN,
                if isinstance(data[category][observation], str):
                    # Preprocess the report
                    reported_observation = process_text(
                        text=data[category][observation],
                        exclude=list(features.keys())
                    )

                    # Break into sentences
                    sentences = sent_tokenize(reported_observation)

                    # For each sentence, create a new record
                    for sentence in sentences:
                        container = {
                            'subject': subject_id,
                            'study': study_id,
                            'type': category,
                            'value': re.sub(r"[^a-zA-Z0-9 ]", "", sentence),
                            'file_name': file_id,
                            'length': len(sentence.split()),
                            'note': None,
                        }

                        corpus.append(container)

        except Exception as e:
            #print(e)
            printout.append(f'Error processing subject/study {subject_id}/{study_id}')
            counter += 1

    print("")
    print(f'Total not found: {counter}')
    print(printout)

    corpus = pd.DataFrame(corpus)
    corpus = corpus.drop(corpus[corpus['value'] == ''].index, inplace=False)
    corpus['candidate'] = True

    if save:
        corpus.to_csv(name, index=False)
        print(f'Corpus saved to {name}')

    return (corpus)

# Create or Load Dataset

In [None]:
def parse_dataset(name, run=True, save=False):
    if run:
        dataset = preprocess_dataset(
            data=data,
            images=filenames,
            features=feature_dict,
            name=name,
            save=save
        )
    else:
        dataset = pd.read_csv(name)

    print(f'Total records: {len(dataset)}')

    return (dataset)

In [None]:
dataset = parse_dataset(name='dataset.csv', run=False, save=True)

# Stop Words

In [None]:
# load the JSON file with the embeddings. If embeddings have been created using the clustering scripts, the dataframe has to be saved in JSON to preserve the embeddings.

In [None]:
dataset = pd.read_json('dataset_embeddings_2.json')

In [None]:
# Add stop words/sentences based on cluster experiments

In [None]:
exclusion_list = [
    'as compared to the previous',
    'compared to prior',
    'the patient has been extubated',
    'ap portable semi upright view of the chest',
    'as above',
    #'ap chest radiograph',
    'no comparison',
    'unchanged',
    'in comparison with',
    'ap chest compared to',
    'dr. ',
    'dr ',
    'doctor',
    'status post',
    'recommended',
    'telephone',
    'phone',
    'status post cabg',
    'otherwise little change',
    'no prior',
    'at the time of dictation',
    'this preliminary report',
    'clinical correlation',
    'no previous image',
    'followup',
    'portable chest',
    'chest portable',
    'comparison',
    'no other interval change from prior study',
    'by  md',
    'dw dr',
    'md ',
    #'change',
    'ed urgent attention',
    'has been removed',
    'analysis is performed',
    'no relevant change',
    'no change',
    'paged',
    'yesterday',
    'wet read version',
    'no relevant change',
    'prior study',
    'rotated positioning',
    'see comment',
    'limited exam',
]

In [None]:
def filter_pairs(db, filename = 'dataset', json=False, exclusion=exclusion_list, save=True):
    '''
    Filters out sentences based on exlusion list'
    '''

    #db.loc[db['value'].str.len() <4, "note"] = f"Removal: Less than 3 char"
    #db.loc[db['value'].str.len() <4, 'candidate'] = False
    db['value'] = db['value'].apply(lambda x: x.strip())
    #df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

    for term in exclusion:
        print (f'''The term or phrase '{term}' has been removed.''')
        #first search for the term in all activate candidate cells and record rule
        db.loc[(db['candidate'] == True) & (db['value'].str.contains(term)), "note"] = f"Removal: {term}"
        # if the note contains an exlusion rule, set the candidate flag to False
        db.loc[(db['candidate'] == True) & (db['value'].str.contains(term)), "candidate"] = False
        #complete.loc[complete['value'].str.contains(term), "note" = f"{term}"
        #for term in exclusion:
        #complete.loc[complete['note'] == True, "candidate"] = False
        #complete.loc[complete['value'].str.contains(term), "note"] = f"Removal: {term}"

    # remove cells with less than 3 characters

    db.loc[(db['candidate'] == True) & (db['value'].str.len() <3), "note"] = f"Removal: Less than 3 char"
    db.loc[(db['candidate'] == True) & (db['value'].str.len() <3), 'candidate'] = False
    #df.loc[(df['col1'].str.len() == 5) & (df['col2'].str.len() == 7)]

    if save:

        refined_name = 'refined_'+filename
        rejected_name = 'rejected_'+filename
        all_pairs = 'all_'+filename

        print ("")

        if json:

            db[db['candidate'] == True].to_json(f'{refined_name}.json')
            print (f'Candidates saved: {refined_name}')

            db[db['candidate'] == False].dropna().sort_values(by=['note']).to_json(f'{rejected_name}.json')
            print (f'Rejected saved: {rejected_name}')

            db.to_json(f'{all_pairs}.json')
            print (f'All pairs saved: {all_pairs}')

        else:
            db[db['candidate'] == True].to_csv(f'{refined_name}.csv', index=False)
            print (f'Candidates saved to csv: {refined_name}')

            db[db['candidate'] == False].dropna().sort_values(by=['note']).to_csv(f'{rejected_name}.csv', index=False)
            print (f'Rejected saved to csv: {rejected_name}')

            db.to_csv(f'{all_pairs}.csv', index=False)
            print (f'All pairs saved to csv: {all_pairs}')

    return db[db['candidate'] == True]

In [None]:
candidates = filter_pairs(
    filename = 'dataset4',
    db = dataset,
    save = True,
    json = True
    )

# Adding or Removing additional Sentences

In [None]:
# load all previous sentences before removal

In [None]:
candidates = pd.read_json('Datasets/all_dataset4.json')

In [None]:
# Candidates to Add

In [None]:
toadd = pd.read_csv('Datasets/to_add.csv')

In [None]:
# Candidates to Reject

In [None]:
newrejects = pd.read_csv('new_rejects.csv')

In [None]:
# Remove Clustering Rejections
for item in newrejects['Sentence']:
    if candidates['candidate'][item] == True:
        candidates['candidate'][item] = False

In [None]:
candidates.to_json("new_dataset.json")

# Create training dataset (By Sentence or By Pair)

In [None]:
# The following script will create the final image/text pairs, either by sentence or by paragraph

In [None]:
def create_refined_pairs(db, debug = False, as_paragraph = True, save_name = "candidates", save = True):

    #Create list of studies with candidate sentences
    studies = db['study'].unique()

    #create empty dataframe
    training_pairs = pd.DataFrame(columns=['caption','image'])

    #for each study in the list.
    record_sentences = []
    record_paragraphs = []

    for study in tqdm(studies):
        if debug: print (f'Study: {study}')
        image = folder + imageset_folder + db['file_name'][db.loc[db['study'] == study].index[0]]
        if debug: print (f'image: {image}')
        # For each study, create a list of sentences
        container = []
        sentences = []

        ### WARNING: The following warning has been surpressed.
        import warnings
        warnings.simplefilter(action='ignore', category=FutureWarning)

        #If either impression or fingding is found, then wet read will not be used.
        if ['impression','findings'] in db.loc[db["study"] == study]['type'].unique():
            for sentence in db[(db['study'] == study) & (db['type'] != 'wet_read')]['value']:
                if debug: print (sentence)

                container.append(f'{sentence}.')

                record = {
                    'caption':f'{sentence}.',
                    'image':image
                }

                sentences.append(record)
        else:

            for sentence in db[db["study"] == study]['value']:
                if debug: print (sentence)

                container.append(f'{sentence}.')

                record = {
                    'caption':f'{sentence}.',
                    'image':image
                }

                sentences.append(record)
            #print (sentences)

        record_sentences += sentences

        #For each list of sentences, recreate a paragraph
        paragraph = ''
        if debug: print (container)
        for statement in container:
            paragraph += statement + " "
            if debug: print (paragraph)
        if debug: print (paragraph)

        record = {
            'caption':paragraph,
            'image':image
            }

        record_paragraphs.append(record)
        if debug: print ("-----")

    record_sentences = pd.DataFrame(record_sentences)
    record_paragraphs = pd.DataFrame(record_paragraphs)
        #print (training_pairs)

    if save:
        record_sentences.to_csv(f'{save_name}_sentences.csv')
        record_paragraphs.to_csv(f'{save_name}_paragraphs.csv')

    return (record_sentences, record_paragraphs)

In [None]:
sentences, records = create_refined_pairs(
    db = candidates,
    debug = False,
    save_name = "final_candidates",
    save = True
)

In [None]:
records#%%
dataset = parse_dataset(name='dataset.csv', run=False, save=True)

# Stop Words

In [60]:
# load the JSON file with the embeddings. If embeddings have been created using the clustering scripts, the dataframe has to be saved in JSON to preserve the embeddings. 

In [18]:
dataset = pd.read_json('dataset_embeddings_2.json')

In [20]:
# Add stop words/sentences based on cluster experiments

In [14]:
exclusion_list = [
    'as compared to the previous',
    'compared to prior',
    'the patient has been extubated',
    'ap portable semi upright view of the chest',
    'as above',
    #'ap chest radiograph',
    'no comparison',
    'unchanged',
    'in comparison with',
    'ap chest compared to',
    'dr. ',
    'dr ',
    'doctor',
    'status post',
    'recommended',
    'telephone',
    'phone',
    'status post cabg',
    'otherwise little change',
    'no prior',
    'at the time of dictation',
    'this preliminary report',
    'clinical correlation',
    'no previous image',
    'followup',
    'portable chest',
    'chest portable',
    'comparison',
    'no other interval change from prior study',
    'by  md',
    'dw dr',
    'md ',
    #'change',
    'ed urgent attention',
    'has been removed',
    'analysis is performed',
    'no relevant change',
    'no change',
    'paged',
    'yesterday',
    'wet read version',
    'no relevant change',
    'prior study',
    'rotated positioning',
    'see comment',
    'limited exam',
]

In [15]:
def filter_pairs(db, filename = 'dataset', json=False, exclusion=exclusion_list, save=True):
    '''
    Filters out sentences based on exlusion list'
    '''
    
    #db.loc[db['value'].str.len() <4, "note"] = f"Removal: Less than 3 char"
    #db.loc[db['value'].str.len() <4, 'candidate'] = False
    db['value'] = db['value'].apply(lambda x: x.strip())
    #df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
    
    for term in exclusion:
        print (f'''The term or phrase '{term}' has been removed.''')
        #first search for the term in all activate candidate cells and record rule
        db.loc[(db['candidate'] == True) & (db['value'].str.contains(term)), "note"] = f"Removal: {term}"
        # if the note contains an exlusion rule, set the candidate flag to False
        db.loc[(db['candidate'] == True) & (db['value'].str.contains(term)), "candidate"] = False
        #complete.loc[complete['value'].str.contains(term), "note" = f"{term}"
        #for term in exclusion:
        #complete.loc[complete['note'] == True, "candidate"] = False
        #complete.loc[complete['value'].str.contains(term), "note"] = f"Removal: {term}"
    
    # remove cells with less than 3 characters

    db.loc[(db['candidate'] == True) & (db['value'].str.len() <3), "note"] = f"Removal: Less than 3 char"
    db.loc[(db['candidate'] == True) & (db['value'].str.len() <3), 'candidate'] = False
    #df.loc[(df['col1'].str.len() == 5) & (df['col2'].str.len() == 7)]
    
    if save:
        
        refined_name = 'refined_'+filename
        rejected_name = 'rejected_'+filename
        all_pairs = 'all_'+filename
        
        print ("")
        
        if json:
            
            db[db['candidate'] == True].to_json(f'{refined_name}.json')
            print (f'Candidates saved: {refined_name}')

            db[db['candidate'] == False].dropna().sort_values(by=['note']).to_json(f'{rejected_name}.json')
            print (f'Rejected saved: {rejected_name}')

            db.to_json(f'{all_pairs}.json')
            print (f'All pairs saved: {all_pairs}')
        
        else:
            db[db['candidate'] == True].to_csv(f'{refined_name}.csv', index=False)
            print (f'Candidates saved to csv: {refined_name}')

            db[db['candidate'] == False].dropna().sort_values(by=['note']).to_csv(f'{rejected_name}.csv', index=False)
            print (f'Rejected saved to csv: {rejected_name}')

            db.to_csv(f'{all_pairs}.csv', index=False)
            print (f'All pairs saved to csv: {all_pairs}')
    
    return db[db['candidate'] == True]

In [94]:
candidates = filter_pairs(
    filename = 'dataset4',
    db = dataset,
    save = True,
    json = True
    )

The term or phrase 'as compared to the previous' has been removed.
The term or phrase 'compared to prior' has been removed.
The term or phrase 'the patient has been extubated' has been removed.
The term or phrase 'ap portable semi upright view of the chest' has been removed.
The term or phrase 'as above' has been removed.
The term or phrase 'ap chest radiograph' has been removed.
The term or phrase 'no comparison' has been removed.
The term or phrase 'unchanged' has been removed.
The term or phrase 'in comparison with' has been removed.
The term or phrase 'ap chest compared to' has been removed.
The term or phrase 'dr. ' has been removed.
The term or phrase 'dr ' has been removed.
The term or phrase 'doctor' has been removed.
The term or phrase 'status post' has been removed.
The term or phrase 'recommended' has been removed.
The term or phrase 'telephone' has been removed.
The term or phrase 'phone' has been removed.
The term or phrase 'status post cabg' has been removed.
The term or 

# Adding or Removing additional Sentences

In [18]:
# load all previous sentences before removal

In [17]:
candidates = pd.read_json('Datasets/all_dataset4.json')

In [None]:
# Candidates to Add

In [21]:
toadd = pd.read_csv('Datasets/to_add.csv')

In [None]:
# Candidates to Reject

In [47]:
newrejects = pd.read_csv('new_rejects.csv')

In [56]:
# Remove Clustering Rejections
for item in newrejects['Sentence']:
    if candidates['candidate'][item] == True:
        candidates['candidate'][item] = False

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  candidates['candidate'][item] = False


In [None]:
candidates.to_json("new_dataset.json")

# Create training dataset (By Sentence or By Pair)

In [61]:
# The following script will create the final image/text pairs, either by sentence or by paragraph

In [169]:
def create_refined_pairs(db, debug = False, as_paragraph = True, save_name = "candidates", save = True):
    
    #Create list of studies with candidate sentences
    studies = db['study'].unique()
    
    #create empty dataframe
    training_pairs = pd.DataFrame(columns=['caption','image'])
    
    #for each study in the list.
    record_sentences = []
    record_paragraphs = []
    
    for study in tqdm(studies):
        if debug: print (f'Study: {study}')
        image = folder + imageset_folder + db['file_name'][db.loc[db['study'] == study].index[0]]
        if debug: print (f'image: {image}')
        # For each study, create a list of sentences
        container = []
        sentences = []
        
        ### WARNING: The following warning has been surpressed.
        import warnings
        warnings.simplefilter(action='ignore', category=FutureWarning)
        
        #If either impression or fingding is found, then wet read will not be used.
        if ['impression','findings'] in db.loc[db["study"] == study]['type'].unique():
            for sentence in db[(db['study'] == study) & (db['type'] != 'wet_read')]['value']:
                if debug: print (sentence)
                
                container.append(f'{sentence}.')
                
                record = {
                    'caption':f'{sentence}.',
                    'image':image
                }
                
                sentences.append(record)
        else:
            
            for sentence in db[db["study"] == study]['value']:
                if debug: print (sentence)

                container.append(f'{sentence}.')

                record = {
                    'caption':f'{sentence}.',
                    'image':image
                }

                sentences.append(record)
            #print (sentences)
            
        record_sentences += sentences
        
        #For each list of sentences, recreate a paragraph
        paragraph = ''
        if debug: print (container)
        for statement in container:
            paragraph += statement + " "
            if debug: print (paragraph)
        if debug: print (paragraph)

        record = {
            'caption':paragraph,
            'image':image
            }
        
        record_paragraphs.append(record)
        if debug: print ("-----")

    record_sentences = pd.DataFrame(record_sentences)
    record_paragraphs = pd.DataFrame(record_paragraphs)
        #print (training_pairs)
        
    if save:
        record_sentences.to_csv(f'{save_name}_sentences.csv')
        record_paragraphs.to_csv(f'{save_name}_paragraphs.csv')
        
    return (record_sentences, record_paragraphs)

In [170]:
sentences, records = create_refined_pairs(
    db = candidates,
    debug = False,
    save_name = "final_candidates",
    save = True
)

100%|█████████████████████████████████████████████████████████████████████████████████| 74647/74647 [32:53<00:00, 37.82it/s]


In [104]:
records

Unnamed: 0,caption,image
0,new small right fissural pleural effusion. no ...,/media/andrew/HD-GDU3/498/10000935_50578979_d0...
1,innumerable pulmonary metastases. possible mil...,/media/andrew/HD-GDU3/498/10000935_58219844_88...
2,right upper lobe pneumonia or mass. however gi...,/media/andrew/HD-GDU3/498/10000980_51967283_94...
3,the tip appears to project over the azygous ve...,/media/andrew/HD-GDU3/498/10001217_58913004_5e...
4,nasogastric tube extends to the mid body of th...,/media/andrew/HD-GDU3/498/10001401_50225296_00...
5,an enteric tube courses below the level of the...,/media/andrew/HD-GDU3/498/10001401_56534136_d6...
6,ng tube in expected position with tip coiled i...,/media/andrew/HD-GDU3/498/10001401_57492692_a8...
7,the endotracheal tube tip is cm above the cari...,/media/andrew/HD-GDU3/498/10001884_50376803_46...
8,mild pulmonary edema has not resolved. moderat...,/media/andrew/HD-GDU3/498/10001884_50712381_7b...
9,no acute intrathoracic process. the lungs are ...,/media/andrew/HD-GDU3/498/10001884_51181158_96...
