## Step 1: Import Libraries

In [1]:
import pandas as pd
import re

## Step 2: Import Dataset and Perform Initial Preprocessing

In [2]:
# Load the notes workbook into a DataFrame
df = pd.read_excel('[Insert Your Directory Here]/[Insert File Name Here].xlsx')

# Cast ReportText to str, then squeeze whitespace runs (carriage returns,
# newlines, repeated spaces) down to single spaces
df['ReportText'] = (
    df['ReportText']
    .astype('str')
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace(r'\n+', ' ', regex=True)
)

# Reduce the datetime-like pft_date column to plain date objects
df['pft_date'] = df['pft_date'].dt.date

## Step 3: Create the snippet extraction function. First identify the phrase that starts the snippet by reviewing a random sample of 100 notes with:

```
# Counter used only to label each printed note
n = 1
# Shuffle all rows (frac=1) and walk through the first 100 of the shuffled notes
for index,row in df.sample(frac=1)[:100].iterrows():
    print(f"Row Number: {n}")
    print(f"Note: {row['ReportText']}")
    # Visual separator between consecutive notes
    print('-'*100)
    n+=1
```

In [4]:
# Function to create snippet based on template start phrases
def extract_fev1_context(text):
    """Pull the spirometry snippet(s) out of a note.

    A snippet is any single character followed by the phrase
    'Spirometry Results' (case-insensitive) plus up to the next 1000
    characters on the same line.

    Args:
        text: Note text to search.

    Returns:
        All snippets joined by a single space, or '' when none is found.
    """
    snippet_re = re.compile(r'.Spirometry Results.{0,1000}', re.IGNORECASE)
    return ' '.join(hit.group(0) for hit in snippet_re.finditer(text))

## Step 4: Run the `ReportText` column through the function to generate a new `Snippet` column containing each note's snippet

In [5]:
# Take a deep copy first, then attach each note's extracted snippet as a new column
df = df.copy(deep=True).assign(
    Snippet=lambda notes: notes['ReportText'].apply(extract_fev1_context)
)

## Step 5: Initialize new dataframe containing only rows that have a snippet

In [7]:
# Keep only notes whose snippet is non-empty and renumber the rows from 0
notes_with_fev = df[df['Snippet'].ne('')].reset_index(drop=True)

## Step 6a: Initialize the PFT Classification Function

In [10]:
def _first_match(pattern, text):
    """Return the first value captured by `pattern` in `text` as a one-item list, or None."""
    found = re.findall(pattern, text, flags=re.IGNORECASE)
    return [found[0]] if found else None


def _all_matches(pattern, text):
    """Return every phrase captured by `pattern` in `text`, wrapped in an outer list, or None."""
    found = re.findall(pattern, text, flags=re.IGNORECASE)
    return [found] if found else None


def classify_fev1(row):
    """Extract quantitative and qualitative PFT values from a note snippet.

    Every quantitative pattern skips the first number that follows the field
    label and captures the second one (presumably the first is the template's
    example value, e.g. "(whole number, e.g., 68): 50" -- confirm against the
    note template).

    Args:
        row: Mapping or DataFrame row exposing the snippet text under 'Snippet'.

    Returns:
        pd.Series with keys FEV1_Abs_Pre, FEV1_Abs_Post, FEV1_Perc_Pred_Pre,
        FEV1_Perc_Pred_Post, FEV1_FVC_Pre, FEV1_FVC_Post, FEV1_Qual_High and
        FEV1_Qual_Low. Quantitative entries are a one-item list holding the
        first captured string, or None. Qualitative entries are a list holding
        the list of all matched phrases, or None.
    """
    text = row['Snippet']

    return pd.Series({
        # Absolute FEV1, pre / post bronchodilator (decimal values)
        'FEV1_Abs_Pre': _first_match(
            r'Baseline.*?FEV-1.*?\d*\.\d+.*?(\d*\.\d+)', text),
        'FEV1_Abs_Post': _first_match(
            r'Post-Bronchodilator.*?FEV-1.*?\d*\.\d+.*?(\d*\.\d+)', text),
        # FEV1 % predicted: capture two OR three digits so values of 100 and
        # above are kept whole instead of being cut down to their first two digits
        'FEV1_Perc_Pred_Pre': _first_match(
            r'Baseline FEV-1\,\% Predicted.*?\d{2}.*?(\d{2,3})', text),
        'FEV1_Perc_Pred_Post': _first_match(
            r'Post-Bronchodilator.*?FEV-1, % predicted.*?\d{2}.*?(\d{2,3})', text),
        # FEV1/FVC ratio, pre / post bronchodilator (two-digit whole numbers)
        'FEV1_FVC_Pre': _first_match(
            r'Baseline.*?FEV-1/FVC.*?\d{2}.*?(\d{2})', text),
        'FEV1_FVC_Post': _first_match(
            r'Post-Bronchodilator.*?FEV-1/FVC.*?\d{2}.*?(\d{2})', text),
        # Qualitative statement that there is no obstruction
        'FEV1_Qual_High': _all_matches(
            r'(no evidence for airflow obstruction)', text),
        # Qualitative obstruction grades; the "modeately" alternatives are
        # deliberate misspellings kept from the original pattern
        'FEV1_Qual_Low': _all_matches(
            r'(mild obstruction|moderate obstruction|modeately severe obstruction|modeately severe airflow obstruction|moderately severe obstruction|severe obstruction|very severe obstruction)', text),
    })
    

## Step 6b: Run dataframe through the PFT extraction function

In [48]:
# Run every snippet row through classify_fev1 and attach the extracted
# fields to the notes by index
results = notes_with_fev.join(
    notes_with_fev.apply(classify_fev1, axis='columns')
)

## Step 7: Extract values from FEV1 % predicted, FEV1:FVC pre-BD, and qualitative variables.

In [50]:
def extract_value(nested_list):
    """Return the first element of `nested_list` converted to int.

    Returns None when `nested_list` is None.
    """
    if nested_list is None:
        return None
    first_item = nested_list[0]
    return int(first_item)

# Derive the integer columns 'FEV1_Perc_Pred' and 'fev1_fvc' from the
# pre-bronchodilator extraction columns
for target_col, source_col in (('FEV1_Perc_Pred', 'FEV1_Perc_Pred_Pre'),
                               ('fev1_fvc', 'FEV1_FVC_Pre')):
    results[target_col] = results[source_col].apply(extract_value)

def extract_fev1_qualitative(nested_list):
    """Return the first phrase of the first inner list as a string.

    Returns None when `nested_list` is None.
    """
    if nested_list is None:
        return None
    first_group = nested_list[0]
    return str(first_group[0])

# Flatten the nested qualitative matches into plain-string columns:
# 'fev1_qual_neg' comes from FEV1_Qual_Low, 'fev1_qual_pos' from FEV1_Qual_High
for target_col, source_col in (('fev1_qual_neg', 'FEV1_Qual_Low'),
                               ('fev1_qual_pos', 'FEV1_Qual_High')):
    results[target_col] = results[source_col].apply(extract_fev1_qualitative)

## Step 8: Create mapping functions to map quantitative values to the standard clinical definitions of obstruction and severity of obstruction

In [52]:
#Create mapping function
def fev1_severity(value):
    """Map an FEV1 % predicted value to a severity label.

    Bands are contiguous, so fractional values such as 79.5 are classified
    instead of falling between two bands:
        >= 80 Normal, 70-<80 Mild, 60-<70 Moderate,
        50-<60 Moderately Severe, 35-<50 Severe, < 35 Very Severe.

    Args:
        value: Numeric FEV1 % predicted, or a missing value (None / NaN).

    Returns:
        The severity label, or None when `value` is missing.
    """
    if pd.isna(value):
        return None
    if value >= 80:
        return "Normal"
    if value >= 70:
        return "Mild"
    if value >= 60:
        return "Moderate"
    if value >= 50:
        return "Moderately Severe"
    if value >= 35:
        return "Severe"
    return "Very Severe"
    
def obstruction(value):
    """Label an FEV1/FVC value as "Normal" (>= 70) or "Reduced" (< 70).

    Returns None when neither comparison holds (e.g. NaN).
    """
    if value >= 70:
        return "Normal"
    return "Reduced" if value < 70 else None

# Derive 'FEV1_Severity' from FEV1 % predicted and 'Obstruction' from
# FEV1:FVC by passing each column through its mapping function
results = results.assign(
    FEV1_Severity=results['FEV1_Perc_Pred'].map(fev1_severity),
    Obstruction=results['fev1_fvc'].map(obstruction),
)

## Step 9a: Impute FEV1 severity values from qualitative data

In [53]:
# Lower-cased qualitative phrase -> severity label. The "modeately" keys are
# deliberate misspellings that the extraction regex also captures.
_SEVERITY_BY_PHRASE = {
    'mild obstruction': "Mild",
    'moderate obstruction': "Moderate",
    'moderately severe obstruction': "Moderately Severe",
    'modeately severe obstruction': "Moderately Severe",
    'modeately severe airflow obstruction': "Moderately Severe",
    'severe obstruction': "Severe",
    'very severe obstruction': "Very Severe",
}


def fev1_severity_from_qual(row):
    """Fill in a missing FEV1 severity from the qualitative phrases.

    An existing 'FEV1_Severity' is returned untouched. When it is missing
    (None or NaN), 'fev1_qual_pos' is checked first for 'normal spirometry',
    then 'fev1_qual_neg' is looked up among the obstruction grades. Phrases
    are compared case-insensitively because they are captured verbatim from
    the note text by case-insensitive regexes.

    NOTE(review): the 'FEV1_Qual_High' regex in classify_fev1 only captures
    "no evidence for airflow obstruction", so the 'normal spirometry' branch
    cannot fire unless that pattern is extended -- confirm intent.

    Args:
        row: Mapping or DataFrame row with 'FEV1_Severity', 'fev1_qual_pos'
            and 'fev1_qual_neg'.

    Returns:
        The existing or imputed severity label; the original missing value
        when nothing can be imputed.
    """
    current = row['FEV1_Severity']
    if not pd.isna(current):
        return current

    def _normalise(phrase):
        # Non-string cells (None / NaN) carry no qualitative information
        return phrase.strip().lower() if isinstance(phrase, str) else None

    if _normalise(row['fev1_qual_pos']) == 'normal spirometry':
        return "Normal"
    return _SEVERITY_BY_PHRASE.get(_normalise(row['fev1_qual_neg']), current)

# Where FEV1 % predicted gave no severity, fall back to the qualitative phrases
results['FEV1_Severity'] = results.apply(
    fev1_severity_from_qual, axis='columns'
)

## Step 9b: Impute obstruction values from qualitative data

In [54]:
# Lower-cased phrases that indicate the absence of obstruction. The last
# entry is the phrase the 'FEV1_Qual_High' regex in classify_fev1 captures.
_NO_OBSTRUCTION_PHRASES = {
    'no obstructive ventilatory defect',
    'normal spirometry',
    'no obstruction',
    'pfts do not show obvious signs of obstructive disease',
    'no evidence for airflow obstruction',
}


def obstruction_from_qual(row):
    """Fill in a missing 'Obstruction' value from the qualitative phrases.

    An existing value is returned untouched. When it is missing (None or
    NaN): a 'fev1_qual_pos' phrase stating there is no obstruction yields
    "Normal"; otherwise any textual 'fev1_qual_neg' finding yields "Reduced".
    Phrases are compared case-insensitively because they are captured
    verbatim from the note text.

    Args:
        row: Mapping or DataFrame row with 'Obstruction', 'fev1_qual_pos'
            and 'fev1_qual_neg'.

    Returns:
        "Normal", "Reduced", the existing value, or the original missing
        value when nothing can be imputed.
    """
    current = row['Obstruction']
    if not pd.isna(current):
        return current

    qual_pos = row['fev1_qual_pos']
    if isinstance(qual_pos, str) and qual_pos.strip().lower() in _NO_OBSTRUCTION_PHRASES:
        return "Normal"

    # NaN is "not None", so test for missingness explicitly; otherwise an
    # empty cell would be read as an obstruction finding
    if not pd.isna(row['fev1_qual_neg']):
        return "Reduced"
    return current
    
# Where FEV1:FVC gave no obstruction label, fall back to the qualitative phrases
results['Obstruction'] = results.apply(
    obstruction_from_qual, axis='columns'
)

## Step 10: Drop duplicate rows or rows missing extracted PFT data

In [1]:
# Extracted PFT columns used both for the "has any data" check and for de-duplication
list_cols = [
    'Obstruction', 'FEV1_Severity',
    'FEV1_Abs_Pre', 'FEV1_Perc_Pred_Pre', 'FEV1_FVC_Pre',
    'FEV1_Abs_Post', 'FEV1_FVC_Post', 'FEV1_Perc_Pred_Post',
    'FEV1_Qual_High', 'FEV1_Qual_Low',
]

# Discard rows in which every extracted PFT column is missing
results = results.dropna(subset=list_cols, how='all')

# Stringify the extracted columns (their cells hold lists, which cannot be
# hashed for duplicate detection)
for col in list_cols:
    results[col] = results[col].map(str)

# De-duplicate on patient, PFT date and all extracted columns
results = results.drop_duplicates(subset=['PatientICN', 'pft_date'] + list_cols)

# Blank out cells equal to 'None' so the exported Excel file is easier to read
results = results.replace('None', '')

## Step 11: Merge rows from same PFT with multiple notes containing values for different variables

In [56]:
# Columns for which the max value is kept when rows of the same PFT are merged
columns_to_max = [
    'PatientSID',
    'Obstruction', 'FEV1_Severity',
    'FEV1_Abs_Pre', 'FEV1_Perc_Pred_Pre', 'FEV1_FVC_Pre',
    'FEV1_Abs_Post', 'FEV1_FVC_Post', 'FEV1_Perc_Pred_Post',
    'FEV1_Qual_High', 'FEV1_Qual_Low',
]

# Aggregator for the Snippet column: keeps every distinct snippet of a merged
# group instead of picking just one
def concatenate_strings(series):
    """Join the distinct values of `series`, in first-seen order, with no separator."""
    distinct_snippets = series.unique()
    return ''.join(distinct_snippets)

# Every column in columns_to_max is reduced with 'max' ...
agg_funcs = dict.fromkeys(columns_to_max, 'max')

# ... while snippets are concatenated rather than reduced to a single "max" value
agg_funcs['Snippet'] = concatenate_strings

# Collapse all rows sharing a patient and PFT date into one row, keeping
# the groups in their order of first appearance
results = (
    results
    .groupby(['PatientICN', 'pft_date'], sort=False)
    .agg(agg_funcs)
    .reset_index()
)

In [60]:
# Columns written to the output workbook, in this order.
# NOTE(review): 'FEV1_Perc_Pred_Post' is aggregated above but is not listed
# here, so it is not exported -- confirm this is intended.
columns_to_export = [
    'Snippet', 'PatientICN', 'PatientSID', 'pft_date',
    'Obstruction', 'FEV1_Severity',
    'FEV1_Abs_Pre', 'FEV1_Perc_Pred_Pre', 'FEV1_FVC_Pre',
    'FEV1_Abs_Post', 'FEV1_FVC_Post',
    'FEV1_Qual_High', 'FEV1_Qual_Low',
]

# Build the destination path (output_dir is expected to end with a separator)
output_dir = '[Insert Directory Here]/'
file_name = '[Insert File Name Here].xlsx'
full_path = f"{output_dir}{file_name}"

# Order the rows by patient, then by PFT date
to_export = results.sort_values(by=['PatientICN', 'pft_date'])

# Export data as .xlsx file
to_export.to_excel(full_path, columns=columns_to_export, index=False)
