## Step 1: Import Libraries

In [37]:
import pandas as pd
import re

## Step 2: Import Dataset and Perform Initial Preprocessing

In [19]:
# Read in dataset from Excel
df = pd.read_excel('[Insert Directory Here]/[Insert File Name Here].xlsx') # Portland, OR

# Convert ReportText to string and collapse every run of whitespace into a single space.
# `\s+` already matches carriage returns and newlines, so one pass is enough.
df['ReportText'] = df['ReportText'].astype('str').str.replace(r'\s+', ' ', regex=True)

# Convert PFT date column to date data type.
# Going through pd.to_datetime keeps this cell re-runnable: after the first run the column
# holds plain date objects (object dtype), and calling `.dt` on it directly would raise.
df['pft_date'] = pd.to_datetime(df['pft_date']).dt.date

## Step 3: Create the snippet extraction function. First identify the snippet phrase by reviewing a random sample of 100 notes with:

```
n = 1
for index,row in df.sample(frac=1)[:100].iterrows():
    print(f"Row Number: {n}")
    print(f"Note: {row['ReportText']}")
    print('-'*100)
    n+=1
```

In [21]:
# Function to create snippet based on template start phrase
def extract_fev1_context(text):
    """Return every snippet window found in ``text``, joined by single spaces.

    A window is up to 150 characters before the snippet marker, the marker
    itself, and up to 250 characters after it. Matching is case-insensitive.
    An empty string is returned when nothing matches.

    NOTE: ``[Insert Your Snippet Here]`` is a placeholder to be replaced with
    the template start phrase. Left as-is, the square brackets form a regex
    character class that matches any single one of those characters.
    """
    snippet_window = re.compile(r'.{0,150}[Insert Your Snippet Here].{0,250}', re.IGNORECASE)
    return ' '.join(hit.group(0) for hit in snippet_window.finditer(text))

## Step 4: Run the `ReportText` column through the function to generate a new `Snippet` column containing each note's snippet

In [22]:
# Take an independent deep copy of the notes, then attach each note's snippet as `Snippet`
df = df.copy(deep=True).assign(
    Snippet=lambda notes: notes['ReportText'].apply(extract_fev1_context)
)

## Step 5: Initialize new dataframe containing only rows that have a snippet

In [24]:
# Keep only the notes whose snippet is non-empty and renumber the rows from 0
has_snippet = df['Snippet'].ne('')
notes_with_fev = df.loc[has_snippet].reset_index(drop=True)

##### You can validate snippets with the following code:
```
n = 1
for index,row in notes_with_fev.sample(frac=1)[:100].iterrows():
    print(f"Row Number: {n}")
    print(f"Note: {row['ReportText']}")
    print(row['Snippet'])
    print('-'*100)
    n+=1
```

## Step 6: Initialize the PFT Classification Function

In [27]:
# Output columns of classify_fev1, in the order they appear in the returned Series.
_FEV1_OUTPUT_COLUMNS = (
    'FEV1_Abs_Post',
    'FEV1_Abs_Pre',
    'FEV1_FVC_Pre',
    'FEV1_FVC_Post',
    'FEV1_Perc_Pred_Post',
    'FEV1_Perc_Pred_Pre',
    'FEV1_Qual_High',
    'FEV1_Qual_Low',
)

# (output column, regex) pairs for the quantitative table values. They are compiled once
# here instead of on every row, and applied in list order, so for a given column the
# layout-1 result (if any) is stored before the layout-2 result.
#
# `(?![/FVC])` is a character-class lookahead (next char is not '/', 'F', 'V' or 'C').
# Because `\s` must follow anyway it never changes the outcome; it is kept unchanged.
_FEV1_VALUE_PATTERNS = [
    # ---- Table layout 1 ------------------------------------------------------------
    #   Pred-pre Actual-pre %Pred-pre Actual-post %Chng
    #   FEV1 (L)      3.43 0.67 19 0.77 15
    #   FEV1/FVC (%)  76 43 56 43
    ('FEV1_Abs_Pre',
     re.compile(r'FEV1(?![/FVC])\s\(L\)\s\d*\.\d+\s(\d*\.\d+)', re.IGNORECASE)),
    ('FEV1_Abs_Post',
     re.compile(r'FEV1(?![/FVC])\s\(L\)\s\d*\.\d+\s\d*\.\d+\s\d{2,3}\s(\d*\.\d+)', re.IGNORECASE)),
    ('FEV1_Perc_Pred_Pre',
     re.compile(r'FEV1(?![/FVC])\s\(L\)\s\d*\.\d+\s\d*\.\d+\s(\d{2,3})', re.IGNORECASE)),
    ('FEV1_FVC_Pre',
     re.compile(r'FEV1/FVC\s\(%\)\s\d{2}\s(\d{2})', re.IGNORECASE)),
    ('FEV1_FVC_Post',
     re.compile(r'FEV1/FVC\s\(%\)\s\d{2}\s\d{2}\s\d{2}\s(\d{2})', re.IGNORECASE)),

    # ---- Table layout 2 ------------------------------------------------------------
    #   Sample rows (values in the order they appear in the notes):
    #   FEV1-L      2.31 3.32 -3.22 53 2.79 0.48 64 20
    #               actual-pre, LLN, z-score, %pred-pre, actual-post, vol change, %pred-post, %change
    #   FEV1/FVC-%  81 64 0.58 105 89 116 10
    ('FEV1_Abs_Pre',
     re.compile(r'FEV1-L\s(\d*\.\d+)', re.IGNORECASE)),
    ('FEV1_Abs_Post',
     re.compile(r'FEV1-L\s\d*\.\d+\s\d*\.\d+\s-?\d*\.\d+\s\d{2,3}\s(\d*\.\d+)', re.IGNORECASE)),
    ('FEV1_Perc_Pred_Pre',
     re.compile(r'FEV1-L\s\d*\.\d+\s\d*\.\d+\s-?\d*\.\d+\s(\d{2,3})', re.IGNORECASE)),
    # FIX: this pattern used to be a copy of the FEV1_Abs_Post pattern, so the post-BD
    # *volume* (e.g. 2.79) was stored as percent predicted. It now skips the post-BD
    # volume and the (possibly negative) volume change and captures the integer after them.
    ('FEV1_Perc_Pred_Post',
     re.compile(r'FEV1-L\s\d*\.\d+\s\d*\.\d+\s-?\d*\.\d+\s\d{2,3}\s\d*\.\d+\s-?\d*\.\d+\s(\d{2,3})',
                re.IGNORECASE)),
    ('FEV1_FVC_Pre',
     re.compile(r'FEV1/FVC-%\s(\d{2})', re.IGNORECASE)),
    # FIX: a `\s` was missing before the capture group, so the pattern could never match
    # a row such as "FEV1/FVC-% 81 64 0.58 105 89 ...".
    ('FEV1_FVC_Post',
     re.compile(r'FEV1/FVC-%\s\d{2}\s\d{2}\s-?\d*\.\d+\s\d{2,3}\s(\d{2})', re.IGNORECASE)),
]

# Phrases describing normal lung function.
_FEV1_QUAL_HIGH_PATTERN = re.compile(
    r'(FEV1 is normal|normal spirometry|spirometry is normal|normal pfts|no obstruction)',
    re.IGNORECASE)

# "obstruction" unless it is immediately preceded by "no ".
# Open question from the original author: which other negative descriptors to include
# (air trapping, dyspnea?).
_FEV1_QUAL_LOW_PATTERN = re.compile(r'((?<!no )obstruction)', re.IGNORECASE)


def classify_fev1(row):
    """Extract quantitative and qualitative FEV1 findings from one note's snippet.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the snippet text under the key ``'Snippet'``.

    Returns
    -------
    pandas.Series
        Indexed by the eight names in ``_FEV1_OUTPUT_COLUMNS``.

        * Quantitative columns hold a list with one entry per table layout that
          matched; each entry is the list of captured number strings
          (e.g. ``[['2.31']]``). ``None`` when nothing matched.
        * ``FEV1_Qual_High`` holds the matched "normal" phrases as they appear
          in the text, or ``None``.
        * ``FEV1_Qual_Low`` holds the de-duplicated, lower-cased "obstruction"
          matches, or ``None``.

        Any low match clears ``FEV1_Qual_High``, so the two are never both set.
    """
    text = row['Snippet']

    extracted = {column: [] for column in _FEV1_OUTPUT_COLUMNS}

    # Quantitative values: each matching pattern contributes its whole findall() list,
    # which produces the nested-list shape downstream code indexes as [0][0].
    for column, pattern in _FEV1_VALUE_PATTERNS:
        hits = pattern.findall(text)
        if hits:
            extracted[column].append(hits)

    # Qualitative "normal" phrases (duplicates are kept, as before).
    fev1_qual_hi = [phrase for phrase in _FEV1_QUAL_HIGH_PATTERN.findall(text) if phrase]

    # Qualitative "obstruction" mentions, lower-cased and de-duplicated.
    fev1_qual_lo = []
    for phrase in _FEV1_QUAL_LOW_PATTERN.findall(text):
        phrase = phrase.lower()
        if phrase and phrase not in fev1_qual_lo:
            fev1_qual_lo.append(phrase)

    # Evidence of obstruction overrides any "normal" wording in the same snippet.
    if fev1_qual_lo:
        fev1_qual_hi = []

    extracted['FEV1_Qual_High'] = fev1_qual_hi
    extracted['FEV1_Qual_Low'] = fev1_qual_lo

    # Empty lists become None so dropna() can later detect rows with no findings.
    return pd.Series({column: (values if values else None)
                      for column, values in extracted.items()})
    
        

## Step 6 (continued): Run the dataframe through the PFT extraction function

In [58]:
# Classify every note row-wise, then attach the extracted PFT columns to the notes
pft_values = notes_with_fev.apply(classify_fev1, axis=1)
results = notes_with_fev.join(pft_values)

## Step 7: Extract values from FEV1 % predicted and FEV1:FVC pre-BD

In [60]:
def extract_value(nested_list):
    """Return the first captured value of a nested match list as an int.

    Parameters
    ----------
    nested_list : list of list of str, or a missing value
        A cell produced by ``classify_fev1``, e.g. ``[['53'], ['47']]``.

    Returns
    -------
    int or None
        ``int(nested_list[0][0])``, or ``None`` when the cell is missing
        (``None``, NaN, or any non-list value) or contains no match.
    """
    # Missing cells may arrive as None or as float NaN depending on how pandas built the
    # column; neither can be indexed, so treat anything that is not a sequence as missing.
    if not isinstance(nested_list, (list, tuple)):
        return None
    # Guard against an empty outer or inner list instead of raising IndexError.
    if not nested_list or not nested_list[0]:
        return None
    return int(nested_list[0][0])

# Pull the first pre-BD value out of each nested match list into new numeric columns:
# 'FEV1_Perc_Pred' from FEV1 % predicted and 'fev1_fvc' from FEV1:FVC
for target_col, source_col in (('FEV1_Perc_Pred', 'FEV1_Perc_Pred_Pre'),
                               ('fev1_fvc', 'FEV1_FVC_Pre')):
    results[target_col] = results[source_col].apply(extract_value)

## Step 8: Create mapping functions to map quantitative values to the standard clinical definitions of obstruction and severity of obstruction

In [61]:
def fev1_severity(value):
    """Map an FEV1 percent-predicted value to a severity label.

    Bands (lower bound inclusive): >= 80 "Normal", 70-80 "Mild",
    60-70 "Moderate", 50-60 "Moderately Severe", 35-50 "Severe",
    < 35 "Very Severe".

    Parameters
    ----------
    value : number or missing value
        FEV1 as a percentage of predicted.

    Returns
    -------
    str or None
        The severity label, or ``None`` when ``value`` is missing
        (``None`` / NaN).
    """
    # Missing values have no severity; checking first also avoids comparing None to an int.
    if pd.isna(value):
        return None
    # Descending lower bounds leave no gaps, so fractional values such as 79.5 are
    # classified too (the former inclusive integer ranges returned None for them).
    if value >= 80:
        return "Normal"
    if value >= 70:
        return "Mild"
    if value >= 60:
        return "Moderate"
    if value >= 50:
        return "Moderately Severe"
    if value >= 35:
        return "Severe"
    return "Very Severe"
    
def obstruction(value):
    """Classify an FEV1/FVC ratio (in percent) as "Normal" or "Reduced".

    Parameters
    ----------
    value : number or missing value
        FEV1/FVC expressed as a percentage.

    Returns
    -------
    str or None
        "Normal" when ``value >= 70``, "Reduced" when ``value < 70``,
        ``None`` when ``value`` is missing (``None`` / NaN).
    """
    # A column with no extracted ratios holds None, which cannot be compared to an int.
    if pd.isna(value):
        return None
    return "Normal" if value >= 70 else "Reduced"

# Derive the categorical columns: 'FEV1_Severity' from FEV1 % predicted and
# 'Obstruction' from the FEV1:FVC ratio, using the mapping functions above
results = results.assign(
    FEV1_Severity=results['FEV1_Perc_Pred'].map(fev1_severity),
    Obstruction=results['fev1_fvc'].map(obstruction),
)

## Step 9: Drop duplicate rows or rows missing extracted PFT data

In [1]:
# PFT result columns: used to detect rows with no findings and to define duplicates
list_cols = [
    'Obstruction',
    'FEV1_Severity',
    'FEV1_Abs_Pre',
    'FEV1_Perc_Pred_Pre',
    'FEV1_FVC_Pre',
    'FEV1_Abs_Post',
    'FEV1_Perc_Pred_Post',
    'FEV1_FVC_Post',
    'FEV1_Qual_High',
    'FEV1_Qual_Low',
]

# Discard rows in which every PFT result column is missing
results = results.dropna(subset=list_cols, how='all')

# Turn each PFT result cell into its string form
# (presumably so the list-valued cells can be compared by drop_duplicates)
for col in list_cols:
    results[col] = results[col].apply(str)

# A duplicate is a row with the same patient, PFT date and PFT results
results = results.drop_duplicates(subset=['PatientICN', 'pft_date'] + list_cols)

# Show missing values as empty cells rather than the text 'None' in the output Excel file
results = results.replace('None', '')

## Step 10: Merge rows from same PFT with multiple notes containing values for different variables

In [63]:
# Columns for which the max value is kept when the rows being merged hold different values
columns_to_max = [
    'PatientSID',
    'Obstruction',
    'FEV1_Severity',
    'FEV1_Abs_Pre',
    'FEV1_Perc_Pred_Pre',
    'FEV1_FVC_Pre',
    'FEV1_Abs_Post',
    'FEV1_Perc_Pred_Post',
    'FEV1_FVC_Post',
    'FEV1_Qual_High',
    'FEV1_Qual_Low',
]

# This function ensures that we don't lose one of the snippets upon merge, but rather append them together
def concatenate_strings(series, sep=' '):
    """Join the distinct values of ``series`` into one string.

    Parameters
    ----------
    series : pandas.Series
        The values to merge (the snippets of the rows being collapsed).
    sep : str, default ' '
        Text placed between consecutive values. A space keeps the last token
        of one snippet from fusing with the first token of the next, and
        matches how ``extract_fev1_context`` joins its matches.

    Returns
    -------
    str
        The unique non-missing values in order of first appearance, joined
        by ``sep``. Empty string when there are none.
    """
    # dropna + str() so a missing or non-string cell cannot make str.join raise.
    return sep.join(str(value) for value in series.dropna().unique())

# Aggregation plan: 'max' for every column in columns_to_max ...
agg_funcs = dict.fromkeys(columns_to_max, 'max')

# ... except the snippets, which are concatenated rather than reduced to a single "max" value
agg_funcs['Snippet'] = concatenate_strings

# Collapse all rows sharing a patient and PFT date into one row, keeping first-seen group order
pft_groups = results.groupby(['PatientICN', 'pft_date'], sort=False)
results = pft_groups.agg(agg_funcs).reset_index()

## Step 11: Export data to Excel for validation/analysis

In [67]:
# Columns written to the workbook, in output order
columns_to_export = [
    'Snippet',
    'PatientICN',
    'PatientSID',
    'pft_date',
    'Obstruction',
    'FEV1_Severity',
    'FEV1_Abs_Pre',
    'FEV1_Perc_Pred_Pre',
    'FEV1_FVC_Pre',
    'FEV1_Abs_Post',
    'FEV1_Perc_Pred_Post',
    'FEV1_FVC_Post',
    'FEV1_Qual_High',
    'FEV1_Qual_Low',
]

# Output location: the directory (with trailing slash) followed by the file name
output_dir = '[Insert Your Directory Here]/'
file_name = '[Insert Filename Here].xlsx'
full_path = f'{output_dir}{file_name}'
to_export = results

# Write the selected columns to an .xlsx file without the dataframe index
to_export.to_excel(full_path, index=False, columns=columns_to_export)