## Step 1: Import packages

In [1]:
import pandas as pd
import re

## Step 2: Import dataset and perform initial preprocessing steps

In [2]:
# Read in dataset from Excel
df = pd.read_excel('[Insert Your Directory Here]/[Insert Your File Name Here].xlsx')

# Convert ReportText to string and collapse every run of whitespace into a single space.
# r'\s+' already matches carriage returns and newlines, so one replacement is sufficient.
df['ReportText'] = (
    df['ReportText']
    .astype('str')
    .str.replace(r'\s+', ' ', regex=True)
)

# Convert PFT date column to date data type.
# pd.to_datetime first coerces the column to datetime64, so the .dt accessor also works
# when Excel delivered the dates as text/object values instead of true datetimes.
df['pft_date'] = pd.to_datetime(df['pft_date']).dt.date

## Step 3: Create the snippet extraction function after identifying the snippet's start phrase by reviewing a random sample of 100 notes with:

```
n = 1
separator = '-'*100
# Shuffle all rows, then review the first 100 of the shuffled frame
for _, note_row in df.sample(frac=1).head(100).iterrows():
    print(f"Row Number: {n}")
    print(f"Note: {note_row['ReportText']}")
    print(separator)
    n += 1
```

In [4]:
# Function to create snippet based on template start phrase
def extract_fev1_context(text):
    """Return the spirometry snippet(s) found in a note.

    A snippet is one arbitrary (non-newline) character, followed by the phrase
    'Pulmonary Function Test Spirometry' (matched case-insensitively), followed
    by up to 500 further non-newline characters.

    Parameters
    ----------
    text : str
        Note text to search.

    Returns
    -------
    str
        All snippets joined by a single space, or '' when the phrase is not found.
    """
    snippet_regex = re.compile(r'.Pulmonary Function Test Spirometry.{0,500}', re.IGNORECASE)
    return ' '.join(hit.group(0) for hit in snippet_regex.finditer(text))

## Step 4: Run the `ReportText` column through the function to generate a new `Snippet` column containing each note's snippet

In [5]:
# Take an independent deep copy, then attach each note's extracted snippet as 'Snippet'
df = df.copy(deep=True).assign(
    Snippet=lambda frame: frame['ReportText'].apply(extract_fev1_context)
)

## Step 5: Initialize new dataframe containing only rows that have a snippet

In [7]:
# Boolean mask of notes whose snippet extraction produced any text
has_snippet = df['Snippet'] != ''
notes_with_fev = df.loc[has_snippet].reset_index(drop=True)

##### You can validate snippets with the following code:
```
n = 1
separator = '-'*100
# Shuffle all rows, then compare the full note with its snippet for the first 100
for _, note_row in notes_with_fev.sample(frac=1).head(100).iterrows():
    print(f"Row Number: {n}")
    print(f"Note: {note_row['ReportText']}")
    print(note_row['Snippet'])
    print(separator)
    n += 1
```

## Step 6: Initialize the PFT Classification Function

In [11]:
def classify_fev1(row):
    """Extract spirometry values from a note's snippet.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide a 'Snippet' entry holding the snippet text.

    Returns
    -------
    pandas.Series
        Entries 'FEV1_Abs_Pre', 'FEV1_Perc_Pred_Pre', 'FEV1_FVC_Pre',
        'FEV1_Qual_High' and 'FEV1_Qual_Low'. Each entry is a one-element
        list, or None when nothing was extracted for it.
    """
    text = row['Snippet']
    # A non-string snippet (e.g. a missing value) is treated as "nothing to extract"
    if not isinstance(text, str):
        text = ''

    # Initialize variables
    fev1_abs = []
    fev1_perc_pred = []
    fev1_fvc = []
    # No qualitative extraction is implemented in this function; these stay empty so the
    # corresponding output entries are None (previously these names were undefined and
    # every call raised NameError).
    fev1_qual_hi = []
    fev1_qual_lo = []

    # FEV1 absolute value: first decimal number following "FEV1"
    fev1_abs_pattern = re.compile(r'FEV1.*?(\d*\.\d+)', re.IGNORECASE)
    fev1_abs_pattern_results = fev1_abs_pattern.findall(text)

    if fev1_abs_pattern_results:
        fev1_abs.append(fev1_abs_pattern_results[0])

    # FEV1 % predicted: first 2-3 digit number directly followed by '%' that appears
    # after the FEV1 decimal value
    fev1_perc_pred_pattern = re.compile(r'FEV1.*?\d*\.\d+.*?(\d{2,3})%', re.IGNORECASE)
    fev1_perc_pred_pattern_results = fev1_perc_pred_pattern.findall(text)

    if fev1_perc_pred_pattern_results:
        fev1_perc_pred.append(fev1_perc_pred_pattern_results[0])

    # FVC absolute value: first decimal number following "FVC"
    # NOTE(review): this pattern also matches the "FVC" inside text such as "FEV1/FVC";
    # confirm against the note template that the first hit is the FVC row.
    fvc_abs_pattern = re.compile(r'FVC.*?(\d*\.\d+)', re.IGNORECASE)
    fvc_abs_pattern_results = fvc_abs_pattern.findall(text)

    # FEV1/FVC ratio as a percentage, computed from the two absolute values
    if fev1_abs_pattern_results and fvc_abs_pattern_results:
        fev1_value = float(fev1_abs_pattern_results[0])
        fvc_value = float(fvc_abs_pattern_results[0])
        # Skip a zero FVC (e.g. "0.00") instead of raising ZeroDivisionError
        if fvc_value > 0:
            # Round after scaling to percent: round(x, 2) * 100 produced float artifacts
            # such as 56.99999999999999, which truncate to 56 when later cast to int.
            fev1_fvc.append(float(round(100 * fev1_value / fvc_value)))

    return pd.Series({'FEV1_Abs_Pre': fev1_abs if fev1_abs else None,
                      'FEV1_Perc_Pred_Pre': fev1_perc_pred if fev1_perc_pred else None,
                      'FEV1_FVC_Pre': fev1_fvc if fev1_fvc else None,
                      'FEV1_Qual_High': fev1_qual_hi if fev1_qual_hi else None,
                      'FEV1_Qual_Low': fev1_qual_lo if fev1_qual_lo else None
                     })

## Step 6 (continued): Run dataframe through the PFT extraction function

In [12]:
# Extract PFT values row by row, then attach them as new columns next to the notes
pft_values = notes_with_fev.apply(classify_fev1, axis=1)
results = notes_with_fev.join(pft_values)

## Step 7: Extract values from FEV1 % predicted and FEV1:FVC pre-BD

In [14]:
def extract_value(nested_list):
    """Return the first element of an extracted-value list as an integer.

    Parameters
    ----------
    nested_list : list or None
        One-element list holding a numeric value or numeric string, or a
        missing value (None / NaN) when nothing was extracted.

    Returns
    -------
    int or None
        The first element rounded to the nearest integer, or None when the
        input is missing, empty, or not subscriptable.
    """
    if nested_list is None:
        return None
    try:
        first = nested_list[0]
    except (TypeError, IndexError, KeyError):
        # NaN floats are not subscriptable and empty lists have no first element
        return None
    # Round instead of truncating: a float artifact such as 56.99999999999999
    # must map to 57, not 56.
    return round(float(first))
    
# Create new variables 'FEV1_Perc_Pred' and 'fev1_fvc' to hold extracted quantitative values
# (target column -> source column holding the one-element lists)
value_sources = {'FEV1_Perc_Pred': 'FEV1_Perc_Pred_Pre', 'fev1_fvc': 'FEV1_FVC_Pre'}
for target_col, source_col in value_sources.items():
    results[target_col] = results[source_col].apply(extract_value)

## Step 8: Create mapping functions to map quantitative values to the standard clinical definitions of obstruction and severity of obstruction

In [15]:
def fev1_severity(value):
    """Map an FEV1 % predicted value to a severity label.

    Thresholds are half-open so every number receives a label, including
    non-integers such as 79.5 that previously fell between the integer ranges
    and returned None.

    Parameters
    ----------
    value : int, float or None
        FEV1 % predicted; None or NaN when no value was extracted.

    Returns
    -------
    str or None
        "Normal" (>= 80), "Mild" (>= 70), "Moderate" (>= 60),
        "Moderately Severe" (>= 50), "Severe" (>= 35), "Very Severe" (< 35),
        or None for a missing value.
    """
    # NaN is the only value not equal to itself
    if value is None or value != value:
        return None
    if value >= 80:
        return "Normal"
    if value >= 70:
        return "Mild"
    if value >= 60:
        return "Moderate"
    if value >= 50:
        return "Moderately Severe"
    if value >= 35:
        return "Severe"
    return "Very Severe"
    
def obstruction(value):
    """Map an FEV1:FVC percentage to "Normal" or "Reduced".

    Parameters
    ----------
    value : int, float or None
        FEV1:FVC ratio in percent; None or NaN when no value was extracted.

    Returns
    -------
    str or None
        "Normal" for values >= 70, "Reduced" for values < 70, and None for a
        missing value (previously None raised TypeError).
    """
    # NaN is the only value not equal to itself
    if value is None or value != value:
        return None
    return "Normal" if value >= 70 else "Reduced"

# Derive the categorical variables 'FEV1_Severity' and 'Obstruction' from the numeric
# FEV1 % predicted and FEV1:FVC columns via the mapping functions
results = results.assign(
    FEV1_Severity=results['FEV1_Perc_Pred'].map(fev1_severity),
    Obstruction=results['fev1_fvc'].map(obstruction),
)

## Step 9: Drop duplicate rows or rows missing extracted PFT data

In [1]:
# Columns holding extracted PFT values; also used as the duplicate-detection keys
list_cols = [
    'Obstruction',
    'FEV1_Severity',
    'FEV1_Abs_Pre',
    'FEV1_Perc_Pred_Pre',
    'FEV1_FVC_Pre',
    'FEV1_Qual_High',
    'FEV1_Qual_Low',
]

# Keep only rows where at least one PFT value was extracted
results = results.dropna(subset=list_cols, how='all')

# Stringify the value columns (they hold lists, which drop_duplicates cannot hash)
for col in list_cols:
    results[col] = results[col].map(str)

# Drop duplicate PFT results: same patient, same PFT date, same extracted values
dedupe_keys = ['PatientICN', 'pft_date'] + list_cols
results = results.drop_duplicates(subset=dedupe_keys)

# Blank out cells equal to 'None' so the exported Excel file is easier to read
results = results.replace('None', '')

## Step 10: Merge rows from same PFT with multiple notes containing values for different variables

In [17]:
# Columns for which the max value is kept when merged rows disagree
columns_to_max = [
    'PatientSID',
    'Obstruction',
    'FEV1_Severity',
    'FEV1_Abs_Pre',
    'FEV1_Perc_Pred_Pre',
    'FEV1_FVC_Pre',
    'FEV1_Qual_High',
    'FEV1_Qual_Low',
]

# Keeps every distinct snippet when rows are merged, rather than picking just one of them
def concatenate_strings(series):
    """Join the distinct strings of `series`, in first-seen order, with no separator."""
    distinct_in_order = dict.fromkeys(series)
    return ''.join(distinct_in_order)

# Aggregation plan: 'max' for every value column, and concatenation for 'Snippet'
# so that no snippet is lost when rows are merged
agg_funcs = {**dict.fromkeys(columns_to_max, 'max'), 'Snippet': concatenate_strings}

# Collapse rows belonging to the same PFT (same patient and date) into one row,
# keeping groups in their original order of appearance
pft_groups = results.groupby(['PatientICN', 'pft_date'], sort=False)
results = pft_groups.agg(agg_funcs).reset_index()

## Step 11: Export data to Excel for validation/analysis

In [21]:
# Columns written to the workbook, in output order
columns_to_export = [
    'Snippet',
    'PatientICN',
    'PatientSID',
    'pft_date',
    'Obstruction',
    'FEV1_Severity',
    'FEV1_Abs_Pre',
    'FEV1_Perc_Pred_Pre',
    'FEV1_FVC_Pre',
    'FEV1_Qual_High',
    'FEV1_Qual_Low',
]

# Output location: directory (with trailing slash) followed by the file name
output_dir = '[Insert Your Directory Here]/'
file_name = '[Insert File Name Here].xlsx'
full_path = f'{output_dir}{file_name}'
to_export = results

# Write the selected columns to an .xlsx file without the index column
to_export.to_excel(full_path, index=False, columns=columns_to_export)