## Step 1: Import Packages

In [1]:
import re
import pandas as pd

## Step 2: Import Dataset and Perform Initial Preprocessing

In [2]:
# Import data from Excel into a pandas DataFrame. Later cells reference the columns
# 'PatientICN', 'PatientSID', 'pft_date' and 'ReportText', so the file must include them.
df = pd.read_excel('[Insert Your Directory Here]/[Insert File Name Here].xlsx')

# Convert ReportText to string and collapse every run of whitespace (spaces, tabs,
# carriage returns, newlines) into a single space. `\s+` already matches newlines, so
# no separate newline replacement is needed afterwards.
df['ReportText'] = df['ReportText'].astype('str').str.replace(r'\s+', ' ', regex=True)

## Step 3: Create Snippet Extraction Function after identifying template snippets with the following code:
```
n = 1
for index,row in df.sample(frac=1)[:100].iterrows():
    print(f"Row Number: {n}")
    print(f"Note Content: {row['ReportText']}")
    print(f"PFT Date: {row['pft_date']}")
    print('-'*100)
    n+=1
```

In [5]:
def extract_pft_context(text):
    """Collect the 'Lab Data' sections of a note.

    Each section starts at a case-insensitive occurrence of 'Lab Data' and extends
    up to 1000 further characters (it stops early at a newline, which `.` does not
    match because DOTALL is not set).

    Args:
        text: Note text to search.

    Returns:
        All matched sections concatenated in order with no separator, or an empty
        string when the note contains no match.
    """
    lab_sections = re.finditer(r'Lab Data.{0,1000}', text, re.IGNORECASE)
    return ''.join(section.group(0) for section in lab_sections)

## Step 4: Generate `Snippet` column by applying above function to the `ReportText` column

In [6]:
# Run every note through the snippet extractor and store the result in a new Snippet column
df['Snippet'] = df['ReportText'].apply(extract_pft_context)

# Keep only the notes that produced a snippet, renumbering the remaining rows from zero
notes_with_fev = df.loc[df['Snippet'].ne('')].reset_index(drop=True)

##### You can validate snippets with the following code:
```
n = 1
for index,row in notes_with_fev.sample(frac=1)[:100].iterrows():
    print(f"Row Number: {n}")
    print(row['ReportText'])
    print(row['pft_date'])
    print('-'*100)
    n+=1
```

## Step 5: Initialize PFT Classification Function

In [9]:
def classify_pft(text):
    """Extract FEV1 and FEV1/FVC values from a snippet with regular expressions.

    All three searches are case-insensitive and only the first match of each is used:

    * FEV1_Abs_Pre: the first number containing a decimal point that follows
      'FEV1' after 'Lab Data'.
    * FEV1_Perc_Pred_Pre: the first 2-3 digit number (optionally wrapped in
      parentheses and/or followed by '%') found after that decimal number.
    * FEV1_FVC_Pre: the first 2-3 digit number following 'FEV1/FVC'.

    Args:
        text: Snippet text to search.

    Returns:
        A pandas Series indexed by the three names above. Each value is a
        one-element list holding the captured string, or None when the
        corresponding pattern did not match.
    """
    def first_capture(regex):
        # Wrap the first match's captured group in a one-element list; None when no match.
        hit = re.search(regex, text, re.IGNORECASE)
        return [hit.group(1)] if hit else None

    return pd.Series({
        'FEV1_Abs_Pre': first_capture(r'LAB DATA.*?FEV1.*?(\d*\.\d+)'),
        'FEV1_Perc_Pred_Pre': first_capture(r'Lab Data.*?FEV1.*?\d*\.\d+.*?\(?(\d{2,3})%?\)?'),
        'FEV1_FVC_Pre': first_capture(r'FEV1/FVC.*?(\d{2,3})%?'),
    })

## Step 6: Run dataframe through the PFT extraction function

In [10]:
# Extract the PFT values from each snippet and attach them as new columns.
# The snippet-filtered frame built in Step 4 is named `notes_with_fev`.
results = notes_with_fev.join(notes_with_fev['Snippet'].apply(classify_pft))

### Fix date column

In [11]:
# Format the PFT date as 'YYYY-MM-DD' text. Going through pd.to_datetime and the
# vectorized .dt accessor leaves missing dates as NaN instead of raising, and lets
# this cell be re-run after the column has already been converted to strings.
results['pft_date'] = pd.to_datetime(results['pft_date']).dt.strftime('%Y-%m-%d')

## Step 7: Extract values from FEV1 % predicted, FEV1:FVC pre-BD variables

In [12]:
def extract_perc_pred_value(nested_list):
    """Return the first element of `nested_list` converted to int.

    Args:
        nested_list: A sequence whose first element is an integer-like string,
            or a missing value (None or NaN).

    Returns:
        The first element as an int, or None when the input is None, NaN, or
        an empty sequence.

    Raises:
        ValueError: If the first element cannot be parsed as an integer.
    """
    # Missing cells can arrive as None or as a float NaN; neither can be indexed.
    is_missing = nested_list is None or isinstance(nested_list, float)
    if is_missing or len(nested_list) == 0:
        return None
    return int(nested_list[0])
    
# Add the 'FEV1_Perc_Pred' column holding the numeric value pulled out of 'FEV1_Perc_Pred_Pre'
results = results.assign(
    FEV1_Perc_Pred=results['FEV1_Perc_Pred_Pre'].apply(extract_perc_pred_value)
)

def extract_fev1_fevc_value(nested_list):
    """Return the first element of `nested_list` converted to float.

    Args:
        nested_list: A sequence whose first element is a numeric string, or a
            missing value (None or NaN).

    Returns:
        The first element as a float, or None when the input is None, NaN, or
        an empty sequence.

    Raises:
        ValueError: If the first element cannot be parsed as a number.
    """
    # Missing cells can arrive as None or as a float NaN; neither can be indexed.
    is_missing = nested_list is None or isinstance(nested_list, float)
    if is_missing or len(nested_list) == 0:
        return None
    return float(nested_list[0])

# Add the 'fev1_fvc' column holding the numeric value pulled out of 'FEV1_FVC_Pre'
results = results.assign(
    fev1_fvc=results['FEV1_FVC_Pre'].apply(extract_fev1_fevc_value)
)

In [13]:
def fev1_severity(value):
    """Map a numeric FEV1 value to a severity label.

    The bands are contiguous, so fractional values such as 79.5 fall into the
    band below the next threshold instead of matching no band at all.

    Args:
        value: Number to classify, or a missing value (None / NaN).

    Returns:
        "Normal" (>= 80), "Mild" (70 to < 80), "Moderate" (60 to < 70),
        "Moderately Severe" (50 to < 60), "Severe" (35 to < 50),
        "Very Severe" (< 35), or None when `value` is missing.
    """
    if value is None or pd.isna(value):
        return None
    # Thresholds are checked from highest to lowest, so each test only needs a lower bound.
    if value >= 80:
        return "Normal"
    if value >= 70:
        return "Mild"
    if value >= 60:
        return "Moderate"
    if value >= 50:
        return "Moderately Severe"
    if value >= 35:
        return "Severe"
    return "Very Severe"

def obstruction(value):
    """Label a numeric value as "Normal" or "Reduced" around a cut-off of 70.

    Args:
        value: Number to classify (applied to the 'fev1_fvc' column; presumably
            a percentage, confirm against the source notes).

    Returns:
        "Reduced" when value < 70, "Normal" when value >= 70, and None when
        neither comparison holds (e.g. NaN).
    """
    if value < 70:
        return "Reduced"
    if value >= 70:
        return "Normal"
    return None
    
# Label every row with its FEV1 severity band and its FEV1/FVC obstruction category
results = results.assign(
    FEV1_Severity=results['FEV1_Perc_Pred'].map(fev1_severity),
    Obstruction=results['fev1_fvc'].map(obstruction),
)

## Step 10: Drop duplicate rows and rows with no extracted PFT data

In [14]:
# Columns holding the extracted PFT values; used both to drop empty rows and to de-duplicate
list_cols = ['Obstruction', 'FEV1_Severity', 'FEV1_Abs_Pre', 'FEV1_Perc_Pred_Pre', 'FEV1_FVC_Pre']

# Drop rows in which every extracted PFT column is missing. The explicit copy makes the
# column assignments below act on an independent frame rather than a possible view.
results = results.dropna(subset=list_cols, how='all').copy()

# Convert the extracted columns to strings so that list-valued cells (which
# drop_duplicates cannot hash) can be compared
for col in list_cols:
    results[col] = results[col].apply(str)

# Drop duplicate PFT results based on patient ID, PFT date and the extracted columns
results = results.drop_duplicates(subset=['PatientICN', 'pft_date'] + list_cols)

# Missing values were stringified above as 'None' (or 'nan' for NaN); blank both out
# for ease of readability in the output Excel file
results = results.replace(['None', 'nan'], '')

1409
1323
1309


## Step 11: Collapse rows from the same PFT (same patient and PFT date) when multiple notes contain values for different variables

In [15]:
# Columns for which the largest value is kept when the rows being merged disagree
columns_to_max = [
    'PatientSID',
    'Obstruction',
    'FEV1_Severity',
    'FEV1_Abs_Pre',
    'FEV1_Perc_Pred_Pre',
    'FEV1_FVC_Pre',
]

# Keeps every distinct snippet when rows are merged, appending them together
# instead of discarding all but one.
def concatenate_strings(series):
    """Join the distinct values of `series`, in first-seen order, with no separator.

    Args:
        series: pandas Series of strings.

    Returns:
        A single string made of each distinct value concatenated once.
    """
    distinct_values = series.drop_duplicates()
    return ''.join(distinct_values)

# Aggregation plan: every column in columns_to_max keeps its maximum across the merged
# rows, while Snippet is built by concatenating the distinct snippets (not by taking a max)
agg_funcs = dict.fromkeys(columns_to_max, 'max')
agg_funcs.update(Snippet=concatenate_strings)

# Collapse all rows sharing a patient and PFT date into one row, keeping first-seen group order
results = (
    results
    .groupby(['PatientICN', 'pft_date'], sort=False)
    .agg(agg_funcs)
    .reset_index()
)

## Step 12: Export dataframe to Excel

In [18]:
# Columns written to the workbook, in output order
columns_to_export = [
    'Snippet',
    'PatientICN',
    'PatientSID',
    'pft_date',
    'Obstruction',
    'FEV1_Severity',
    'FEV1_Abs_Pre',
    'FEV1_Perc_Pred_Pre',
    'FEV1_FVC_Pre',
]

# Destination of the export: directory (including its trailing slash) followed by the file name
output_dir = '[Insert Your Directory Here]/'
file_name = '[Insert File Name Here].xlsx'
full_path = f'{output_dir}{file_name}'

# Write the selected columns to an .xlsx file, leaving out the index column
results.to_excel(full_path, index=False, columns=columns_to_export)