# Preprocessing of clinical data

Preprocessing of the clinical data from the EHR (electronic health record).

## Data import

In [1]:
import pandas as pd

# Switch for the cells wrapped in `if execute_special_cell:` further down:
# keep it False to skip them, set it to True to run them.
execute_special_cell = False

# Load the raw CSV export into a dataframe
df = pd.read_csv(filepath_or_buffer="../../data/raw/ResectMap_DATA_2023-11-13_2306.csv")

  df = pd.read_csv("../../data/raw/ResectMap_DATA_2023-11-13_2306.csv")


## Filter the data

1. Filter the dataframe for only the most recent information (redcap_repeat_instance == 1)
2. Check that there is no overlapping information per participant

In [2]:
# Rows without a repeat instance get instance 1 (time-point 1)
df["redcap_repeat_instance"] = df["redcap_repeat_instance"].fillna(1)

# Only keep the rows that belong to instance 1
df_subset = df[df["redcap_repeat_instance"] == 1]

# Bookkeeping columns that are not part of the overlap check
columns_to_exclude = ['record_id', 'redcap_repeat_instrument', 'redcap_repeat_instance']
columns_to_check = [col for col in df_subset.columns if col not in columns_to_exclude]

# Count the non-null entries of every column per participant in one vectorised pass;
# a count above 1 means two rows carry a value for the same column.
non_null_counts = df_subset.groupby('record_id')[columns_to_check].count()
overlapping = non_null_counts > 1

# Pre-bind the loop names so the `del` below cannot raise NameError when nothing is flagged
record_id = overlap_flags = problematic_columns = None

# Report only the participants that have at least one overlapping column
for record_id, overlap_flags in overlapping[overlapping.any(axis=1)].iterrows():
    problematic_columns = overlapping.columns[overlap_flags.to_numpy()].tolist()
    print(f"Participant {record_id} has more than 1 row of information in columns: {', '.join(problematic_columns)}")

del columns_to_exclude, columns_to_check, non_null_counts, overlapping
del record_id, overlap_flags, problematic_columns

## Flatten the dataframe

Flatten the dataframe from a long dataframe to a wide dataframe with only 1 participant per row.

In [3]:
# Conditional cell execution
if execute_special_cell:

    def _first_non_null(values):
        """Return the first non-null entry of one column within one participant's rows, or None."""
        present = values.dropna()  # drop the nulls once and reuse the result
        return present.iloc[0] if not present.empty else None

    # Group by 'record_id' and keep the first non-null value of each column
    wide_df = df_subset.groupby('record_id').agg(_first_non_null)

    # Reset the index so 'record_id' becomes a regular column again
    wide_df = wide_df.reset_index()

    # wide_df now holds one row per 'record_id'
    print(wide_df)

    # Save dataframe for further use
    wide_df.to_pickle("../../data/processed/wide_df.pkl")

In [4]:
import pandas as pd

# Load the wide dataframe from its pickle file; that file is written by the
# cell guarded with `execute_special_cell`, so it must already exist on disk.
wide_df = pd.read_pickle(filepath_or_buffer="../../data/processed/wide_df.pkl")

## Check for wrong datatypes

In [5]:
# List every column with its non-null count and dtype
wide_df.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 677 entries, 0 to 676
Data columns (total 4677 columns):
 #     Column                               Non-Null Count  Dtype  
---    ------                               --------------  -----  
 0     record_id                            677 non-null    object 
 1     redcap_repeat_instrument             677 non-null    object 
 2     redcap_repeat_instance               677 non-null    float64
 3     mrn                                  677 non-null    float64
 4     study_site                           677 non-null    float64
 5     study_site_other                     0 non-null      object 
 6     ny_num_yn                            676 non-null    float64
 7     ny_num                               182 non-null    object 
 8     other_study_ids                      158 non-null    object 
 9     inc_crit                             677 non-null    float64
 10    dem_yob                              677 non-null    float64
 11    ag

## Remove unnecessary columns

In [6]:
# Columns removed from the wide dataframe in this step
columns_to_exclude = [
    "redcap_repeat_instrument",
    "redcap_repeat_instance",
    "mrn",
    "study_site_other",
    "ny_num_yn",
    "ny_num",
]

###### IN PROGRESS ######

wide_df = wide_df.drop(columns=columns_to_exclude)

del columns_to_exclude

## Check for NAs

In [7]:
# Largest share of missing values a column may have before it is flagged (0.9 = 90%)
threshold = 0.9

# Share of missing values per column, as a fraction between 0 and 1
nan_percentages = wide_df.isna().mean()

# Names of the columns whose missing share is strictly above the threshold;
# this list stays in the namespace because a later cell drops these columns.
columns_above_threshold = [
    column_name
    for column_name, missing_share in nan_percentages.items()
    if missing_share > threshold
]

# How many columns were flagged
count_above_threshold = len(columns_above_threshold)

print(f"Number of columns with NaN percentage above {threshold * 100}%: {count_above_threshold}")

del threshold, nan_percentages, count_above_threshold

Number of columns with NaN percentage above 90.0%: 3541


### Delete columns with NA % above threshold

In [8]:
# Remove the columns listed in `columns_above_threshold`
wide_df = wide_df.drop(columns=columns_above_threshold)

## Add new features

### Number of measurements

Because the data is filtered to keep only "redcap_repeat_instance" == 1, we lose the information on the other time points.
To capture that information, new variables/columns are created. These columns count the number of measurements of each type per patient.

In [9]:
# Count the rows per participant ('record_id') and instrument ('redcap_repeat_instrument');
# combinations without any row are filled with 0.
pivoted_df = df.pivot_table(
    index='record_id',
    columns='redcap_repeat_instrument',
    values='redcap_repeat_instance',
    aggfunc='count',
).fillna(0)

# Preview the counts
print(pivoted_df.head())

# Attach the count columns to the wide dataframe, matching on 'record_id'
wide_df = wide_df.merge(pivoted_df, on='record_id', how='left')

del pivoted_df

redcap_repeat_instrument  eeg  engel_outcomes  mri  neuroanatomical_labeling  \
record_id                                                                      
RSCT000111                7.0             1.0  0.0                       0.0   
RSCT000208                5.0             1.0  0.0                       2.0   
RSCT000277                1.0             1.0  1.0                       1.0   
RSCT000508                0.0             1.0  0.0                       0.0   
RSCT000749                2.0             1.0  2.0                       2.0   

redcap_repeat_instrument  neuropsychological_testing  surgical_information  
record_id                                                                   
RSCT000111                                       2.0                   1.0  
RSCT000208                                       2.0                   8.0  
RSCT000277                                       1.0                   1.0  
RSCT000508                                       1.0  

### Age of seizure onset

We extract the age of seizure onset from a text column

In [10]:
import re

# Compiled once at definition time instead of on every call.
# Group 1 is the age expression that follows one of the "onset" key phrases.
# The combined "<n> years <m> months" alternative is listed first: when it came
# after the bare `\d+` alternative it could never win.
_AGE_OF_ONSET_PATTERN = re.compile(r'''
    (?:age\s*of\s*seizure\s*onset|age\s*at\s*seizure\s*onset|age\s*of\s*onset|
     seizure\s*onset\s*age|seizure\s*age\s*of\s*onset|seizure\s*onset|
     seizures*\s*began\s*(?:at|at\s*the\s*age\s*of)*|(?:1st|first)\s*seizure|
     onset\s*at\s*age|age\s*(?=:)|seizure\s*onset\s*:\s*age)\s*:*\s*~*\s*
    ((?:\d+\s*(?:years?|yrs)\s*\d+\s*months?)|  # combined years and months
     ([0-9,.]+(?:\s*(?:-|to|or)\s*[0-9,.]+)?\s*(?:y/o|years?|months?|days?|weeks?))|  # value or range with a unit
     in-utero|birth|(\d+\s*\+\s*\d+\s*months?)|\d+|(?<=\bage\s)\d+)  # special words or a bare number
''', re.IGNORECASE | re.VERBOSE)


def extract_age_of_onset(text):
    """Extract the age of seizure onset, in years, from a free-text note.

    Parameters
    ----------
    text : str
        The note to search.

    Returns
    -------
    float, int or None
        The age in years as computed by ``convert_units_to_years`` (0 for
        birth / in-utero), or None when no age expression is found.
    """
    match = _AGE_OF_ONSET_PATTERN.search(text)
    if match:
        return convert_units_to_years(match.group(1))

    # Phrases the pattern cannot capture (e.g. "at" sits between the key phrase and "birth")
    lowered = text.lower()
    if 'first seizure at birth' in lowered or 'seizures began at birth' in lowered:
        return 0

    return None


def _scale_to_years(value, unit_text):
    """Convert ``value`` to years according to the unit word found in ``unit_text``.

    Months, weeks and days are converted; anything else is treated as years.
    """
    unit_text = unit_text.lower()
    if 'month' in unit_text:
        return value / 12
    if 'week' in unit_text:
        return value / 52.1775
    if 'day' in unit_text:
        return value / 365.25
    return value


def convert_units_to_years(age_with_units):
    """Convert an age expression such as "18 months" or "2-3 years" to years.

    Parameters
    ----------
    age_with_units : str
        Age expression, optionally with a unit or a range.

    Returns
    -------
    float, int or None
        The age in years; 0 for "birth" / "in-utero"; the mean of the numbers
        for a range; None when no number can be read.
    """
    lowered = age_with_units.lower()

    # Handle special cases for birth and in-utero
    if 'birth' in lowered or 'in-utero' in lowered:
        return 0

    # Handle combined years and months (e.g., "43yrs 11months")
    combined_pattern = re.match(r'(\d+)\s*(?:years?|yrs)\s*(\d+)\s*months?', age_with_units, re.IGNORECASE)
    if combined_pattern:
        years, months = map(int, combined_pattern.groups())
        return years + months / 12

    # Handle a range of ages: average the numbers, then apply the unit.
    # The check runs on the lowercased text because the extraction pattern is case-insensitive.
    if '-' in lowered or 'to' in lowered or 'or' in lowered:
        numbers = [float(n) for n in re.findall(r'\d+(?:\.\d+)?', age_with_units)]
        if not numbers:
            return None
        # Weeks and days are converted as well, not only months
        return _scale_to_years(sum(numbers) / len(numbers), lowered)

    # Regular age extraction: a single number with an optional unit
    match = re.match(r'(\d+(?:[.,]\d+)?)\s*(y/o|years?|months?|days?|weeks?)?', age_with_units, re.IGNORECASE)
    if not match:
        return None

    value, unit = match.groups()
    # Accept a decimal comma ("1,5"); float() only understands a decimal point
    value = float(value.replace(',', '.'))
    # No unit specified: assume years
    return _scale_to_years(value, unit or '')

In [11]:
# Parse the free-text 'seizure_class_notes' column into a new 'age_of_onset' column;
# missing notes stay missing.
def _onset_from_note(note):
    """Run the age-of-onset extraction on one note, skipping missing values."""
    if pd.notna(note):
        return extract_age_of_onset(str(note))
    return None


wide_df['age_of_onset'] = wide_df['seizure_class_notes'].apply(_onset_from_note)

# Show the extracted age next to the note it came from
result = wide_df[["age_of_onset", "seizure_class_notes"]]
result

Unnamed: 0,age_of_onset,seizure_class_notes
0,21.0,- Age of seizure onset: 21 y/o
1,3.0,- Age of seizure onset: 3 y/o; - Product of ...
2,,- 7/2007: experienced an episode of status ep...
3,20.0,- Age of seizure onset: 20 y/o.
4,,"- Reported triggers: exercise, dehydration, s..."
...,...,...
672,1.5,- Age of seizure onset: 18 months of age in t...
673,,"- Twin A of 2 twins, 36 weeks gestation; - L..."
674,,No information reported in Epic
675,35.0,- Age of seizure onset: 35 y/o; - Febrile se...


In [12]:
# Number of rows for which an age of onset was extracted
wide_df['age_of_onset'].notna().sum()

431

In [13]:
# Conditional cell execution
if not execute_special_cell:
    # Default path: read the hand-written table and attach its
    # 'age_of_onset_hw' column to wide_df, matching on 'record_id'
    age_of_onset_df = pd.read_csv("../../data/processed/age_of_onset_hw_done.csv")

    wide_df = wide_df.merge(
        age_of_onset_df[['record_id', 'age_of_onset_hw']],
        on='record_id',
        how='left',
    )

    # Number of rows with a hand-written age of onset
    print(wide_df['age_of_onset_hw'].notna().sum())

else:
    # Special path: export the extracted ages together with their source notes,
    # with the age column renamed to 'age_of_onset_hw'
    age_of_onset = wide_df[["record_id", "age_of_onset", "seizure_class_notes"]].rename(
        columns={"age_of_onset": "age_of_onset_hw"}
    )
    age_of_onset.to_csv("../../data/processed/age_of_onset_hw.csv")

454


### Seizure frequency

We extract the seizure frequency from a text column

In [14]:
import pandas as pd
import re

def convert_to_monthly(freq_str, age):
    """Convert a free-text seizure frequency to seizures per month.

    Parameters
    ----------
    freq_str : str
        Frequency such as "2/month", "3 per week", "2-4/day" or "10 lifetime".
        A range is replaced by its mean. Non-string values (e.g. NaN for a
        missing cell) yield None.
    age : float
        Age in years; only used to spread a "lifetime" count over the months lived.

    Returns
    -------
    float, int or None
        Seizures per month; 0 for the literal "0"; None when the text cannot
        be parsed, the unit is unknown, or a lifetime count has no positive age.
    """
    # Missing cells arrive as NaN/None rather than text
    if not isinstance(freq_str, str):
        return None

    freq_str = freq_str.strip()

    # Handling the case when the value is just '0'
    if freq_str == '0':
        return 0

    # Regular expression to extract numbers and unit, considering various formats
    match = re.match(r'(\d*\.?\d+)(?:\s*-\s*(\d*\.?\d+))?\s*(seizures?\/|seizures?\s+per\s+|per\s+|\/|\s+)?\s*(\w+)', freq_str, re.IGNORECASE)
    if not match:
        # Pattern not matched
        return None

    num1, num2, _, unit = match.groups()
    num1 = float(num1)
    num2 = float(num2) if num2 else num1  # If no range, use the single number

    # Mean of the range (equals the single number when there is no range)
    number = (num1 + num2) / 2

    # The pattern is case-insensitive, so the unit must be compared that way too
    unit = unit.lower()

    # Conversion rates to monthly frequency
    if unit in ('month', 'months', 'monthly', 'mo'):
        return number
    if unit in ('day', 'days', 'daily'):
        return number * 30  # Approximate days in a month
    if unit in ('week', 'weeks', 'weekly'):
        return number * 4.345  # Average weeks in a month
    if unit in ('year', 'years', 'yearly'):
        return number / 12  # Months in a year
    if unit == 'lifetime':
        # NaN fails the comparison, so a missing age yields None as well
        if age is not None and age > 0:
            # Convert lifetime frequency to monthly based on age
            return number / (age * 12)
        return None

    # Unknown unit
    return None

In [15]:
# Convert every row's 'seizure_freq' text (with that row's 'age') into a
# monthly frequency and store it in a new 'freq_per_month' column
def _monthly_frequency(row):
    """Compute the monthly seizure frequency for one row of wide_df."""
    return convert_to_monthly(row['seizure_freq'], row['age'])


wide_df['freq_per_month'] = wide_df.apply(_monthly_frequency, axis=1)

# Show the converted value next to the original text
result = wide_df[["freq_per_month", "seizure_freq"]]
result

Unnamed: 0,freq_per_month,seizure_freq
0,2.0000,2/month
1,1680.0000,56/day
2,1.5000,1.5/month
3,28.0000,28/month
4,2.0000,2/month
...,...,...
672,30.6700,30.67/month
673,90.0000,3/day
674,,-
675,10.8625,2.5/week


In [16]:
# Number of rows for which a monthly frequency could be computed
wide_df['freq_per_month'].notna().sum()

571

In [17]:
# Conditional cell execution
if not execute_special_cell:
    # Default path: read the hand-written table and attach its
    # 'freq_per_month_hw' column to wide_df, matching on 'record_id'
    seizure_freq_df = pd.read_csv("../../data/processed/freq_per_month_hw_done.csv")

    wide_df = wide_df.merge(
        seizure_freq_df[['record_id', 'freq_per_month_hw']],
        on='record_id',
        how='left',
    )

    # Number of rows with a hand-written monthly frequency
    print(wide_df['freq_per_month_hw'].notna().sum())

else:
    # Special path: export the computed frequencies together with their source text,
    # with the frequency column renamed to 'freq_per_month_hw'
    seizure_freq = wide_df[["record_id", "freq_per_month", "seizure_freq"]].rename(
        columns={"freq_per_month": "freq_per_month_hw"}
    )
    seizure_freq.to_csv("../../data/processed/freq_per_month_hw.csv")

571


### Date of last surgery

As technology changes over time, so will surgery outcome.
We extract the year of last surgery.

In [18]:
# Take the third '/'-separated part of 'eo_date_surg' and store it as a number
# in 'eo_year_surg' (presumably the year — TODO confirm the date format of the export)
wide_df['eo_year_surg'] = pd.to_numeric(
    wide_df['eo_date_surg'].str.split('/').str.get(2)
)

wide_df['eo_year_surg'].head()

0    2015.0
1    2001.0
2    2012.0
3    2012.0
4    2007.0
Name: eo_year_surg, dtype: float64

## Scaling variables

In [19]:
# Recode 'sex_gender': 1 (female) becomes 0 and 2 (male) becomes 1.
# `.map` turns every value that is not a key of the dict into NaN, so this cell
# is not idempotent: running it a second time maps 1 -> 0 and 0 -> NaN.
wide_df = wide_df.assign(sex_gender=wide_df["sex_gender"].map({1: 0, 2: 1}))

###### IN PROGRESS ######

# Scale age?

## Check for redundant columns

### Check for high correlation variables

In [20]:
# Correlation matrix of all non-object columns
correlation_matrix = wide_df.select_dtypes(exclude=['object']).corr()

# Set a threshold for high correlation (e.g., 0.8 for 80%)
threshold = 0.8

# Pull the absolute correlations into an array once; a per-element `.iloc`
# lookup inside the double loop is far slower on a wide frame.
column_names = correlation_matrix.columns
abs_correlations = correlation_matrix.abs().to_numpy()
n_columns = len(column_names)

# Every pair above the threshold, visiting only the upper triangle (j > i) so
# each pair is reported once. NaN correlations never pass the comparison.
# The comprehension keeps i and j out of the notebook namespace.
high_correlation_pairs = [
    (column_names[i], column_names[j])
    for i in range(n_columns)
    for j in range(i + 1, n_columns)
    if abs_correlations[i, j] > threshold
]

# Print the pairs of variables with a very high correlation
print(f"Pairs of variables with correlation above {threshold}:")
if high_correlation_pairs:
    print("\n".join(str(pair) for pair in high_correlation_pairs))

# No loop variables are left behind, so this `del` cannot fail when no pair is found
del correlation_matrix, threshold, column_names, abs_correlations, n_columns, high_correlation_pairs

Pairs of variables with correlation above 0.8:
('dem_yob', 'age')
('dem_yob', 'resect_pt_age')
('age', 'resect_pt_age')
('race___2', 'st_gen_motor_myotc')
('race___2', 'st_gen_motor_tonic')
('race___2', 'st_gen_motor_atonic')
('race___2', 'st_gen_motor_epispasms')
('lang1___1', 'lang1___2')
('lang2___3', 'lang2___4')
('lang2___5', 'st_gen_motor_myotc')
('lang2___5', 'st_gen_motor_tonic')
('lang2___5', 'st_gen_motor_atonic')
('lang2___5', 'st_gen_motor_epispasms')
('lang2___6', 'resect_multi_temp___1')
('lang2___6', 'resect_multi_occip___1')
('etoh_abuse', 'drug_abuse')
('neuro_exam_ptloc', 'phys_exam_ptloc')
('etio_struct_cva', 'st_gen_motor_clonic')
('etio_struct_cva', 'st_gen_motor_unspecified')
('etio_struct_iventhemo', 'st_gen_motor_clonic')
('etio_struct_iventhemo', 'st_gen_motor_unspecified')
('etio_struct_hypox_enceph', 'st_gen_motor_clonic')
('etio_struct_hypox_enceph', 'st_gen_motor_unspecified')
('etio_struct_dem', 'etio_struct_other_neuro')
('etio_struct_dem', 'st_focal_unkn

### Check for low variance

In [21]:
# Variance of every non-object column; `column_variances` is kept in the
# namespace because the next cell reuses it.
column_variances = wide_df.select_dtypes(exclude=['object']).var()

# Set a threshold for low variance
threshold = 0.1

# Columns whose variance is strictly below the threshold
low_variance_columns = column_variances[column_variances < threshold]

# Print the count of columns with low variance
count_low_variance_columns = len(low_variance_columns)
print(f"Number of columns with variance below {threshold}: {count_low_variance_columns}\n")

# Pre-bind the loop names so the `del` below cannot raise NameError when no column qualifies
column = variance = None

# Print the column names and their variances
print(f"Columns with variance below {threshold} (sorted by variance):")
for column, variance in low_variance_columns.sort_values().items():
    print(f"{column}: {variance}")

del threshold, low_variance_columns, count_low_variance_columns, column, variance

Number of columns with variance below 0.1: 445

Columns with variance below 0.1 (sorted by variance):
study_site: 0.0
st_unclassified___2: 0.0
st_unclassified___3: 0.0
st_unclassified___5: 0.0
st_unclassified___6: 0.0
resect_path_develop___9: 0.0
prev_aeds_name___4: 0.0
resect_path_infect___2: 0.0
prev_aeds_name___8: 0.0
resect_path_infect___1: 0.0
prev_aeds_name___19: 0.0
prev_aeds_name___22: 0.0
prev_aeds_name___24: 0.0
prev_aeds_name___26: 0.0
resect_path_tumor___4: 0.0
resect_path_les_vas___3: 0.0
epilepsy_history_complete: 0.0
resect_procedure___13: 0.0
resect_dx_electrodes___4: 0.0
resect_path_hpc___2: 0.0
resect_tbs_method_ver___4: 0.0
resect_atlp_location___7: 0.0
st_unclassified___1: 0.0
resect_amyhpc_app___1: 0.0
st_unkn_nonmotor___3: 0.0
st_unkn_epispasm___6: 0.0
st_unkn_gtc___2: 0.0
st_unkn_gtc___3: 0.0
st_unkn_gtc___4: 0.0
st_unkn_gtc___6: 0.0
st_unkn_motor___2: 0.0
st_unkn_motor___3: 0.0
st_unkn_motor___6: 0.0
st_unkn_clonic___2: 0.0
st_unkn_clonic___3: 0.0
st_unkn_clonic

In [22]:
# Names of the columns with a variance of exactly 0; this list stays in the
# namespace because the next cell drops these columns.
zero_variance_columns = column_variances[column_variances == 0].index.tolist()

# Count columns with a variance of 0
zero_variance_columns_count = len(zero_variance_columns)

# Pre-bind the loop name so the `del` below cannot raise NameError when the list is empty
column = None

# Print the count and names of columns with a variance of 0
print(f"Number of columns with a variance of 0: {zero_variance_columns_count}")
print("Columns with a variance of 0:")
for column in zero_variance_columns:
    print(column)

del zero_variance_columns_count, column_variances, column

Number of columns with a variance of 0: 119
Columns with a variance of 0:
study_site
demo_nyu_documents___1
demo_nyu_documents___2
demographics_complete
past_medical_history_complete
st_focal_bilat_tc___6
st_gen_motor___5
st_gen_motor___6
st_gen_nonmotor___3
st_gen_nonmotor___5
st_unkn_gtc___2
st_unkn_gtc___3
st_unkn_gtc___4
st_unkn_gtc___6
st_unkn_motor___2
st_unkn_motor___3
st_unkn_motor___6
st_unkn_clonic___2
st_unkn_clonic___3
st_unkn_clonic___6
st_unkn_tonic___2
st_unkn_tonic___4
st_unkn_tonic___6
st_unkn_myoclonic___3
st_unkn_myoclonic___4
st_unkn_myoclonic___6
st_unkn_atonic___3
st_unkn_atonic___4
st_unkn_atonic___6
st_unkn_epispasm___2
st_unkn_epispasm___3
st_unkn_epispasm___6
st_unkn_nonmotor___2
st_unkn_nonmotor___3
st_unkn_nonmotor___6
st_unclassified___1
st_unclassified___2
st_unclassified___3
st_unclassified___5
st_unclassified___6
prev_aeds_name___4
prev_aeds_name___8
prev_aeds_name___19
prev_aeds_name___22
prev_aeds_name___24
prev_aeds_name___26
epilepsy_history_complete

### Delete columns with a variance of 0

In [23]:
# Remove the zero-variance columns, then discard the list of their names
wide_df = wide_df.drop(columns=zero_variance_columns)

del zero_variance_columns

### Delete redundant columns

## Creating a single outcome variable

In [24]:
# Number of rows in which all four surg_engel___1..4 dummy columns equal 0
count_zero_rows = (
    wide_df[['surg_engel___1', 'surg_engel___2', 'surg_engel___3', 'surg_engel___4']]
    .eq(0)
    .all(axis=1)
    .sum()
)

# Display the count of rows with all 0 values
print("\nCount of rows with all 0 values:", count_zero_rows)


Count of rows with all 0 values: 99


## Remove rows with no engel outcomes (all columns with 0 values)

In [25]:
# The four Engel dummy columns, named once instead of repeating the list
engel_columns = ['surg_engel___1', 'surg_engel___2', 'surg_engel___3', 'surg_engel___4']

# Rows in which every Engel dummy column equals 0
no_outcome = (wide_df[engel_columns] == 0).all(axis=1)

# Keep the other rows. `.copy()` makes the result an independent frame, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
wide_df = wide_df[~no_outcome].copy()

# Count rows where all surg_engel___1, 2, 3, 4 variables are 0 (0 after the filter)
count_zero_rows = (wide_df[engel_columns] == 0).all(axis=1).sum()

# Display the count of rows with all 0 values
print("\nCount of rows with all 0 values:", count_zero_rows)

del engel_columns, no_outcome


Count of rows with all 0 values: 0


In [26]:
# Collapse the four Engel dummy columns into one integer column 'surg_engel':
# `idxmax` returns the name of the first column holding the row maximum, and the
# number after '___' in that name becomes the value.
# NOTE(review): if a row has more than one dummy set to 1, only the first is kept — confirm this cannot occur.
wide_df['surg_engel'] = (
    wide_df[['surg_engel___1', 'surg_engel___2', 'surg_engel___3', 'surg_engel___4']]
    .idxmax(axis=1)
    .str.split('___')
    .str.get(1)
    .astype(int)
)
wide_df[["record_id", "surg_engel", 'surg_engel___1', 'surg_engel___2', 'surg_engel___3', 'surg_engel___4']].head()

Unnamed: 0,record_id,surg_engel,surg_engel___1,surg_engel___2,surg_engel___3,surg_engel___4
0,RSCT000111,2,0.0,1.0,0.0,0.0
1,RSCT000208,3,0.0,0.0,1.0,0.0
2,RSCT000277,4,0.0,0.0,0.0,1.0
3,RSCT000508,1,1.0,0.0,0.0,0.0
4,RSCT000749,2,0.0,1.0,0.0,0.0


In [27]:
# Write the preprocessed dataframe to CSV. The row index is written as the first,
# unnamed column (index=True is the pandas default, spelled out here).
wide_df.to_csv("../../data/processed/preprocessed_df.csv", index=True)

## Print system information

In [28]:
import session_info

# Display the report produced by the session_info package for this notebook session
session_info.show()