# Data Summary + Analysis

Import libraries/packages + cleaned data

In [1]:
import sys
import os
sys.path.append(os.path.abspath('../'))
from src.feature_lists import get_feature_lists

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy.stats import chi2_contingency, fisher_exact, mannwhitneyu
from statsmodels.api import Logit
from statsmodels.tools import add_constant
import sigfig


from src.config import SEED
# Load the cleaned ORN dataset from parquet into `df`; the relative path is
# resolved against the notebook's current working directory.
df = pd.read_parquet('../data/raw/Cleaned_ORN.parquet')

Classify features by data type

In [None]:
# get_feature_lists (imported from src.feature_lists) returns a mapping that
# holds one list of column names per data type; unpack the four lists used below.
feature_lists = get_feature_lists()
binary_cols, numerical_cols, nominal_cols, ordinal_cols = (
    feature_lists[key]
    for key in ("binary_cols", "numerical_cols", "nominal_cols", "ordinal_cols")
)

# Train-Test split

In [None]:
#### General Split#######
# 75-25 train-test split, stratified on ORN so both splits keep the same
# ORN : No-ORN ratio. SEED makes the split reproducible.
raw_X_train, raw_X_test, raw_y_train, raw_y_test = train_test_split(
    df.drop(['ORN'], axis = 1),
    df['ORN'],
    test_size=0.25,
    random_state = SEED,
    stratify=df.ORN,
)

# Reset the index of every split to 0..n-1 (the shuffled original index is dropped)
raw_X_train = raw_X_train.reset_index(drop = True)
raw_y_train = raw_y_train.reset_index(drop = True)
raw_X_test = raw_X_test.reset_index(drop = True)
raw_y_test = raw_y_test.reset_index(drop = True)

# Export raw train/test splits.
# The writers below do not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
os.makedirs('../data/raw/split', exist_ok=True)
raw_X_train.to_parquet('../data/raw/split/Raw_X_train.parquet')
raw_X_test.to_parquet('../data/raw/split/Raw_X_test.parquet')
raw_y_train.to_excel('../data/raw/split/Raw_y_train.xlsx', index = True)
raw_y_test.to_excel('../data/raw/split/Raw_y_test.xlsx', index = True)


In [None]:
# Sanity check: class balance of the target and shape of the feature matrix per split
for split_label, y_split, X_split in (
    ("Full train: ", raw_y_train, raw_X_train),
    ("Full test: ", raw_y_test, raw_X_test),
):
    print(split_label, y_split.value_counts())
    print(X_split.shape)

# Summary across Train and Test sets

In [None]:
def generate_summary_column(df, ORN_type, all_categories):
    """Build one descriptive-statistics column for a cohort.

    Parameters
    ----------
    df : pandas.DataFrame
        Cohort to summarise. A column named 'ORN' is skipped.
    ORN_type : str
        Cohort label; the value column is named '<ORN_type> (n=<number of rows>)'.
    all_categories : mapping
        For each categorical column, the entries that get a row in the output,
        in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        One 'Feature' column plus the cohort value column.
        * Columns listed in binary_cols / nominal_cols / ordinal_cols: an empty
          header row, then 'count (percent)' per entry. Percentages are taken
          over all rows, missing values included.
        * Any other column: 'mean ± SD' computed on the non-missing values.
        * Every summarised column ends with a 'Missing' row: 'count (percent)'.
    """
    n_rows = len(df)
    header = f'{ORN_type} (n={n_rows})'
    rows = []

    def add_row(feature_label, value):
        # Append one output row to the summary.
        rows.append({'Feature': feature_label, header: value})

    def add_missing_row(column):
        # Count and share of null entries in `column`.
        n_missing = df[column].isnull().sum()
        pct_missing = np.round(n_missing / n_rows * 100, 1)
        add_row(f'{column.upper()} Missing', f'{n_missing} ({pct_missing})')

    for col in df:
        if col == 'ORN':
            continue
        label = col.upper()
        if col in binary_cols + nominal_cols + ordinal_cols:
            # Categorical: header row followed by one row per known entry
            counts = df[col].value_counts(dropna=False)
            percentages = np.round(df[col].value_counts(normalize=True, dropna=False) * 100, 1)
            add_row(label, '')
            for entry in all_categories[col]:
                # Entries absent from this cohort are reported as 0 (0.0)
                add_row(f'{label} {entry}',
                        f'{counts.get(entry, 0)} ({percentages.get(entry, 0.0)})')
        else:
            # Numerical: mean and SD on the observed values only
            observed = df[col].dropna()
            avg = np.round(observed.mean(), 1)
            std = np.round(observed.std(), 1)
            add_row(f'{label}, ± (SD)', f'{avg} ± {std}')
        add_missing_row(col)
    return pd.DataFrame(rows)

def generate_summary_table(X_df, y_df,all_categories):
    """Summarise the features for the whole cohort and for each ORN group.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature columns.
    y_df : pandas.Series
        Outcome; it must appear as a column named 'ORN' after concatenation,
        with 0 marking Non-ORN rows and 1 marking ORN rows.
    all_categories : mapping
        Passed through to generate_summary_column.

    Returns
    -------
    pandas.DataFrame
        'Feature' followed by the 'Total', 'Non-ORN' and 'ORN' value columns.
    """
    merged = pd.concat([X_df, y_df], axis = 1)  # features + ORN column
    cohorts = (
        ('Total', merged),
        ('Non-ORN', merged[merged['ORN'] == 0]),
        ('ORN', merged[merged['ORN'] == 1]),
    )
    total_col, *group_cols = [
        generate_summary_column(frame, cohort_name, all_categories)
        for cohort_name, frame in cohorts
    ]
    # Keep 'Feature' only once (from the Total column) for readability
    pieces = [total_col] + [group.drop(columns='Feature') for group in group_cols]
    return pd.concat(pieces, axis = 1)

#### Get all unique entries for each categorical variable
# The union over train and test guarantees both summary tables have the same
# rows, even when an entry occurs in only one of the splits.
all_categories = {}
for col in nominal_cols + binary_cols + ordinal_cols:
    train_categories = raw_X_train[col].unique()
    test_categories = raw_X_test[col].unique()
    all_categories[col] = np.union1d(train_categories, test_categories)


# Generate summary tables for training and testing data
train_summary = generate_summary_table(raw_X_train, raw_y_train, all_categories)
# 'Feature' is kept only once (from the train table) in the side-by-side view
test_summary = generate_summary_table(raw_X_test, raw_y_test, all_categories).drop(columns='Feature')

# Combine the two tables side by side
combined_summary = pd.concat([train_summary.add_suffix(' (Train)'), test_summary.add_suffix(' (Test)')], axis=1)

# Display the combined summary table
display(combined_summary)
# to_excel does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
os.makedirs('../results/tables', exist_ok=True)
combined_summary.to_excel('../results/tables/SummaryTable.xlsx' ,index=False)

# Univariable Analysis across ORN and non-ORN cohorts

Impute missing values in the numerical features (iterative imputation)

In [None]:
# Work on a copy so `df` itself keeps its missing values
df_impute = df.copy()
# Only the numerical columns are imputed; SEED fixes the random draws made
# because sample_posterior=True.
imputer = IterativeImputer(sample_posterior=True, random_state=SEED)
imputed_values = imputer.fit_transform(df_impute[numerical_cols])
df_impute[numerical_cols] = imputed_values

Check whether any expected cell frequency is < 5 --> if so, use Fisher's exact test instead of the chi-squared test

In [None]:
# Collect the binary features whose feature-vs-ORN contingency table has at
# least one expected cell count below 5.
fish_list = []
divider = '-'*40
print(divider)
print('Relevant Contingency Tables:')
for col in (name for name in df_impute if name in binary_cols):
    contingency_table = pd.crosstab(df_impute[col], df_impute['ORN'])
    # The last value returned by chi2_contingency is the expected-frequency table
    *_, expected_frequencies = chi2_contingency(contingency_table)
    if not (expected_frequencies < 5).any():
        continue
    fish_list.append(col)
    print(col)
    print('-'*5)
    print(expected_frequencies)
print(divider)
print('Features with an expected frequency <5 :')
print(fish_list)
print(divider)

In [None]:
# Descriptive summary of the full (imputed) dataset, indexed by 'Feature' so the
# analysis values can be joined onto it.
summary_df = generate_summary_table(
    df_impute.drop(columns='ORN'),
    df_impute['ORN'],
    all_categories,
).set_index('Feature')

In [None]:
def get_odds_w_contingency(contingency_table):
    """Odds ratio and 95% confidence interval from a 0/1 x 0/1 contingency table.

    Parameters
    ----------
    contingency_table : pandas.DataFrame
        Table whose row and column labels are 0 and 1. Cells are read as
        a = table.loc[1, 1], b = table.loc[1, 0],
        c = table.loc[0, 1], d = table.loc[0, 0].
        A cell whose row or column label is absent counts as 0.

    Returns
    -------
    tuple
        (odds_ratio, ci_lower, ci_upper) where odds_ratio = (a*d) / (b*c) and the
        interval is exp(log(OR) ± 1.96 * sqrt(1/a + 1/b + 1/c + 1/d)).
    """
    def cell(row, col):
        # Look the cell up only when both labels exist; otherwise treat it as empty.
        present = row in contingency_table.index and col in contingency_table.columns
        return contingency_table.loc[row, col] if present else 0

    a, b, c, d = cell(1, 1), cell(1, 0), cell(0, 1), cell(0, 0)

    # Continuity correction: with any empty cell, add 0.5 to every cell so the
    # ratio and its standard error stay finite.
    if 0 in (a, b, c, d):
        a, b, c, d = (count + 0.5 for count in (a, b, c, d))

    odds_ratio = (a*d) / (b*c)
    log_or = np.log(odds_ratio)
    margin = 1.96 * np.sqrt(1/a + 1/b + 1/c + 1/d)  # 1.96 -> 95% CI
    return odds_ratio, np.exp(log_or - margin), np.exp(log_or + margin)

def format_p_val(p_val):
    """Format a p-value for the results table.

    Returns the string '<0.0001' for values below 0.0001, the value rounded to
    two significant figures (via sigfig.round) for values below 0.05, and the
    value rounded to one decimal place otherwise.
    """
    if p_val < .0001:
        return '<0.0001'
    if p_val < 0.05:
        return sigfig.round(p_val, sigfigs=2)
    return round(p_val, 1)

# Univariable association of every feature with ORN: one odds ratio with 95% CI
# and one p-value per feature (or per category level), joined onto summary_df.
results = []
for col in df_impute.drop('ORN', axis = 1): #Loop through all columns except ORN
    # generate_summary_column upper-cases every 'Feature' label and the final join
    # matches on that label, so every result label is built from the upper-cased
    # name as well. Otherwise rows of mixed/lower-case columns would silently get
    # no statistics.
    label = col.upper()
    ########## Binary Columns ##########
    if col in binary_cols:
        ### p-vals ###
        contingency_table = pd.crosstab(df_impute[col], df_impute['ORN'])
        if col in fish_list:
            ## Fisher's exact test when an expected frequency is < 5
            _, p_value = fisher_exact(contingency_table)
        else:
            ## Otherwise chi-squared test
            _, p_value, _, _ = chi2_contingency(contingency_table)
        p_value = format_p_val(p_value)
        ### Odds Ratios ###
        odds_ratio, ci_lower, ci_upper = get_odds_w_contingency(contingency_table)
        odds_conf = f'{odds_ratio:.2f} ({ci_lower:.2f}, {ci_upper:.2f})'
        results.append((label, odds_conf, p_value))
    ########## Numerical Columns ##########
    elif col in numerical_cols:
        ### Mann-Whitney U test for p-vals ###
        group1 = df_impute[df_impute['ORN'] == 0][col]
        group2 = df_impute[df_impute['ORN'] == 1][col]
        _, p_value = mannwhitneyu(group1, group2, alternative='two-sided')
        p_value = format_p_val(p_value)
        ### Logit for ORs and CIs ###
        model = Logit(df_impute['ORN'], add_constant(df_impute[col])).fit(disp=0)
        odds_ratio = np.exp(model.params[col])
        conf_int = model.conf_int().loc[col].apply(np.exp)
        ci_lower = conf_int[0]
        ci_upper = conf_int[1]
        odds_conf = f'{odds_ratio:.2f} ({ci_lower:.2f}, {ci_upper:.2f})'
        results.append((label + ', ± (SD)', odds_conf, p_value))
    ########## Nominal + Ordinal Columns ##########
    elif col in nominal_cols + ordinal_cols:
        # Get list of entries and value counts
        entries = sorted(df_impute[col].unique())
        val_counts = df_impute[col].value_counts()
        # Reference level: the most frequent entry for nominal features,
        # the lowest value for ordinal features
        reference_entry = val_counts.idxmax() if col in nominal_cols else entries[0]
        # One-hot encode the feature and drop the reference level's dummy column
        temp_df = pd.get_dummies(df_impute[[col, 'ORN']], columns=[col], drop_first=False, dtype=int)
        temp_df = temp_df.drop(columns=f'{col}_{reference_entry}')
        ### Logit for p-vals, ORs, and CIs ###
        X = add_constant(temp_df.drop('ORN', axis = 1))
        y = temp_df['ORN']
        model = Logit(y, X).fit(disp=0)
        odds_ratios = np.exp(model.params)
        conf_int = np.exp(model.conf_int())
        p_values = model.pvalues
        # Add variable header to df
        results.append((label, '', ''))

        # Loop through each possible entry of the feature
        for entry in entries:
            entry_name = f'{col}_{entry}' #Dummy column name, ex) 1.0 --> SITE_1.0
            if entry == reference_entry:
                results.append((f'{label} {entry}', 'Reference', 'Reference'))
            else:
                # Not the reference: report this level's OR, CI and p-value
                p_val_specific = format_p_val(p_values[entry_name])
                odds_ratio = odds_ratios.loc[entry_name]
                ci_lower = conf_int.loc[entry_name, 0]
                ci_upper = conf_int.loc[entry_name, 1]
                results.append((f'{label} {entry}', f'{odds_ratio:.2f} ({ci_lower:.2f}, {ci_upper:.2f})', p_val_specific))

results_df = pd.DataFrame(results, columns=['Feature', 'Odds Ratio (95% CI)', 'P-Value',]).set_index('Feature')
# Left join keeps every summary row; rows without statistics are left blank
final_table = summary_df.join(results_df, how='left').fillna('')
display(final_table)
# to_excel does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
os.makedirs('../results/tables', exist_ok=True)
final_table.to_excel('../results/tables/AnalysisTable.xlsx')