In [1]:
# Directory where results are saved; passed as `output_dir` to CalvinStatsmodelsPalm below.
# NOTE(review): this is an absolute, user-specific path — change it before running on another machine.
out_dir = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/studies/atrophy_seeds_2023/Figures/supplement_parametric_regression_to_adascog'

Import Data

In [2]:
# Path to the spreadsheet to load (an .xlsx workbook here, despite the `csv` in the variable name)
input_csv_path = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/studies/cognition_2023/metadata/master_list_proper_subjects.xlsx'
sheet = 'master_list_proper_subjects' # Name of the worksheet to read; passed as `sheet` to CalvinStatsmodelsPalm below

In [None]:
from calvin_utils.permutation_analysis_utils.statsmodels_palm import CalvinStatsmodelsPalm
# Instantiate CalvinStatsmodelsPalm with the input spreadsheet, output directory and sheet name
cal_palm = CalvinStatsmodelsPalm(input_csv_path=input_csv_path, output_dir=out_dir, sheet=sheet)
# Call read_and_display_data; its return value is used as the working DataFrame (data_df) in the cells below
data_df = cal_palm.read_and_display_data()


# 01 - Preprocess Your Data

**Handle NANs**
- Set drop_nans=True if you would like to remove NaNs from the data
- Provide a column name or a list of column names to remove NaNs from

In [None]:
# Show the available column names so the NaN-drop list below can be chosen
data_df.columns

In [5]:
# Columns passed to drop_nans_from_columns below (rows with NaN in these columns are to be removed).
# NOTE(review): the analysis cell at the end of the notebook uses 'Z_Scored_Subiculum_Connectivity_T' as the
# moderator, which is not in this list — confirm whether it should be added here.
drop_list = ['Age', 'Z_Scored_Percent_Cognitive_Improvement', 'Hippocampus_GM_Vol', 'Z_Scored_Subiculum_T_By_Origin_Group_']

In [None]:
# Remove NaNs using the columns in drop_list; presumably returns the filtered DataFrame — verify against calvin_utils
data_df = cal_palm.drop_nans_from_columns(columns_to_drop_from=drop_list)
# Display the resulting DataFrame
data_df

**Drop Row Based on Value of Column**

Define the column, condition, and value for dropping rows
- column = 'your_column_name'
- condition = 'above'  # Options: 'equal', 'above', 'below'

In [None]:
# Show the available column names so the row-drop column below can be chosen
data_df.columns

Set the parameters for dropping rows

In [8]:
column = 'City'  # The column you'd like to evaluate
condition = 'equal'  # The condition to check ('equal', 'above', 'below', 'not')
value = 'Boston' # The value the condition is tested against; presumably rows meeting the condition are dropped — verify against drop_rows_based_on_value

In [None]:
# Apply the row filter defined above. Two objects are returned: presumably the retained rows (data_df)
# and the removed rows (other_df) — TODO confirm against calvin_utils.
data_df, other_df = cal_palm.drop_rows_based_on_value(column, condition, value)
display(data_df)

**Standardize Data**
- Enter the columns you don't want to standardize as a list

In [10]:
# Remove anything you don't want to standardize
# cols_not_to_standardize = ['Ordinal_Target_Type', 'Ordinal_Epilepsy_Type'] #['Age']

In [11]:
# data_df = cal_palm.standardize_columns(cols_not_to_standardize)
# data_df

# Mediated Moderation at any/all points along the path

**1) Total Mediated Moderation**

**Edwards and Lambert 2007, DOI: 10.1037/1082-989X.12.1.1**

Author - Calvin Howard


Model Structure:
```
               IV ------------------------> Outcome
             ->|                         | <-------Moderator
            |   --------MEDIATOR---------              |
            |              ^                           |
            |              |                           |
             ------------------------------------------
```
____
- Direct
  - IV to Outcome (Y = B1 + B2X + B3Z + B4XZ + error_B) 
- Indirect
  - IV through mediator to outcome 
    - First stage (Y = B1 + B2X + B3M + B4XM + B5Z + error_B) 
      - Where M = A0 + A1X + A2Z  + A3XZ + error_A
    - Second Stage (Y = B1 + B2X + B3M + B4Z + B5ZM + error_B) 
      - Where M = A0 + A1X
- Total
  - Influence of IV on Outcome through direct and indirect (all paths)
    - Combined accounting of all equations (EQUATION 1)
      - Y = B1 + B2X + B3(A0 + A1X + A2Z  + A3XZ + error_A) + B4X(A0 + A1X + A2Z  + A3XZ + error_A) + B5Z + B6XZ + error_B
- Moderation
  - Is occurring at the first stage (IV->M)
  - Is occurring at the second stage (M->Y)
  - Is occurring at the direct stage (IV->Y)
____
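- Estimation of Effects at a chosen moderator value Z (coefficient names follow the stage equations above)
  - Direct = B2 + B4Z, using the X and XZ coefficients of the direct equation
  - Indirect = (A1 + A3Z)(B3 + B5Z), i.e. first stage (X and XZ coefficients of the mediator equation) times second stage (M and ZM coefficients of the second-stage equation)
  - Total = Direct + Indirect = B2 + B4Z + (A1 + A3Z)(B3 + B5Z)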
____
- Estimation of Significance
  - Resampled bootstrap
  - p = proportion of bootstrap estimates whose sign is opposite to that of the mean bootstrap estimate (estimates of exactly zero are also counted)

In [17]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from tqdm import tqdm

class MediatedModerationAnalysis:
    """
    Bootstrap-based mediated moderation analysis with optional moderation at any
    combination of stages of the mediation process, including no moderation at all
    (simple mediation). The design follows the moderated path analysis framework of
    Edwards and Lambert (2007).

    **How It Works:**
    The model analyzes effects in this DAG, where you control where the moderator is applied.

                  ----Direct Moderation----------------
                  |                                    |
                  v                                    |
        IV -----------------------> DV                 |
        |                           ^                  |
        |                           | <--Second Stage--Moderator
        ---------> Mediator ---------                  |
             ^                                         |
             |                                         |
             ----------------First Stage----------------

    - **Model Fitting:**
        - **First-Stage Model (Mediator Model):**
            - Predicts the mediator (`M`) from the independent variable (`X`).
            - If `moderate_stage_1` is `True`, also includes the moderator (`Z`) and the
              `X:Z` interaction, allowing for moderation at the first stage (X → M).
        - **Outcome Model:**
            - Predicts the dependent variable (`Y`) from `X` and `M`.
            - If `moderate_stage_2` and/or `moderate_direct` is `True`, also includes `Z`
              plus the `M:Z` and/or `X:Z` interaction, allowing for moderation at the
              second stage (M → Y) and of the direct effect (X → Y), respectively.

    - **Effect Computation:**
        - **Direct Effect:** The X → Y coefficient of the outcome model (`c'`), including
          its moderated part when `moderate_direct` is `True`.
        - **Indirect Effect:** The product of the X → M path (`a`) and the M → Y path (`b`),
          each including its moderated part when the corresponding flag is `True`.
        - **Total Effect:** The sum of the direct and indirect effects.
        - A moderated path is evaluated as `main + interaction * Z`, where `Z` is
          `moderator_value`. If `moderator_value` is `None`, the interaction coefficient
          is added unscaled, which is numerically the same as evaluating at `Z = 1`.

    - **Bootstrapping:**
        - Resamples the rows of `dataframe` with replacement.
        - For each bootstrap sample, fits both models and computes the effects.
        - Reports the mean, the 2.5th/97.5th percentiles and a p-value (the proportion of
          bootstrap estimates whose sign is opposite to, or zero relative to, the mean
          bootstrap estimate) for each effect.

    **Parameters:**
    - **dataframe (pd.DataFrame):** The dataset containing all variables required for the analysis.
    - **iv (str):** Name of the independent variable (`X`).
    - **mediator (str):** Name of the mediator variable (`M`).
    - **moderator (str):** Name of the moderator variable (`Z`).
    - **dv (str):** Name of the dependent variable (`Y`).
    - **moderate_stage_1 (bool):** Whether to include moderation at the first stage (X → M). Default is `False`.
    - **moderate_stage_2 (bool):** Whether to include moderation at the second stage (M → Y). Default is `False`.
    - **moderate_direct (bool):** Whether to include moderation of the direct effect (X → Y). Default is `False`.
    - **moderator_value (float, optional):** The value of `Z` at which moderated paths are
      evaluated. If `None`, interaction coefficients are added unscaled (equivalent to `Z = 1`).

    **Usage Example:**

    ```python
    analysis = MediatedModerationAnalysis(
        dataframe=data_df,
        iv='Age',
        mediator='Entorhinal_Cortex_GM_Vol',
        moderator='Z_Scored_Subiculum_Connectivity_T',
        dv='Z_Scored_Percent_Cognitive_Improvement',
        moderate_stage_1=True,
        moderate_stage_2=False,
        moderate_direct=True,
        moderator_value=0  # Example value for Z
    )

    # Fit both models and compute the effects on the full sample
    analysis.fit_first_stage()
    analysis.fit_outcome_model()
    effects = analysis.compute_effects()
    print("Computed Effects:", effects)

    # Bootstrap the effects (pass random_state for reproducible resampling)
    analysis.bootstrap(n_bootstraps=10000, random_state=0)

    # Summarise the bootstrap results
    summary_df = analysis.summary()
    print(summary_df)
    ```

    **Methods:**

    - **fit_first_stage():** Fits the mediator model and stores the X → M path in `a_coeffs`.
    - **fit_outcome_model():** Fits the outcome model and stores the M → Y path in `b_coeffs`
      and the X → Y path in `c_prime_coeff`.
    - **compute_effects():** Computes the direct, indirect and total effects from the stored paths.
    - **bootstrap(n_bootstraps=10000, random_state=None, max_attempts=None):** Bootstraps the
      three effects and stores the statistics in `bootstrap_results`.
    - **summary():** Prints the stored paths, stores the bootstrap results as a DataFrame in
      `results` and returns it.
    - **run(n_bootstraps=10000, random_state=None):** Convenience wrapper for `bootstrap` + `summary`.

    **Additional Notes:**

    - **Flexibility:** Any combination of the three moderation flags is supported, including none.
    - **Data Requirements:** `dataframe` must contain all named variables. Column names are
      inserted verbatim into the model formula, so they must be valid formula identifiers.
    - **Statistical Assumptions:** The models are ordinary least squares regressions, so the usual
      linear-regression assumptions (linearity, homoscedasticity, normality of residuals) apply.

    **References:**

    - Edwards, J. R., & Lambert, L. S. (2007). **Methods for integrating moderation and mediation: A general analytical framework using moderated path analysis**. *Psychological Methods*, 12(1), 1–22. DOI: [10.1037/1082-989X.12.1.1](https://doi.org/10.1037/1082-989X.12.1.1)
    """
    def __init__(self, dataframe, iv, mediator, moderator, dv, moderate_stage_1=False, moderate_stage_2=False, moderate_direct=False, moderator_value=None):
        """
        Stores the data, variable names and moderation settings.

        Parameters:
        - dataframe: pandas DataFrame containing the data.
        - iv: str, name of the independent variable (X).
        - mediator: str, name of the mediator variable (M).
        - moderator: str, name of the moderator variable (Z).
        - dv: str, name of the dependent variable (Y).
        - moderate_stage_1: bool, whether to add moderation of IV->M
        - moderate_stage_2: bool, whether to add moderation of M->DV
        - moderate_direct: bool, whether to add moderation of IV->DV
        - moderator_value: optional, the value of Z at which moderated paths are evaluated.
            If None, interaction coefficients are added unscaled (equivalent to Z = 1).
        """
        self.dataframe = dataframe
        self.iv = iv                # Independent Variable (X)
        self.mediator = mediator    # Mediator Variable (M)
        self.moderator = moderator  # Moderator Variable (Z)
        self.dv = dv                # Dependent Variable (Y)

        # Moderation Value
        self.z_value = moderator_value

        # Moderation Instructions
        self.moderate_stage_1 = moderate_stage_1
        self.moderate_stage_2 = moderate_stage_2
        self.moderate_direct = moderate_direct

        # Path estimates, populated by fit_first_stage / fit_outcome_model
        self.a_coeffs = None        # IV -> M
        self.b_coeffs = None        # M -> DV
        self.c_prime_coeff = None   # IV -> DV (direct)

        # Placeholders kept for backward compatibility (not written by this class)
        self.stage_1_coeffs = None
        self.stage_2_coeffs = None
        self.direct_coeffs = None

        # Placeholders for storing results
        self.indirect_effect = None
        self.direct_effect = None
        self.total_effect = None
        self.bootstrap_results = None
        self.results = None

    def _at_moderator(self, main_coeff, interaction_coeff):
        """Returns `main_coeff + interaction_coeff * Z`, using Z = 1 when no moderator value was given."""
        z = 1 if self.z_value is None else self.z_value
        return main_coeff + interaction_coeff * z

    def fit_first_stage(self):
        """
        Fits the mediator model (IV → M) and stores the resulting path in `a_coeffs`.

        With `moderate_stage_1` the model is `M ~ X + Z + X:Z` and the path is the X
        coefficient plus the X:Z coefficient times the moderator value; otherwise the
        model is `M ~ X` and the path is the X coefficient.

        Returns:
        - model: Fitted statsmodels OLS regression model.
        """
        if self.moderate_stage_1:                                             # Moderate IV->MV
            iv_by_z = f"{self.iv}:{self.moderator}"
            formula = f"{self.mediator} ~ {self.iv} + {self.moderator} + {iv_by_z}"
            model = ols(formula, data=self.dataframe).fit()
            self.a_coeffs = self._at_moderator(model.params[self.iv], model.params[iv_by_z])
        else:                                                                 # Do not moderate IV->MV
            formula = f"{self.mediator} ~ {self.iv}"
            model = ols(formula, data=self.dataframe).fit()
            self.a_coeffs = model.params[self.iv]
        return model

    def fit_outcome_model(self):
        """
        Fits the outcome model and stores the M → DV path in `b_coeffs` and the
        IV → DV path in `c_prime_coeff`.

        The model always contains X and M. Z is added when either the second stage or
        the direct effect is moderated, together with the corresponding interaction(s).
        Both stored paths are always scalars.

        Returns:
        - model: Fitted statsmodels OLS regression model.
        """
        iv_by_z = f"{self.iv}:{self.moderator}"
        m_by_z = f"{self.mediator}:{self.moderator}"

        # Build the right-hand side from the moderation flags
        terms = [self.iv, self.mediator]
        if self.moderate_stage_2 or self.moderate_direct:
            terms.append(self.moderator)
        if self.moderate_direct:                                              # Moderate IV->DV
            terms.append(iv_by_z)
        if self.moderate_stage_2:                                             # Moderate M->DV
            terms.append(m_by_z)

        formula = f"{self.dv} ~ " + " + ".join(terms)
        model = ols(formula, data=self.dataframe).fit()
        params = model.params

        if self.moderate_stage_2:
            self.b_coeffs = self._at_moderator(params[self.mediator], params[m_by_z])
        else:
            self.b_coeffs = params[self.mediator]

        if self.moderate_direct:
            self.c_prime_coeff = self._at_moderator(params[self.iv], params[iv_by_z])
        else:
            self.c_prime_coeff = params[self.iv]
        return model

    def compute_effects(self):
        """
        Computes the direct, indirect, and total effects from the fitted paths.

        Requires `fit_first_stage()` and `fit_outcome_model()` to have been called.

        Returns:
        - dict containing 'indirect_effect', 'direct_effect', and 'total_effect'.

        Raises:
        - RuntimeError: if the models have not been fitted yet.
        """
        if self.a_coeffs is None or self.b_coeffs is None or self.c_prime_coeff is None:
            raise RuntimeError(
                "Models have not been fitted. Call fit_first_stage() and fit_outcome_model() first."
            )

        # Indirect effect: first stage (a path) times second stage (b path)
        self.indirect_effect = self.a_coeffs * self.b_coeffs
        # Direct effect: the IV -> DV path of the outcome model (c'), not the a path
        self.direct_effect = self.c_prime_coeff
        self.total_effect = self.indirect_effect + self.direct_effect

        return {
            'indirect_effect': self.indirect_effect,
            'direct_effect': self.direct_effect,
            'total_effect': self.total_effect
            }

    def bootstrap(self, n_bootstraps=10000, random_state=None, max_attempts=None):
        """
        Performs bootstrapping to estimate confidence intervals and p-values
        for the direct, indirect, and total effects.

        Resamples with invalid (NaN/inf) effects are discarded and redrawn. After
        resampling, the models are refitted on the original data so that the stored
        paths and effects describe the full sample rather than the last resample.

        Parameters:
        - n_bootstraps: int, number of valid bootstrap samples to collect (default is 10000).
        - random_state: optional int seed or numpy RandomState for reproducible resampling.
            If None, NumPy's global random state is used.
        - max_attempts: optional int, maximum number of resamples to draw, including
            discarded ones. Defaults to 10 * n_bootstraps.

        Returns:
        - dict containing bootstrap results for 'Indirect Effect', 'Direct Effect', and 'Total Effect'.

        Raises:
        - ValueError: if n_bootstraps is smaller than 1.
        - RuntimeError: if max_attempts is exhausted before enough valid samples are collected.
        """
        if n_bootstraps < 1:
            raise ValueError("n_bootstraps must be at least 1.")
        if max_attempts is None:
            max_attempts = 10 * n_bootstraps

        # A single generator is advanced across iterations; passing the raw seed to
        # every sample() call would reproduce the same resample each time.
        if random_state is None or isinstance(random_state, np.random.RandomState):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)

        indirect_effects = []
        direct_effects = []
        total_effects = []
        attempts = 0

        # The fit methods read self.dataframe, so it is swapped for each resample and
        # always restored, even if a fit raises.
        original_dataframe = self.dataframe
        try:
            with tqdm(total=n_bootstraps) as pbar:
                while len(indirect_effects) < n_bootstraps:
                    if attempts >= max_attempts:
                        raise RuntimeError(
                            f"Only {len(indirect_effects)} valid bootstrap samples were obtained "
                            f"in {attempts} attempts."
                        )
                    attempts += 1

                    # Resample the data with replacement
                    self.dataframe = original_dataframe.sample(frac=1, replace=True, random_state=rng)

                    # Fit models and compute effects using the existing methods
                    self.fit_first_stage()
                    self.fit_outcome_model()
                    effects = self.compute_effects()

                    if self._has_invalid_params():
                        continue

                    # Collect the effects
                    indirect_effects.append(effects['indirect_effect'])
                    direct_effects.append(effects['direct_effect'])
                    total_effects.append(effects['total_effect'])
                    pbar.update(1)
        finally:
            # Restore the original dataframe
            self.dataframe = original_dataframe

        # Refit on the original data so stored paths/effects are full-sample estimates
        self.fit_first_stage()
        self.fit_outcome_model()
        self.compute_effects()

        # Compute statistics
        results = {}
        for effect_name, effects_list in zip(
            ['Indirect Effect', 'Direct Effect', 'Total Effect'],
            [indirect_effects, direct_effects, total_effects]
        ):
            effects_array = np.array(effects_list)
            mean_effect = np.mean(effects_array)
            ci_lower = np.percentile(effects_array, 2.5)
            ci_upper = np.percentile(effects_array, 97.5)
            # Proportion of estimates with sign opposite to (or zero relative to) the mean
            p_value = np.mean(np.sign(mean_effect) * effects_array <= 0)
            results[effect_name] = {
                'Point Estimate': mean_effect,
                '2.5th Percentile': ci_lower,
                '97.5th Percentile': ci_upper,
                'P-value': p_value
            }

        self.bootstrap_results = results
        return results

    def _has_invalid_params(self, debug=False):
        """
        Checks if any of the computed effects contain NaNs or infinite values.

        Parameters:
        - debug: bool, if True, prints the effects and paths when an invalid value is found.

        Returns:
        - bool: True if invalid values are present, False otherwise.
        """
        effects = (self.indirect_effect, self.direct_effect, self.total_effect)
        if all(bool(np.all(np.isfinite(effect))) for effect in effects):
            return False

        if debug:
            print("Indirect effect: ", self.indirect_effect)
            print("Direct effect: ", self.direct_effect)
            print("Total effect: ", self.total_effect)
            print("First Stage Params: ", self.a_coeffs)
            print("Second Stage Params: ", self.b_coeffs)
            print("Direct Params: ", self.c_prime_coeff)
        return True

    def summary(self):
        """
        Prints the stored paths and returns a summary DataFrame of the bootstrap results.

        The DataFrame (one row per effect) is also stored in `self.results`.

        Returns:
        - pandas DataFrame with the bootstrap results, or None if bootstrap has not been run.
        """
        if self.bootstrap_results is None:
            print("No bootstrap results available. Please run the bootstrap method first.")
            return None
        print("\nIV->MV Effects: \n", self.a_coeffs)
        print("\nMV->DV Effects: \n", self.b_coeffs)
        print("\nIV->DV Effects: \n", self.c_prime_coeff)
        print("Done. Call the .results object to visualize your results.")
        self.results = pd.DataFrame(self.bootstrap_results).T
        return self.results

    def run(self, n_bootstraps=10000, random_state=None):
        """
        Runs the bootstrap and then the summary.

        Parameters:
        - n_bootstraps: int, number of bootstrap samples (default is 10000).
        - random_state: optional seed forwarded to `bootstrap`.

        Returns:
        - pandas DataFrame with the bootstrap results (also stored in `self.results`).
        """
        self.bootstrap(n_bootstraps=n_bootstraps, random_state=random_state)
        return self.summary()


In [None]:
# Show the available column names so the analysis variables below can be chosen
data_df.columns

In [None]:
# Seed NumPy's global random state so the bootstrap resampling (DataFrame.sample with no
# explicit random_state) gives the same results on every Restart & Run All.
BOOTSTRAP_SEED = 42
np.random.seed(BOOTSTRAP_SEED)

# Second-stage (M -> DV) moderation only, with effects evaluated at moderator value Z = 2.
# NOTE(review): given the 'Z_Scored_' prefix this is presumably 2 SD above the mean — confirm.
medmod = MediatedModerationAnalysis(
    dataframe=data_df, 
    iv='Hippocampus_GM_Vol', 
    mediator='Age', 
    moderator='Z_Scored_Subiculum_Connectivity_T', 
    dv='Z_Scored_Percent_Cognitive_Improvement', 
    moderate_stage_1=False, 
    moderate_stage_2=True, 
    moderate_direct=False, 
    moderator_value=2
    )
# Bootstrap the effects and build the summary table stored in medmod.results
medmod.run()
display(medmod.results)