Imports

In [None]:
import pandas as pd

In [None]:
# Load the subject-level metadata table (age and atrophy index) from disk.
df = pd.read_csv(
    filepath_or_buffer='/Users/cu135/Dropbox (Partners HealthCare)/resources/datasets/BIDS_PD_DBS_STN_WURZBURG/metadata/subject_age_and_atrophy_index.csv',
)

# 01 - Find Instrumental Variables

In [None]:
import pandas as pd
import scipy.stats as stats
from tqdm import tqdm
import numpy as np

class CorrelationFilter:
    """
    Screen the columns of a DataFrame for candidate instrumental variables.

    Every pair of columns is correlated. A column is kept when its correlation
    with the independent variable is significant (p <= alpha) and its
    correlation with the dependent variable is not (p > alpha).

    Parameters:
        df (pd.DataFrame): The input DataFrame containing variables to be correlated.
        method (str): Name of the correlation method, either 'pearson' or 'spearman'.
        independent_variable (str): The name of the independent variable for filtering.
        dependent_variable (str): The name of the dependent variable for filtering.

    Methods:
        calculate_correlations(): Calculate correlation coefficients and p-values for all variable pairs.
        filter_correlations(): Filter rows based on p-values for the independent and dependent variables.

    Example:
        corr_filter = CorrelationFilter(df, 'spearman', 'Independent_Variable', 'Dependent_Variable')
        filtered_p_values, filtered_correlations = corr_filter.filter_correlations()
    """

    def __init__(self, df, method, independent_variable, dependent_variable):
        """
        Initialize the CorrelationFilter class with the input DataFrame and settings.

        Args:
            df (pd.DataFrame): The input DataFrame containing variables to be correlated.
            method (str): 'pearson' (uses stats.pearsonr) or 'spearman' (uses stats.spearmanr).
            independent_variable (str): The name of the independent variable for filtering.
            dependent_variable (str): The name of the dependent variable for filtering.

        Raises:
            ValueError: If `method` is neither 'pearson' nor 'spearman'.
        """
        self.df = df
        if method == 'pearson':
            self.method = stats.pearsonr
            print('Running pearson correlation. Setting NaN and Inf values to 0')
            # fillna/replace return new frames, so the caller's DataFrame is left untouched.
            self.df = self.df.fillna(0).replace([np.inf, -np.inf], 0)
        elif method == 'spearman':
            self.method = stats.spearmanr
        else:
            raise ValueError(f"Unknown method {method}, choose 'pearson' or 'spearman'")
        self.independent_variable = independent_variable
        self.dependent_variable = dependent_variable

    def calculate_correlations(self):
        """
        Calculate correlation coefficients and p-values for all variable pairs in the DataFrame.

        Pairs for which the correlation function raises are recorded as NaN.
        The diagonal (a column against itself) is never computed and stays NaN.

        Returns:
            correlation_matrix (pd.DataFrame): Symmetric float DataFrame of correlation coefficients.
            p_value_matrix (pd.DataFrame): Symmetric float DataFrame of p-values for the correlations.
        """
        columns = self.df.columns
        num_cols = len(columns)

        # Accumulate into float arrays so the resulting DataFrames are numeric, not object-dtype.
        corr_values = np.full((num_cols, num_cols), np.nan)
        p_values = np.full((num_cols, num_cols), np.nan)

        # Only the upper triangle is computed; each result is mirrored into the lower triangle.
        for i in tqdm(range(num_cols)):
            col1 = self.df.iloc[:, i]
            for j in range(i + 1, num_cols):
                col2 = self.df.iloc[:, j]
                try:
                    corr, p_value = self.method(col1, col2)
                    corr, p_value = float(corr), float(p_value)
                except Exception:
                    # Best effort: a pair that cannot be correlated is recorded as NaN.
                    # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
                    corr, p_value = np.nan, np.nan
                corr_values[i, j] = corr_values[j, i] = corr
                p_values[i, j] = p_values[j, i] = p_value

        correlation_matrix = pd.DataFrame(corr_values, index=columns, columns=columns)
        p_value_matrix = pd.DataFrame(p_values, index=columns, columns=columns)
        return correlation_matrix, p_value_matrix

    def filter_correlations(self, alpha=0.05):
        """
        Filter rows based on p-values for the independent and dependent variables.

        Args:
            alpha (float): Significance threshold. A row is kept when its p-value against the
                independent variable is <= alpha and against the dependent variable is > alpha.
                Defaults to 0.05.

        Returns:
            filtered_p_matrix (pd.DataFrame): Rows of the p-value matrix that meet the criteria.
            filtered_correlation_matrix (pd.DataFrame): Correlation coefficients of those rows
                against the independent and dependent variables.

        Raises:
            KeyError: If the independent or dependent variable is not a column of the DataFrame.
        """
        # Fail fast: check the column names before the expensive pairwise computation.
        missing = [
            name
            for name in (self.independent_variable, self.dependent_variable)
            if name not in self.df.columns
        ]
        if missing:
            raise KeyError(f"Column(s) not found in DataFrame: {missing}")

        # Calculate correlations
        correlation_matrix, p_value_matrix = self.calculate_correlations()

        # Filter rows based on p-values for independent and dependent variables
        independent_variable_p_values = p_value_matrix.loc[:, self.independent_variable]
        dependent_variable_p_values = p_value_matrix.loc[:, self.dependent_variable]

        # NaN p-values compare False on both sides, so such rows are dropped.
        filtered_p_matrix = p_value_matrix[
            (independent_variable_p_values <= alpha) & (dependent_variable_p_values > alpha)
        ]

        # Use the surviving row labels to select the matching correlation coefficients
        filtered_indices = filtered_p_matrix.index
        filtered_correlation_matrix = correlation_matrix.loc[
            filtered_indices, [self.independent_variable, self.dependent_variable]
        ]

        return filtered_p_matrix, filtered_correlation_matrix

# Usage example:
# corr_filter = CorrelationFilter(df, 'spearman', 'Independent_Variable', 'Dependent_Variable')
# filtered_p_values, filtered_correlations = corr_filter.filter_correlations()


Display Variables

In [None]:
# Show every column name so the variables of interest can be picked below.
list(df.columns)

Choose an independent variable and a dependent variable

The independent variable is the thing which causes the dependent variable. 

Thus, in this example, if we are interested in the causal effect of age on the pattern of atrophy in a human brain, define age as the independent variable and some metric of atrophy pattern as the dependent variable.

In [None]:
# Column names, in `df`, of the presumed cause and of the outcome.
independent_variable, dependent_variable = "Age", "Atrophy Pattern Index"

How would you like to assess the variables?

correlation methods:

'pearson' - more affected by outliers, which can produce significant results driven by a few extreme values. Appropriate for normally distributed data.

'spearman' - less affected by outliers, which mitigates their influence on the result. Appropriate for non-normal data.

In [None]:
# Name of the correlation test to use: "pearson" or "spearman".
correlation_method = "pearson"

Find Instrumental Variables

In [None]:
# Build the filter from the selections made above.
corr_filter = CorrelationFilter(
    df=df,
    method=correlation_method,
    independent_variable=independent_variable,
    dependent_variable=dependent_variable,
)

# filter_correlations returns the surviving rows of the p-value matrix and
# their correlation coefficients, in that order.
(
    filtered_p_values_df,
    filtered_correlation_values_df,
) = corr_filter.filter_correlations()

Display p-Value Results

In [None]:
# p-values of each candidate against the dependent and independent variables.
filtered_p_values_df[[dependent_variable, independent_variable]]

In [None]:
# Correlation coefficients of each candidate against the independent and dependent variables.
filtered_correlation_values_df

# 02 - Run Instrumental Variable Analysis

Define Instrumental Variables in a List

instr_var_list = ['Parietal Atrophy', 'Frontal Atrophy']

instr_var_list = filtered_p_values_df.index.to_list() <--- this will use all possible instrumental variables.

In [None]:
# Use every candidate that passed the correlation filter as an instrument.
instr_var_list = filtered_p_values_df.index.tolist()

# To hand-pick instruments instead, assign an explicit list, e.g.:
# instr_var_list = ['Parietal']
# Other candidate names: 'Limbic Atrophy', 'Right ACgG anterior cingulate gyrus',
# 'Left MFC medial frontal cortex', 'Right POrG posterior orbital gyrus',
# 'Left SCA subcallosal area', 'Left SPL superior parietal lobule',
# 'Left TTG transverse temporal gyrus'

Define Variables

Run Instrumental Variable Analysis

In [None]:
from statsmodels.sandbox.regression.gmm import IV2SLS

# Two-stage least squares needs at least one instrument. Stop with a clear
# message when the correlation filter returned no candidates.
if not instr_var_list:
    raise ValueError(
        "instr_var_list is empty: no instrumental variables were selected, "
        "so the IV model cannot be fit."
    )

# NOTE(review): no constant column is added to exog or to the instruments, so
# the model is fit without an intercept — confirm this is intended.
# NOTE(review): this uses the raw `df`, not the NaN/Inf-filled copy that
# CorrelationFilter builds for 'pearson' — confirm the columns have no missing values.
iv_model = IV2SLS(
    endog=df[dependent_variable],
    exog=df[independent_variable],
    instrument=df[instr_var_list],
)
iv_results = iv_model.fit()

# Display the summary
print(iv_results.summary2())