In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.mediation import Mediation
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols


In [None]:
class Mediation:
    """Regression-based, one-mediator-at-a-time mediation analysis.

    For every column listed in ``NMR_atr`` the class fits ordinary least
    squares models relating the ``depression_atr`` predictors, that single
    mediator, and the ``CVD`` outcome, and derives indirect, direct and total
    effects from the fitted coefficients.

    NOTE: this class has the same name as
    ``statsmodels.stats.mediation.Mediation``, which the notebook's import
    cell also imports. Defining this class rebinds the name, so the
    statsmodels class is no longer reachable as ``Mediation`` afterwards.

    Attributes:
        data: The participant table as a pandas DataFrame.
        depression_atr: Predictor column names (set by ``preprocess_data``).
        CVD: Outcome column name (set by ``preprocess_data``).
        NMR_atr: Candidate mediator column names (set by ``preprocess_data``).
        mediation_results: Mapping ``mediator -> dict of effects`` or
            ``mediator -> "Error: ..."`` string (set by
            ``conduct_mediation_analysis``).
    """

    def __init__(self, data_file='cleaning data/participant_combined.csv'):
        """Load the participant table from a comma-delimited CSV file.

        Args:
            data_file: Path of the CSV file to read.
        """
        self.data = pd.read_csv(data_file, delimiter=',')
        # Filled in by conduct_mediation_analysis(); defined here so the
        # attribute always exists on a normally constructed instance.
        self.mediation_results = {}

    def preprocess_data(self):
        """Drop unused columns, normalise column names and standardise variables.

        Sets ``depression_atr``, ``CVD`` and ``NMR_atr`` and replaces the
        predictor, mediator and outcome columns of ``self.data`` with the
        output of ``StandardScaler.fit_transform``.

        Raises:
            KeyError: If any predictor, mediator or outcome column is absent
                from the data after renaming.
        """
        unused_columns = [
            'eid', 'Date_of_attending_assessment_centre', 'X34.0.0', 'Gender', 'Ethnicity',
            'Migrant.Status', 'TDI.Tertiles', 'Highest.Qualification', 'House.Ownership',
            'Income', 'Cohabiting', 'Living.Alone', 'AUDIT.Score', 'Smoker',
            'Moderate.Physical.Activity', 'Longstanding.Illness', 'Diabetes', 'Cancer',
        ]
        # errors='ignore' tolerates files in which some of these columns are absent.
        self.data = self.data.drop(columns=unused_columns, errors='ignore')

        # All replacements are literal (regex=False). Presumably the goal is
        # column names that can be used directly in the formula strings built
        # in conduct_mediation_analysis.
        cleaned_columns = self.data.columns
        for old, new in (('.', '_'), ('(', ''), (')', ''), ('-', '')):
            cleaned_columns = cleaned_columns.str.replace(old, new, regex=False)
        cleaned_columns = cleaned_columns.str.replace(
            'Remnant_Cholesterol_NonHDL,_NonLDL_Cholesterol',
            'Remnant_Cholesterol_NonHDL_NonLDL_Cholesterol',
            regex=False,
        )
        self.data.columns = cleaned_columns

        self.depression_atr = [
            'Depressed_At_Baseline', 'Loneliness', 'Social_Isolation', 'PHQ9_No_Info',
            'PHQ9_Screen', 'PHQ9_Items', 'PHQ9_Severity', 'CIDI_MDD_No_Info',
            'CIDI_MDD_Screen', 'CIDI_MDD_Response', 'CIDI_MDD_Severity', 'GAD_CIDI_Somatic'
        ]
        self.CVD = 'CVD'
        self.NMR_atr = [
            'Cholesterol_in_Chylomicrons_and_Extremely_Large_VLDL', 'Cholesterol_in_IDL', 'Cholesterol_in_Large_HDL', 'Cholesterol_in_Large_LDL', 'Cholesterol_in_Large_VLDL', 'Cholesterol_in_Medium_HDL', 'Cholesterol_in_Medium_LDL', 'Cholesterol_in_Medium_VLDL', 'Cholesterol_in_Small_HDL', 'Cholesterol_in_Small_LDL', 'Cholesterol_in_Small_VLDL', 'Cholesterol_in_Very_Large_HDL', 'Cholesterol_in_Very_Large_VLDL', 'Cholesterol_in_Very_Small_VLDL', 'Clinical_LDL_Cholesterol', 'HDL_Cholesterol', 'LDL_Cholesterol', 'Remnant_Cholesterol_NonHDL_NonLDL_Cholesterol', 'Total_Cholesterol', 'Total_Cholesterol_Minus_HDLC', 'Total_Concentration_of_Lipoprotein_Particles', 'Total_Esterified_Cholesterol', 'VLDL_Cholesterol',
            'Cholesteryl_Esters_in_Chylomicrons_and_Extremely_Large_VLDL', 'Cholesteryl_Esters_in_HDL', 'Cholesteryl_Esters_in_IDL', 'Cholesteryl_Esters_in_Large_HDL', 'Cholesteryl_Esters_in_Large_LDL', 'Cholesteryl_Esters_in_Large_VLDL', 'Cholesteryl_Esters_in_LDL', 'Cholesteryl_Esters_in_Medium_HDL', 'Cholesteryl_Esters_in_Medium_LDL', 'Cholesteryl_Esters_in_Medium_VLDL', 'Cholesteryl_Esters_in_Small_HDL', 'Cholesteryl_Esters_in_Small_LDL', 'Cholesteryl_Esters_in_Small_VLDL', 'Cholesteryl_Esters_in_Very_Large_HDL', 'Cholesteryl_Esters_in_Very_Large_VLDL', 'Cholesteryl_Esters_in_Very_Small_VLDL', 'Cholesteryl_Esters_in_VLDL',
            'Concentration_of_Chylomicrons_and_Extremely_Large_VLDL_Particles', 'Concentration_of_HDL_Particles', 'Concentration_of_IDL_Particles', 'Concentration_of_Large_HDL_Particles', 'Concentration_of_Large_LDL_Particles', 'Concentration_of_Large_VLDL_Particles', 'Concentration_of_LDL_Particles', 'Concentration_of_Medium_HDL_Particles', 'Concentration_of_Medium_LDL_Particles', 'Concentration_of_Medium_VLDL_Particles', 'Concentration_of_Small_HDL_Particles', 'Concentration_of_Small_LDL_Particles', 'Concentration_of_Small_VLDL_Particles', 'Concentration_of_Very_Large_HDL_Particles', 'Concentration_of_Very_Large_VLDL_Particles', 'Concentration_of_Very_Small_VLDL_Particles', 'Concentration_of_VLDL_Particles',
            'Free_Cholesterol_in_Chylomicrons_and_Extremely_Large_VLDL', 'Free_Cholesterol_in_HDL', 'Free_Cholesterol_in_IDL', 'Free_Cholesterol_in_Large_HDL', 'Free_Cholesterol_in_Large_LDL', 'Free_Cholesterol_in_Large_VLDL', 'Free_Cholesterol_in_LDL', 'Free_Cholesterol_in_Medium_HDL', 'Free_Cholesterol_in_Medium_LDL', 'Free_Cholesterol_in_Medium_VLDL', 'Free_Cholesterol_in_Small_HDL', 'Free_Cholesterol_in_Small_LDL', 'Free_Cholesterol_in_Small_VLDL', 'Free_Cholesterol_in_Very_Large_HDL', 'Free_Cholesterol_in_Very_Large_VLDL', 'Free_Cholesterol_in_Very_Small_VLDL', 'Free_Cholesterol_in_VLDL', 'Total_Free_Cholesterol',
            'Phospholipids_in_Chylomicrons_and_Extremely_Large_VLDL', 'Phospholipids_in_HDL', 'Phospholipids_in_IDL', 'Phospholipids_in_Large_HDL', 'Phospholipids_in_Large_LDL', 'Phospholipids_in_Large_VLDL', 'Phospholipids_in_LDL', 'Phospholipids_in_Medium_HDL', 'Phospholipids_in_Medium_LDL', 'Phospholipids_in_Medium_VLDL', 'Phospholipids_in_Small_HDL', 'Phospholipids_in_Small_LDL', 'Phospholipids_in_Small_VLDL', 'Phospholipids_in_Very_Large_HDL', 'Phospholipids_in_Very_Large_VLDL', 'Phospholipids_in_Very_Small_VLDL', 'Phospholipids_in_VLDL', 'Total_Phospholipids_in_Lipoprotein_Particles',
            'Total_Lipids_in_Chylomicrons_and_Extremely_Large_VLDL', 'Total_Lipids_in_HDL', 'Total_Lipids_in_IDL', 'Total_Lipids_in_Large_HDL', 'Total_Lipids_in_Large_LDL', 'Total_Lipids_in_Large_VLDL', 'Total_Lipids_in_LDL', 'Total_Lipids_in_Lipoprotein_Particles', 'Total_Lipids_in_Medium_HDL', 'Total_Lipids_in_Medium_LDL', 'Total_Lipids_in_Medium_VLDL', 'Total_Lipids_in_Small_HDL', 'Total_Lipids_in_Small_LDL', 'Total_Lipids_in_Small_VLDL', 'Total_Lipids_in_Very_Large_HDL', 'Total_Lipids_in_Very_Large_VLDL', 'Total_Lipids_in_Very_Small_VLDL', 'Total_Lipids_in_VLDL',
            'Total_Triglycerides', 'Triglycerides_in_Chylomicrons_and_Extremely_Large_VLDL', 'Triglycerides_in_HDL', 'Triglycerides_in_IDL', 'Triglycerides_in_Large_HDL', 'Triglycerides_in_Large_LDL', 'Triglycerides_in_Large_VLDL', 'Triglycerides_in_LDL', 'Triglycerides_in_Medium_HDL', 'Triglycerides_in_Medium_LDL', 'Triglycerides_in_Medium_VLDL', 'Triglycerides_in_Small_HDL', 'Triglycerides_in_Small_LDL', 'Triglycerides_in_Small_VLDL', 'Triglycerides_in_Very_Large_HDL', 'Triglycerides_in_Very_Large_VLDL', 'Triglycerides_in_Very_Small_VLDL', 'Triglycerides_in_VLDL',
            'Apolipoprotein_A1', 'Apolipoprotein_B', 'Average_Diameter_for_HDL_Particles', 'Average_Diameter_for_LDL_Particles', 'Average_Diameter_for_VLDL_Particles', 'Glycoprotein_Acetyls'
        ]

        # NOTE(review): the outcome column is standardised together with the
        # predictors and mediators — confirm this is intended if CVD is a
        # binary indicator.
        scaled_columns = self.depression_atr + self.NMR_atr + [self.CVD]
        scaler = StandardScaler()
        self.data[scaled_columns] = scaler.fit_transform(self.data[scaled_columns])

    def conduct_mediation_analysis(self, output_file='mediation_results.csv'):
        """Estimate mediation effects for every mediator and save them to CSV.

        Three OLS models are involved:

        * total model:    ``CVD ~ predictors``
        * mediator model: ``mediator ~ predictors``
        * direct model:   ``CVD ~ predictors + mediator``

        With ``a`` the predictor coefficients of the mediator model and ``b``
        the mediator coefficient of the direct model, the stored effects are
        ``indirect_effect = a * b``, ``direct_effect`` (predictor coefficients
        of the direct model) and ``total_effect`` (predictor coefficients of
        the total model). A mediator whose models cannot be fitted is recorded
        as the string ``"Error: <message>"`` instead of aborting the run.

        The per-mediator results are kept on ``self.mediation_results`` so
        later cells can inspect them.

        Args:
            output_file: Path of the CSV file to write. Columns are
                ``Mediator``, ``Effect_Type``, ``Index`` and ``Value``.

        Returns:
            The long-format pandas DataFrame that was written to
            ``output_file``.
        """
        predictors = " + ".join(self.depression_atr)
        n_predictors = len(self.depression_atr)

        # The total-effect model does not involve the mediator, so fit it once
        # rather than once per mediator. A failure here is reported for every
        # mediator, which is what refitting inside the loop would produce.
        total_effect = None
        total_error = None
        try:
            total_model = ols(f'{self.CVD} ~ {predictors}', data=self.data).fit()
            # Position 0 is skipped: it is the intercept the formula interface
            # adds by default.
            total_effect = total_model.params.iloc[1:]
        except Exception as e:
            total_error = f"Error: {e}"

        mediation_results = {}
        for mediator in self.NMR_atr:
            if total_error is not None:
                mediation_results[mediator] = total_error
                continue
            try:
                # Mediator as the dependent variable.
                mediator_model = ols(f'{mediator} ~ {predictors}', data=self.data).fit()
                # Outcome on both the predictors and the mediator.
                direct_model = ols(f'{self.CVD} ~ {predictors} + {mediator}', data=self.data).fit()

                a = mediator_model.params.iloc[1:]      # predictors -> mediator
                b = direct_model.params[mediator]       # mediator -> outcome
                # Predictor coefficients in the presence of the mediator; the
                # mediator term comes after them in the formula.
                c_prime = direct_model.params.iloc[1:n_predictors + 1]

                mediation_results[mediator] = {
                    'indirect_effect': a * b,
                    'direct_effect': c_prime,
                    # Copy so each mediator owns an independent Series.
                    'total_effect': total_effect.copy(),
                }
            except Exception as e:
                mediation_results[mediator] = f"Error: {e}"

        # Expose the raw results before any file I/O can fail.
        self.mediation_results = mediation_results

        # Flatten to one row per (mediator, effect type, coefficient name).
        rows = []
        for mediator, result in mediation_results.items():
            if not isinstance(result, dict):
                rows.append([mediator, 'error', None, result])
                continue
            for effect_type, effect_values in result.items():
                if isinstance(effect_values, pd.Series):
                    rows.extend(
                        [mediator, effect_type, index, value]
                        for index, value in effect_values.items()
                    )
                else:
                    rows.append([mediator, effect_type, None, effect_values])

        results_df = pd.DataFrame(rows, columns=['Mediator', 'Effect_Type', 'Index', 'Value'])
        results_df.to_csv(output_file, index=False)
        print(f"Mediation results saved to {output_file}")
        return results_df



In [None]:
# Run the whole pipeline with default paths: load the participant CSV,
# preprocess it, then fit the per-mediator models and write the results CSV.
# `mediator_analysis` is left as a notebook-level variable.
if __name__ == "__main__":
    mediator_analysis = Mediation()
    mediator_analysis.preprocess_data()
    mediator_analysis.conduct_mediation_analysis()

Top Mediators: rank mediators by the summed absolute value of their indirect effects and save the ten largest.

In [6]:
def rank_top_mediators(results, top_n=10):
    """Rank mediators by the summed absolute value of their indirect effects.

    Args:
        results: Long-format DataFrame with at least the columns
            ``Mediator``, ``Effect_Type`` and ``Value``, i.e. the layout
            written by ``Mediation.conduct_mediation_analysis``. Only rows
            whose ``Effect_Type`` is ``'indirect_effect'`` are used, so error
            rows are ignored.
        top_n: Number of mediators to keep.

    Returns:
        DataFrame with columns ``Mediator`` and ``Indirect_Effect_Magnitude``,
        sorted by magnitude in descending order and truncated to ``top_n``.
    """
    indirect = results.loc[results['Effect_Type'] == 'indirect_effect', ['Mediator', 'Value']]
    # `Value` is read back as text when the file also contains error rows,
    # so convert explicitly before doing arithmetic.
    magnitude = (
        pd.to_numeric(indirect['Value'], errors='coerce')
        .abs()
        .groupby(indirect['Mediator'], sort=False)
        .sum()
    )
    ranked = magnitude.sort_values(ascending=False, kind='stable').head(top_n)
    return ranked.rename_axis('Mediator').reset_index(name='Indirect_Effect_Magnitude')


if __name__ == "__main__":
    RESULTS_FILE = 'mediation_results.csv'

    mediator_analysis = Mediation()
    mediator_analysis.preprocess_data()
    mediator_analysis.conduct_mediation_analysis(output_file=RESULTS_FILE)

    # Rank from the saved results file instead of an instance attribute, so
    # this cell relies only on what conduct_mediation_analysis writes to disk.
    top_10_df = rank_top_mediators(pd.read_csv(RESULTS_FILE), top_n=10)
    top_10_df.to_csv('top_10_mediators.csv', index=False)

    print("Top 10 mediators saved to 'top_10_mediators.csv'")