In [1]:
import pandas as pd
import glob
import numpy as np

In [2]:
import numpy as np
import pandas as pd

def clean_NMR(df_NMR, threshold=5):
    # Removing the first column (likely an index or unnecessary column)
    df_NMR = df_NMR.iloc[:, 1:]

    # Extract and keep the 'eid' column
    eid_col = df_NMR.iloc[:, 0]
    df_NMR = df_NMR.iloc[:, 1:]

    # Remove columns related to visits 2, 3, and 4
    words_to_remove = ['Visit2', 'Visit3', 'Visit4']
    columns_to_keep = ~df_NMR.columns.str.contains('|'.join(words_to_remove))
    df_NMR = df_NMR.loc[:, columns_to_keep]

    # Drop the specific 'Date_of_attending_assessment_centre_Visit1_0' column
    df_NMR = df_NMR.drop(columns=['Date_of_attending_assessment_centre_Visit1_0'])

    print(df_NMR.shape)
    print("Missing values per column before imputation:", df_NMR.isna().sum())

    # Mean imputation for missing values
    df_NMR = df_NMR.fillna(df_NMR.mean())
    print("Missing values per column after imputation:", df_NMR.isna().sum())

    # Filter out values greater than threshold * standard deviation
    df_NMR_filt = df_NMR.copy()
    for col in df_NMR.columns:
        mean = df_NMR[col].mean()
        std = df_NMR[col].std()
        df_NMR_filt.loc[abs(df_NMR[col] - mean) > threshold * std, col] = np.nan
    
    # Mean imputation again for new NaN values after filtering
    df_NMR_filt = df_NMR_filt.fillna(df_NMR_filt.mean())
    print("Missing values per column after filtering and imputation:", df_NMR_filt.isna().sum())

    # Add 1 to all values
    df_NMR_filt = df_NMR_filt.add(1, axis='index')

    # Log transformation
    df_NMR_filt = df_NMR_filt.apply(lambda x: np.log(x))

    # Normalize the data
    df_NMR_filt = df_NMR_filt.apply(lambda x: (x - x.mean()) / x.std())

    # Remove '_Visit1_0' from column names
    df_NMR_filt.columns = df_NMR_filt.columns.str.replace('_Visit1_0', '')

    # Add back the 'eid' column
    df_NMR_filt = pd.concat([eid_col, df_NMR_filt], axis=1)

    # Save the cleaned DataFrame to a CSV file
    df_NMR_filt.to_csv('NMR_cleaned.csv', index=False)
    
    return df_NMR_filt


In [3]:

# Clean NMR data
df_NMR = pd.read_csv('NMR_metabolic_biomarkers_with_dates.csv', delimiter=',')
df_NMR_cleaned = clean_NMR(df_NMR)
df_NMR_cleaned.head()

# about 3.5 min to run

(502379, 168)
Missing values per column before imputation: Total_Cholesterol_Visit1_0                                      384370
Total_Cholesterol_Minus_HDL-C_Visit1_0                          384370
Remnant_Cholesterol_(Non-HDL,_Non-LDL_-Cholesterol)_Visit1_0    384370
VLDL_Cholesterol_Visit1_0                                       384370
Clinical_LDL_Cholesterol_Visit1_0                               384370
                                                                 ...  
Phospholipids_in_Small_HDL_Visit1_0                             384370
Cholesterol_in_Small_HDL_Visit1_0                               384370
Cholesteryl_Esters_in_Small_HDL_Visit1_0                        384370
Free_Cholesterol_in_Small_HDL_Visit1_0                          384370
Triglycerides_in_Small_HDL_Visit1_0                             384370
Length: 168, dtype: int64
Missing values per column after imputation: Total_Cholesterol_Visit1_0                                      0
Total_Cholesterol_Minus_

Unnamed: 0,eid,Total_Cholesterol,Total_Cholesterol_Minus_HDL-C,"Remnant_Cholesterol_(Non-HDL,_Non-LDL_-Cholesterol)",VLDL_Cholesterol,Clinical_LDL_Cholesterol,LDL_Cholesterol,HDL_Cholesterol,Total_Triglycerides,Triglycerides_in_VLDL,...,Cholesteryl_Esters_in_Medium_HDL,Free_Cholesterol_in_Medium_HDL,Triglycerides_in_Medium_HDL,Concentration_of_Small_HDL_Particles,Total_Lipids_in_Small_HDL,Phospholipids_in_Small_HDL,Cholesterol_in_Small_HDL,Cholesteryl_Esters_in_Small_HDL,Free_Cholesterol_in_Small_HDL,Triglycerides_in_Small_HDL
0,1000013,0.051916,0.063921,0.057976,0.058055,0.063937,0.054346,0.063156,0.09659,0.096283,...,0.040598,0.033066,0.04344,0.016731,0.037537,0.036823,0.026102,0.022196,0.023505,0.037243
1,1000024,0.051916,0.063921,0.057976,0.058055,0.063937,0.054346,0.063156,0.09659,0.096283,...,0.040598,0.033066,0.04344,0.016731,0.037537,0.036823,0.026102,0.022196,0.023505,0.037243
2,1000036,0.051916,0.063921,0.057976,0.058055,0.063937,0.054346,0.063156,0.09659,0.096283,...,0.040598,0.033066,0.04344,0.016731,0.037537,0.036823,0.026102,0.022196,0.023505,0.037243
3,1000048,0.051916,0.063921,0.057976,0.058055,0.063937,0.054346,0.063156,0.09659,0.096283,...,0.040598,0.033066,0.04344,0.016731,0.037537,0.036823,0.026102,0.022196,0.023505,0.037243
4,1000055,0.051916,0.063921,0.057976,0.058055,0.063937,0.054346,0.063156,0.09659,0.096283,...,0.040598,0.033066,0.04344,0.016731,0.037537,0.036823,0.026102,0.022196,0.023505,0.037243


In [4]:
df_NMR_cleaned.columns.to_list()

['eid',
 'Total_Cholesterol',
 'Total_Cholesterol_Minus_HDL-C',
 'Remnant_Cholesterol_(Non-HDL,_Non-LDL_-Cholesterol)',
 'VLDL_Cholesterol',
 'Clinical_LDL_Cholesterol',
 'LDL_Cholesterol',
 'HDL_Cholesterol',
 'Total_Triglycerides',
 'Triglycerides_in_VLDL',
 'Triglycerides_in_LDL',
 'Triglycerides_in_HDL',
 'Total_Phospholipids_in_Lipoprotein_Particles',
 'Phospholipids_in_VLDL',
 'Phospholipids_in_LDL',
 'Phospholipids_in_HDL',
 'Total_Esterified_Cholesterol',
 'Cholesteryl_Esters_in_VLDL',
 'Cholesteryl_Esters_in_LDL',
 'Cholesteryl_Esters_in_HDL',
 'Total_Free_Cholesterol',
 'Free_Cholesterol_in_VLDL',
 'Free_Cholesterol_in_LDL',
 'Free_Cholesterol_in_HDL',
 'Total_Lipids_in_Lipoprotein_Particles',
 'Total_Lipids_in_VLDL',
 'Total_Lipids_in_LDL',
 'Total_Lipids_in_HDL',
 'Total_Concentration_of_Lipoprotein_Particles',
 'Concentration_of_VLDL_Particles',
 'Concentration_of_LDL_Particles',
 'Concentration_of_HDL_Particles',
 'Average_Diameter_for_VLDL_Particles',
 'Average_Diameter_

In [5]:
import pandas as pd

# Assuming df is your DataFrame
columns_to_select = ['eid', 'Glucose', 'Omega-3_Fatty_Acids', 'Citrate', 'Creatinine', 'Albumin']
selected_df = df_NMR_cleaned[columns_to_select]
# or
selected_df = df_NMR_cleaned.loc[:, columns_to_select]
selected_df

Unnamed: 0,eid,Glucose,Omega-3_Fatty_Acids,Citrate,Creatinine,Albumin
0,1000013,0.109653,0.074135,0.027833,0.029712,0.021496
1,1000024,0.109653,0.074135,0.027833,0.029712,0.021496
2,1000036,0.109653,0.074135,0.027833,0.029712,0.021496
3,1000048,0.109653,0.074135,0.027833,0.029712,0.021496
4,1000055,0.109653,0.074135,0.027833,0.029712,0.021496
...,...,...,...,...,...,...
502374,6024888,0.755935,2.219001,-0.295852,2.390898,4.454125
502375,6024892,0.109653,0.074135,0.027833,0.029712,0.021496
502376,6024903,0.109653,0.074135,0.027833,0.029712,0.021496
502377,6024910,0.109653,0.074135,0.027833,0.029712,0.021496


In [6]:
df_combined = pd.read_csv('df_combined.csv')
selected_df2 = df_combined.rename(columns={'ID': 'eid'})

In [7]:
import pandas as pd

# Assuming df_combined is your DataFrame
columns_to_select = [
    'eid',
    'Age.At.MHQ.1',
    'Gender_x',
    'Body mass index (BMI)',
    'Smoking status',
    'Moderate.Physical.Activity_x',
    'Diabetes_x', 
    'Ever thought that life not worth living',
    'Ever had prolonged loss of interest in normal activities',
    'Recent lack of interest or pleasure in doing things',
    'Recent poor appetite or overeating',
    'Trouble.sleeping',
    'Sleeping.change',
    'Trouble falling or staying asleep, or sleeping too much',
    'Sleep duration',
    'Sleeplessness / insomnia'
]


selected_df2_ = selected_df2.loc[:, columns_to_select]


selected_df2_

Unnamed: 0,eid,Age.At.MHQ.1,Gender_x,Body mass index (BMI),Smoking status,Moderate.Physical.Activity_x,Diabetes_x,Ever thought that life not worth living,Ever had prolonged loss of interest in normal activities,Recent lack of interest or pleasure in doing things,Recent poor appetite or overeating,Trouble.sleeping,Sleeping.change,"Trouble falling or staying asleep, or sleeping too much",Sleep duration,Sleeplessness / insomnia
0,1000036,59,0,31.4,0.0,0.0,0.0,0,1,1,1,2.0,1.0,2,7.0,2.0
1,1000123,61,1,22.9,0.0,1.0,0.0,0,0,1,1,1.0,,1,6.0,3.0
2,1000192,61,0,22.3,0.0,1.0,0.0,0,0,1,1,1.0,0.0,1,8.0,2.0
3,1000219,57,0,22.7,0.0,,0.0,2,1,1,1,4.0,1.0,4,8.0,3.0
4,1000225,67,0,26.7,1.0,1.0,0.0,1,1,1,1,1.0,1.0,1,6.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157281,6024751,79,0,26.9,2.0,1.0,0.0,0,0,2,1,2.0,,2,7.0,2.0
157282,6024764,72,0,26.1,1.0,1.0,0.0,1,0,2,2,3.0,1.0,3,9.0,1.0
157283,6024786,70,1,26.6,0.0,1.0,0.0,0,0,1,1,2.0,,2,7.0,2.0
157284,6024798,59,1,28.7,1.0,1.0,0.0,0,1,1,1,1.0,0.0,1,6.0,2.0


In [8]:
combined_data = pd.merge(selected_df2_, selected_df, on='eid', how='inner')
combined_data.columns

Index(['eid', 'Age.At.MHQ.1', 'Gender_x', 'Body mass index (BMI)',
       'Smoking status', 'Moderate.Physical.Activity_x', 'Diabetes_x',
       'Ever thought that life not worth living',
       'Ever had prolonged loss of interest in normal activities',
       'Recent lack of interest or pleasure in doing things',
       'Recent poor appetite or overeating', 'Trouble.sleeping',
       'Sleeping.change',
       'Trouble falling or staying asleep, or sleeping too much',
       'Sleep duration', 'Sleeplessness / insomnia', 'Glucose',
       'Omega-3_Fatty_Acids', 'Citrate', 'Creatinine', 'Albumin'],
      dtype='object')

In [10]:
df_imputed = combined_data.fillna(combined_data.mean())

In [11]:
import pandas as pd
import statsmodels.api as sm

def perform_ols_regression(df, dependent_var, independent_var):
    """
    Perform OLS regression and return a summary table with dependent variable, 
    independent variable, coefficient, and p-value.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    dependent_var (str): The name of the dependent variable column.
    independent_var (str): The name of the independent variable column.

    Returns:
    DataFrame: A DataFrame containing the regression summary.
    """
    # Define the dependent variable
    y = df[dependent_var]

    # Define the independent variable
    X = df[[independent_var]]

    # Add a constant to the independent variables (for the intercept term)
    X = sm.add_constant(X)

    # Fit the OLS model
    model = sm.OLS(y, X).fit()

    # Extract the coefficient and p-value
    coefficient = model.params[independent_var]
    p_value = model.pvalues[independent_var]

    # Return summary statistics
    return pd.DataFrame({
        'Dependent Variable': [dependent_var],
        'Independent Variable': [independent_var],
        'Coefficient': [coefficient],
        'P-value': [p_value],
        'P-value < 0.05': [p_value < 0.05]  # Highlight if p-value is less than 0.05
    })

# Example usage:
summary_tables = []
summary_tables.append(perform_ols_regression(df_imputed, 'Albumin', 'Trouble.sleeping'))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Trouble falling or staying asleep, or sleeping too much'))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Sleeplessness / insomnia'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Recent lack of interest or pleasure in doing things'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeping.change'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia'))
summary_tables.append(perform_ols_regression(df_imputed, 'Omega-3_Fatty_Acids', 'Recent poor appetite or overeating'))
summary_tables.append(perform_ols_regression(df_imputed, 'Citrate', 'Trouble.sleeping'))

# Concatenate all summary tables
combined_summary_table = pd.concat(summary_tables, ignore_index=True)

# Apply styling
def highlight_p_value(s):
    if s['P-value'] < 0.05:
        return ['background-color: lightgreen' for _ in s]
    return ['' for _ in s]

combined_summary_table = combined_summary_table.style.apply(highlight_p_value, axis=1)

# Display the styled combined summary table
combined_summary_table


Unnamed: 0,Dependent Variable,Independent Variable,Coefficient,P-value,P-value < 0.05
0,Albumin,Trouble.sleeping,-0.007585,0.005341,True
1,Glucose,"Trouble falling or staying asleep, or sleeping too much",-0.000171,0.01003,True
2,Glucose,Sleeplessness / insomnia,0.019898,0.0,True
3,Creatinine,Recent lack of interest or pleasure in doing things,0.000142,0.014077,True
4,Creatinine,Sleeplessness / insomnia,-0.036854,0.0,True
5,Creatinine,Sleeping.change,-0.069264,0.0,True
6,Creatinine,Sleeplessness / insomnia,-0.036854,0.0,True
7,Omega-3_Fatty_Acids,Recent poor appetite or overeating,3.2e-05,0.68492,False
8,Citrate,Trouble.sleeping,-0.003167,0.249684,False


In [20]:
import pandas as pd
import statsmodels.api as sm

def perform_ols_regression(df, dependent_var, independent_var, covariate_var=None):
    """
    Perform OLS regression and return a summary table with dependent variable, 
    independent variable, coefficient, and p-value.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    dependent_var (str): The name of the dependent variable column.
    independent_var (str): The name of the independent variable column.
    covariate_var (str, optional): The name of the covariate variable column. Default is None.

    Returns:
    DataFrame: A DataFrame containing the regression summary.
    """
    # Define the dependent variable
    y = df[dependent_var]

    # Define the independent variable
    X = df[[independent_var]]

    # Add covariate if provided
    if covariate_var:
        X[covariate_var] = df[covariate_var]

    # Add a constant to the independent variables (for the intercept term)
    X = sm.add_constant(X)

    # Fit the OLS model
    model = sm.OLS(y, X).fit()

    # Extract the coefficient and p-value
    coefficient = model.params[independent_var]
    p_value = model.pvalues[independent_var]

    # Create a summary DataFrame
    summary_df = pd.DataFrame({
        'Dependent Variable': [dependent_var],
        'Independent Variable': [independent_var],
        'Covariate': [covariate_var] if covariate_var else ['None'],
        'Coefficient': [coefficient],
        'P-value': [p_value],
        'P-value < 0.05': [p_value < 0.05]  # Highlight if p-value is less than 0.05
    })

    return summary_df

# Example usage:
summary_tables = []
summary_tables.append(perform_ols_regression(df_imputed, 'Albumin', 'Trouble.sleeping', 'Age.At.MHQ.1'))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Trouble falling or staying asleep, or sleeping too much', 'Age.At.MHQ.1'))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Sleeplessness / insomnia', 'Age.At.MHQ.1'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Recent lack of interest or pleasure in doing things', 'Age.At.MHQ.1'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', 'Age.At.MHQ.1'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeping.change', 'Age.At.MHQ.1'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', 'Age.At.MHQ.1'))
summary_tables.append(perform_ols_regression(df_imputed, 'Omega-3_Fatty_Acids', 'Recent poor appetite or overeating', 'Age.At.MHQ.1'))
summary_tables.append(perform_ols_regression(df_imputed, 'Citrate', 'Trouble.sleeping', 'Age.At.MHQ.1'))

# Concatenate all summary tables
combined_summary_table = pd.concat(summary_tables, ignore_index=True)

# Apply styling
def highlight_p_value(s):
    if s['P-value'] < 0.05:
        return ['background-color: lightgreen' for _ in s]
    return ['' for _ in s]

combined_summary_table = combined_summary_table.style.apply(highlight_p_value, axis=1)

# Display the styled combined summary table
combined_summary_table


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[covariate_var] = df[covariate_var]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[covariate_var] = df[covariate_var]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[covariate_var] = df[covariate_var]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

Unnamed: 0,Dependent Variable,Independent Variable,Covariate,Coefficient,P-value,P-value < 0.05
0,Albumin,Trouble.sleeping,Age.At.MHQ.1,-0.011669,1.9e-05,True
1,Glucose,"Trouble falling or staying asleep, or sleeping too much",Age.At.MHQ.1,-0.000123,0.063818,False
2,Glucose,Sleeplessness / insomnia,Age.At.MHQ.1,0.012014,0.00054,True
3,Creatinine,Recent lack of interest or pleasure in doing things,Age.At.MHQ.1,0.000171,0.003217,True
4,Creatinine,Sleeplessness / insomnia,Age.At.MHQ.1,-0.044689,0.0,True
5,Creatinine,Sleeping.change,Age.At.MHQ.1,-0.057493,0.0,True
6,Creatinine,Sleeplessness / insomnia,Age.At.MHQ.1,-0.044689,0.0,True
7,Omega-3_Fatty_Acids,Recent poor appetite or overeating,Age.At.MHQ.1,8.7e-05,0.272647,False
8,Citrate,Trouble.sleeping,Age.At.MHQ.1,0.002333,0.39716,False


In [19]:
import pandas as pd
import statsmodels.api as sm

def perform_ols_regression(df, dependent_var, independent_var, covariate_var=None):
    """
    Perform OLS regression and return a summary table with dependent variable, 
    independent variable, coefficient, and p-value.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    dependent_var (str): The name of the dependent variable column.
    independent_var (str): The name of the independent variable column.
    covariate_var (str, optional): The name of the covariate variable column. Default is None.

    Returns:
    DataFrame: A DataFrame containing the regression summary.
    """
    # Define the dependent variable
    y = df[dependent_var]

    # Define the independent variable
    X = df[[independent_var]]

    # Add covariate if provided
    if covariate_var:
        X[covariate_var] = df[covariate_var]

    # Add a constant to the independent variables (for the intercept term)
    X = sm.add_constant(X)

    # Fit the OLS model
    model = sm.OLS(y, X).fit()

    # Extract the coefficient and p-value
    coefficient = model.params[independent_var]
    p_value = model.pvalues[independent_var]

    # Create a summary DataFrame
    summary_df = pd.DataFrame({
        'Dependent Variable': [dependent_var],
        'Independent Variable': [independent_var],
        'Covariate': [covariate_var] if covariate_var else ['None'],
        'Coefficient': [coefficient],
        'P-value': [p_value],
        'P-value < 0.05': [p_value < 0.05]  # Highlight if p-value is less than 0.05
    })

    return summary_df

# Example usage:
summary_tables = []
summary_tables.append(perform_ols_regression(df_imputed, 'Albumin', 'Trouble.sleeping', 'Gender_x'))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Trouble falling or staying asleep, or sleeping too much', 'Gender_x'))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Sleeplessness / insomnia', 'Gender_x'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Recent lack of interest or pleasure in doing things', 'Gender_x'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', 'Gender_x'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeping.change', 'Gender_x'))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', 'Gender_x'))
summary_tables.append(perform_ols_regression(df_imputed, 'Omega-3_Fatty_Acids', 'Recent poor appetite or overeating', 'Gender_x'))
summary_tables.append(perform_ols_regression(df_imputed, 'Citrate', 'Trouble.sleeping', 'Gender_x'))

# Concatenate all summary tables
combined_summary_table = pd.concat(summary_tables, ignore_index=True)

# Apply styling
def highlight_p_value(s):
    if s['P-value'] < 0.05:
        return ['background-color: lightgreen' for _ in s]
    return ['' for _ in s]

combined_summary_table = combined_summary_table.style.apply(highlight_p_value, axis=1)

# Display the styled combined summary table
combined_summary_table


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[covariate_var] = df[covariate_var]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[covariate_var] = df[covariate_var]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[covariate_var] = df[covariate_var]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

Unnamed: 0,Dependent Variable,Independent Variable,Covariate,Coefficient,P-value,P-value < 0.05
0,Albumin,Trouble.sleeping,Gender_x,-0.004443,0.105012,False
1,Glucose,"Trouble falling or staying asleep, or sleeping too much",Gender_x,-0.000172,0.009528,True
2,Glucose,Sleeplessness / insomnia,Gender_x,0.01732,1e-06,True
3,Creatinine,Recent lack of interest or pleasure in doing things,Gender_x,7e-05,0.212047,False
4,Creatinine,Sleeplessness / insomnia,Gender_x,0.00822,0.013701,True
5,Creatinine,Sleeping.change,Gender_x,-0.001292,0.881207,False
6,Creatinine,Sleeplessness / insomnia,Gender_x,0.00822,0.013701,True
7,Omega-3_Fatty_Acids,Recent poor appetite or overeating,Gender_x,5.1e-05,0.520802,False
8,Citrate,Trouble.sleeping,Gender_x,-0.011144,5.6e-05,True


In [14]:
import pandas as pd
import statsmodels.api as sm

def perform_ols_regression(df, dependent_var, independent_var, covariate_vars=None):
    """
    Perform OLS regression and return a summary table with dependent variable, 
    independent variable, coefficient, and p-value.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    dependent_var (str): The name of the dependent variable column.
    independent_var (str): The name of the independent variable column.
    covariate_vars (list of str, optional): The names of the covariate variable columns. Default is None.

    Returns:
    DataFrame: A DataFrame containing the regression summary.
    """
    # Define the dependent variable
    y = df[dependent_var]

    # Define the independent variable
    X = df[[independent_var]]

    # Add covariates if provided
    if covariate_vars:
        X = pd.concat([X] + [df[cov] for cov in covariate_vars], axis=1)

    # Add a constant to the independent variables (for the intercept term)
    X = sm.add_constant(X)

    # Fit the OLS model
    model = sm.OLS(y, X).fit()
    
    
      # Extract the coefficient and p-value
    coefficient = model.params[independent_var]
    p_value = model.pvalues[independent_var]

    # Create a summary DataFrame
    summary_df = pd.DataFrame({
        'Dependent Variable': [dependent_var],
        'Independent Variable': [independent_var],
        'Covariate': [", ".join(covariate_vars)] if covariate_vars else ['None'],
        'Coefficient': [coefficient],
        'P-value': [p_value],
        'P-value < 0.05': [p_value < 0.05]  # Highlight if p-value is less than 0.05
    })


    return summary_df

# Example usage:
summary_tables = []
summary_tables.append(perform_ols_regression(df_imputed, 'Albumin', 'Trouble.sleeping', ['Age.At.MHQ.1', 'Gender_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Trouble falling or staying asleep, or sleeping too much', ['Age.At.MHQ.1', 'Gender_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Recent lack of interest or pleasure in doing things', ['Age.At.MHQ.1', 'Gender_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeping.change', ['Age.At.MHQ.1', 'Gender_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Omega-3_Fatty_Acids', 'Recent poor appetite or overeating', ['Age.At.MHQ.1', 'Gender_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Citrate', 'Ever thought that life not worth living', ['Age.At.MHQ.1', 'Gender_x']))




# Concatenate all summary tables
combined_summary_table = pd.concat(summary_tables, ignore_index=True)

# Apply styling
def highlight_p_value(s):
    if s['P-value'] < 0.05:
        return ['background-color: lightgreen' for _ in s]
    return ['' for _ in s]

combined_summary_table = combined_summary_table.style.apply(highlight_p_value, axis=1)

# Display the styled combined summary table
combined_summary_table

Unnamed: 0,Dependent Variable,Independent Variable,Covariate,Coefficient,P-value,P-value < 0.05
0,Albumin,Trouble.sleeping,"Age.At.MHQ.1, Gender_x",-0.008298,0.002506,True
1,Glucose,"Trouble falling or staying asleep, or sleeping too much","Age.At.MHQ.1, Gender_x",-0.000123,0.063124,False
2,Glucose,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x",0.008304,0.017789,True
3,Creatinine,Recent lack of interest or pleasure in doing things,"Age.At.MHQ.1, Gender_x",9e-05,0.108294,False
4,Creatinine,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x",0.002619,0.434379,False
5,Creatinine,Sleeping.change,"Age.At.MHQ.1, Gender_x",0.006195,0.47408,False
6,Creatinine,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x",0.002619,0.434379,False
7,Omega-3_Fatty_Acids,Recent poor appetite or overeating,"Age.At.MHQ.1, Gender_x",0.000111,0.161596,False
8,Citrate,Ever thought that life not worth living,"Age.At.MHQ.1, Gender_x",-1e-05,0.780379,False


In [16]:
#smoking status

import pandas as pd
import statsmodels.api as sm

def perform_ols_regression(df, dependent_var, independent_var, covariate_vars=None):
    """
    Perform OLS regression and return a summary table with dependent variable, 
    independent variable, coefficient, and p-value.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    dependent_var (str): The name of the dependent variable column.
    independent_var (str): The name of the independent variable column.
    covariate_vars (list of str, optional): The names of the covariate variable columns. Default is None.

    Returns:
    DataFrame: A DataFrame containing the regression summary.
    """
    # Define the dependent variable
    y = df[dependent_var]

    # Define the independent variable
    X = df[[independent_var]]

    # Add covariates if provided
    if covariate_vars:
        X = pd.concat([X] + [df[cov] for cov in covariate_vars], axis=1)

    # Add a constant to the independent variables (for the intercept term)
    X = sm.add_constant(X)

    # Fit the OLS model
    model = sm.OLS(y, X).fit()
    
    
      # Extract the coefficient and p-value
    coefficient = model.params[independent_var]
    p_value = model.pvalues[independent_var]

    # Create a summary DataFrame
    summary_df = pd.DataFrame({
        'Dependent Variable': [dependent_var],
        'Independent Variable': [independent_var],
        'Covariate': [", ".join(covariate_vars)] if covariate_vars else ['None'],
        'Coefficient': [coefficient],
        'P-value': [p_value],
        'P-value < 0.05': [p_value < 0.05]  # Highlight if p-value is less than 0.05
    })


    return summary_df

# Example usage:
summary_tables = []
summary_tables.append(perform_ols_regression(df_imputed, 'Albumin', 'Trouble.sleeping', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status']))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Trouble falling or staying asleep, or sleeping too much', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status']))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Recent lack of interest or pleasure in doing things', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeping.change', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status']))
summary_tables.append(perform_ols_regression(df_imputed, 'Omega-3_Fatty_Acids', 'Recent poor appetite or overeating', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status']))
summary_tables.append(perform_ols_regression(df_imputed, 'Citrate', 'Ever thought that life not worth living', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status']))




# Concatenate all summary tables
combined_summary_table = pd.concat(summary_tables, ignore_index=True)

# Apply styling
def highlight_p_value(s):
    if s['P-value'] < 0.05:
        return ['background-color: lightgreen' for _ in s]
    return ['' for _ in s]

combined_summary_table = combined_summary_table.style.apply(highlight_p_value, axis=1)

# Display the styled combined summary table
combined_summary_table

Unnamed: 0,Dependent Variable,Independent Variable,Covariate,Coefficient,P-value,P-value < 0.05
0,Albumin,Trouble.sleeping,"Age.At.MHQ.1, Gender_x, Smoking status",-0.007726,0.004951,True
1,Glucose,"Trouble falling or staying asleep, or sleeping too much","Age.At.MHQ.1, Gender_x, Smoking status",-0.000122,0.064686,False
2,Glucose,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x, Smoking status",0.00854,0.014866,True
3,Creatinine,Recent lack of interest or pleasure in doing things,"Age.At.MHQ.1, Gender_x, Smoking status",9e-05,0.108187,False
4,Creatinine,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x, Smoking status",0.003018,0.368019,False
5,Creatinine,Sleeping.change,"Age.At.MHQ.1, Gender_x, Smoking status",0.006725,0.437196,False
6,Creatinine,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x, Smoking status",0.003018,0.368019,False
7,Omega-3_Fatty_Acids,Recent poor appetite or overeating,"Age.At.MHQ.1, Gender_x, Smoking status",0.000112,0.157879,False
8,Citrate,Ever thought that life not worth living,"Age.At.MHQ.1, Gender_x, Smoking status",-8e-06,0.828438,False


In [17]:




import pandas as pd
import statsmodels.api as sm

def perform_ols_regression(df, dependent_var, independent_var, covariate_vars=None):
    """
    Perform OLS regression and return a summary table with dependent variable, 
    independent variable, coefficient, and p-value.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    dependent_var (str): The name of the dependent variable column.
    independent_var (str): The name of the independent variable column.
    covariate_vars (list of str, optional): The names of the covariate variable columns. Default is None.

    Returns:
    DataFrame: A DataFrame containing the regression summary.
    """
    # Define the dependent variable
    y = df[dependent_var]

    # Define the independent variable
    X = df[[independent_var]]

    # Add covariates if provided
    if covariate_vars:
        X = pd.concat([X] + [df[cov] for cov in covariate_vars], axis=1)

    # Add a constant to the independent variables (for the intercept term)
    X = sm.add_constant(X)

    # Fit the OLS model
    model = sm.OLS(y, X).fit()
    
    
      # Extract the coefficient and p-value
    coefficient = model.params[independent_var]
    p_value = model.pvalues[independent_var]

    # Create a summary DataFrame
    summary_df = pd.DataFrame({
        'Dependent Variable': [dependent_var],
        'Independent Variable': [independent_var],
        'Covariate': [", ".join(covariate_vars)] if covariate_vars else ['None'],
        'Coefficient': [coefficient],
        'P-value': [p_value],
        'P-value < 0.05': [p_value < 0.05]  # Highlight if p-value is less than 0.05
    })


    return summary_df

# Example usage:
summary_tables = []
summary_tables.append(perform_ols_regression(df_imputed, 'Albumin', 'Trouble.sleeping', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Trouble falling or staying asleep, or sleeping too much', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Recent lack of interest or pleasure in doing things', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeping.change', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Omega-3_Fatty_Acids', 'Recent poor appetite or overeating', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Citrate', 'Ever thought that life not worth living', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x']))




# Concatenate all summary tables
combined_summary_table = pd.concat(summary_tables, ignore_index=True)

# Apply styling
def highlight_p_value(s):
    if s['P-value'] < 0.05:
        return ['background-color: lightgreen' for _ in s]
    return ['' for _ in s]

combined_summary_table = combined_summary_table.style.apply(highlight_p_value, axis=1)

# Display the styled combined summary table
combined_summary_table

Unnamed: 0,Dependent Variable,Independent Variable,Covariate,Coefficient,P-value,P-value < 0.05
0,Albumin,Trouble.sleeping,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x",-0.00726,0.008325,True
1,Glucose,"Trouble falling or staying asleep, or sleeping too much","Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x",-0.000123,0.063958,False
2,Glucose,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x",0.008193,0.019516,True
3,Creatinine,Recent lack of interest or pleasure in doing things,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x",9e-05,0.10696,False
4,Creatinine,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x",0.002675,0.425181,False
5,Creatinine,Sleeping.change,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x",0.00681,0.431424,False
6,Creatinine,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x",0.002675,0.425181,False
7,Omega-3_Fatty_Acids,Recent poor appetite or overeating,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x",0.000112,0.157809,False
8,Citrate,Ever thought that life not worth living,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x",-8e-06,0.819301,False


In [18]:


import pandas as pd
import statsmodels.api as sm

def perform_ols_regression(df, dependent_var, independent_var, covariate_vars=None):
    """
    Perform OLS regression and return a summary table with dependent variable, 
    independent variable, coefficient, and p-value.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    dependent_var (str): The name of the dependent variable column.
    independent_var (str): The name of the independent variable column.
    covariate_vars (list of str, optional): The names of the covariate variable columns. Default is None.

    Returns:
    DataFrame: A DataFrame containing the regression summary.
    """
    # Define the dependent variable
    y = df[dependent_var]

    # Define the independent variable
    X = df[[independent_var]]

    # Add covariates if provided
    if covariate_vars:
        X = pd.concat([X] + [df[cov] for cov in covariate_vars], axis=1)

    # Add a constant to the independent variables (for the intercept term)
    X = sm.add_constant(X)

    # Fit the OLS model
    model = sm.OLS(y, X).fit()
    
    
      # Extract the coefficient and p-value
    coefficient = model.params[independent_var]
    p_value = model.pvalues[independent_var]

    # Create a summary DataFrame
    summary_df = pd.DataFrame({
        'Dependent Variable': [dependent_var],
        'Independent Variable': [independent_var],
        'Covariate': [", ".join(covariate_vars)] if covariate_vars else ['None'],
        'Coefficient': [coefficient],
        'P-value': [p_value],
        'P-value < 0.05': [p_value < 0.05]  # Highlight if p-value is less than 0.05
    })


    return summary_df

# Example usage:
summary_tables = []
summary_tables.append(perform_ols_regression(df_imputed, 'Albumin', 'Trouble.sleeping', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x', 'Diabetes_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Trouble falling or staying asleep, or sleeping too much', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x', 'Diabetes_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Glucose', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x', 'Diabetes_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Recent lack of interest or pleasure in doing things', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x', 'Diabetes_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x', 'Diabetes_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeping.change', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x', 'Diabetes_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Creatinine', 'Sleeplessness / insomnia', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x', 'Diabetes_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Omega-3_Fatty_Acids', 'Recent poor appetite or overeating', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x', 'Diabetes_x']))
summary_tables.append(perform_ols_regression(df_imputed, 'Citrate', 'Ever thought that life not worth living', ['Age.At.MHQ.1', 'Gender_x', 'Smoking status', 'Moderate.Physical.Activity_x', 'Diabetes_x']))




# Concatenate all summary tables
combined_summary_table = pd.concat(summary_tables, ignore_index=True)

# Apply styling
def highlight_p_value(s):
    if s['P-value'] < 0.05:
        return ['background-color: lightgreen' for _ in s]
    return ['' for _ in s]

combined_summary_table = combined_summary_table.style.apply(highlight_p_value, axis=1)

# Display the styled combined summary table
combined_summary_table

Unnamed: 0,Dependent Variable,Independent Variable,Covariate,Coefficient,P-value,P-value < 0.05
0,Albumin,Trouble.sleeping,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x, Diabetes_x",-0.006983,0.011184,True
1,Glucose,"Trouble falling or staying asleep, or sleeping too much","Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x, Diabetes_x",-0.000125,0.058541,False
2,Glucose,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x, Diabetes_x",0.006435,0.066226,False
3,Creatinine,Recent lack of interest or pleasure in doing things,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x, Diabetes_x",9e-05,0.106824,False
4,Creatinine,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x, Diabetes_x",0.002745,0.413257,False
5,Creatinine,Sleeping.change,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x, Diabetes_x",0.006857,0.428224,False
6,Creatinine,Sleeplessness / insomnia,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x, Diabetes_x",0.002745,0.413257,False
7,Omega-3_Fatty_Acids,Recent poor appetite or overeating,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x, Diabetes_x",0.000112,0.156666,False
8,Citrate,Ever thought that life not worth living,"Age.At.MHQ.1, Gender_x, Smoking status, Moderate.Physical.Activity_x, Diabetes_x",-7e-06,0.85692,False
